sv-dependency-builds: src/fftw-3.3.5/kernel/ifftw.h annotate

annotate src/fftw-3.3.5/kernel/ifftw.h @ 169:223a55898ab9 tip default

Add null config files

author	Chris Cannam <cannam@all-day-breakfast.com>
date	Mon, 02 Mar 2020 14:03:47 +0000
parents	7867fa7e1b6b
children

rev	line source
cannam@127	1 /*
cannam@127	2 * Copyright (c) 2003, 2007-14 Matteo Frigo
cannam@127	3 * Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
cannam@127	4 *
cannam@127	5 * This program is free software; you can redistribute it and/or modify
cannam@127	6 * it under the terms of the GNU General Public License as published by
cannam@127	7 * the Free Software Foundation; either version 2 of the License, or
cannam@127	8 * (at your option) any later version.
cannam@127	9 *
cannam@127	10 * This program is distributed in the hope that it will be useful,
cannam@127	11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
cannam@127	12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
cannam@127	13 * GNU General Public License for more details.
cannam@127	14 *
cannam@127	15 * You should have received a copy of the GNU General Public License
cannam@127	16 * along with this program; if not, write to the Free Software
cannam@127	17 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
cannam@127	18 *
cannam@127	19 */
cannam@127	20
cannam@127	21
cannam@127	22 /* FFTW internal header file */
cannam@127	23 #ifndef __IFFTW_H__
cannam@127	24 #define __IFFTW_H__
cannam@127	25
cannam@127	26 #include "config.h"
cannam@127	27
cannam@127	28 #include <stdlib.h> /* size_t */
cannam@127	29 #include <stdarg.h> /* va_list */
cannam@127	30 #include <stddef.h> /* ptrdiff_t */
cannam@127	31 #include <limits.h> /* INT_MAX */
cannam@127	32
cannam@127	33 #if HAVE_SYS_TYPES_H
cannam@127	34 # include <sys/types.h>
cannam@127	35 #endif
cannam@127	36
cannam@127	37 #if HAVE_STDINT_H
cannam@127	38 # include <stdint.h> /* uintptr_t, maybe */
cannam@127	39 #endif
cannam@127	40
cannam@127	41 #if HAVE_INTTYPES_H
cannam@127	42 # include <inttypes.h> /* uintptr_t, maybe */
cannam@127	43 #endif
cannam@127	44
cannam@127	45 #ifdef __cplusplus
cannam@127	46 extern "C"
cannam@127	47 {
cannam@127	48 #endif /* __cplusplus */
cannam@127	49
cannam@127	50 /* Windows annoyances -- since tests/hook.c uses some internal
cannam@127	51 FFTW functions, we need to give them the dllexport attribute
cannam@127	52 under Windows when compiling as a DLL (see api/fftw3.h). */
cannam@127	53 #if defined(FFTW_EXTERN)
cannam@127	54 # define IFFTW_EXTERN FFTW_EXTERN
cannam@127	55 #elif (defined(FFTW_DLL) || defined(DLL_EXPORT)) \
cannam@127	56 && (defined(_WIN32) || defined(__WIN32__))
cannam@127	57 # define IFFTW_EXTERN extern __declspec(dllexport)
cannam@127	58 #else
cannam@127	59 # define IFFTW_EXTERN extern
cannam@127	60 #endif
cannam@127	61
cannam@127	62 /* determine precision and name-mangling scheme */
cannam@127	63 #define CONCAT(prefix, name) prefix ## name
cannam@127	64 #if defined(FFTW_SINGLE)
cannam@127	65 typedef float R;
cannam@127	66 # define X(name) CONCAT(fftwf_, name)
cannam@127	67 #elif defined(FFTW_LDOUBLE)
cannam@127	68 typedef long double R;
cannam@127	69 # define X(name) CONCAT(fftwl_, name)
cannam@127	70 # define TRIGREAL_IS_LONG_DOUBLE
cannam@127	71 #elif defined(FFTW_QUAD)
cannam@127	72 typedef __float128 R;
cannam@127	73 # define X(name) CONCAT(fftwq_, name)
cannam@127	74 # define TRIGREAL_IS_QUAD
cannam@127	75 #else
cannam@127	76 typedef double R;
cannam@127	77 # define X(name) CONCAT(fftw_, name)
cannam@127	78 #endif
cannam@127	79
cannam@127	80 /*
cannam@127	81 integral type large enough to contain a stride (what ``int'' should
cannam@127	82 have been in the first place).
cannam@127	83 */
cannam@127	84 typedef ptrdiff_t INT;
cannam@127	85
cannam@127	86 /* dummy use of unused parameters to silence compiler warnings */
cannam@127	87 #define UNUSED(x) (void)(x)
cannam@127	88
cannam@127	89 #define NELEM(array) ((sizeof(array) / sizeof((array)[0])))
cannam@127	90
cannam@127	91 #define FFT_SIGN (-1) /* sign convention for forward transforms */
cannam@127	92 extern void X(extract_reim)(int sign, R *c, R **r, R **i);
cannam@127	93
cannam@127	94 #define REGISTER_SOLVER(p, s) X(solver_register)(p, s)
cannam@127	95
cannam@127	96 #define STRINGIZEx(x) #x
cannam@127	97 #define STRINGIZE(x) STRINGIZEx(x)
cannam@127	98 #define CIMPLIES(ante, post) (!(ante) || (post)) /* logical implication: ante => post */
cannam@127	99
cannam@127	100 /* define HAVE_SIMD if any simd extensions are supported */
cannam@127	101 #if defined(HAVE_SSE) || defined(HAVE_SSE2) || \
cannam@127	102 defined(HAVE_AVX) || defined(HAVE_AVX_128_FMA) || \
cannam@127	103 defined(HAVE_AVX2) || defined(HAVE_AVX512) || \
cannam@127	104 defined(HAVE_KCVI) || \
cannam@127	105 defined(HAVE_ALTIVEC) || defined(HAVE_VSX) || \
cannam@127	106 defined(HAVE_MIPS_PS) || \
cannam@127	107 defined(HAVE_GENERIC_SIMD128) || defined(HAVE_GENERIC_SIMD256)
cannam@127	108 #define HAVE_SIMD 1
cannam@127	109 #else
cannam@127	110 #define HAVE_SIMD 0
cannam@127	111 #endif
cannam@127	112
cannam@127	113 extern int X(have_simd_sse2)(void);
cannam@127	114 extern int X(have_simd_avx)(void);
cannam@127	115 extern int X(have_simd_avx_128_fma)(void);
cannam@127	116 extern int X(have_simd_avx2)(void);
cannam@127	117 extern int X(have_simd_avx2_128)(void);
cannam@127	118 extern int X(have_simd_avx512)(void);
cannam@127	119 extern int X(have_simd_altivec)(void);
cannam@127	120 extern int X(have_simd_vsx)(void);
cannam@127	121 extern int X(have_simd_neon)(void);
cannam@127	122
cannam@127	123 /* forward declarations */
cannam@127	124 typedef struct problem_s problem;
cannam@127	125 typedef struct plan_s plan;
cannam@127	126 typedef struct solver_s solver;
cannam@127	127 typedef struct planner_s planner;
cannam@127	128 typedef struct printer_s printer;
cannam@127	129 typedef struct scanner_s scanner;
cannam@127	130
cannam@127	131 /*-----------------------------------------------------------------------*/
cannam@127	132 /* alloca: */
cannam@127	133 #if HAVE_SIMD
cannam@127	134 # if defined(HAVE_KCVI) || defined(HAVE_AVX512)
cannam@127	135 # define MIN_ALIGNMENT 64
cannam@127	136 # elif defined(HAVE_AVX) || defined(HAVE_AVX2) || defined(HAVE_GENERIC_SIMD256)
cannam@127	137 # define MIN_ALIGNMENT 32 /* best alignment for AVX, conservative for
cannam@127	138 * everything else */
cannam@127	139 # else
cannam@127	140 /* Note that we cannot use 32-byte alignment for all SIMD. For
cannam@127	141 example, MacOS X malloc is 16-byte aligned, but there was no
cannam@127	142 posix_memalign in MacOS X until version 10.6. */
cannam@127	143 # define MIN_ALIGNMENT 16
cannam@127	144 # endif
cannam@127	145 #endif
cannam@127	146
cannam@127	147 #if defined(HAVE_ALLOCA) && defined(FFTW_ENABLE_ALLOCA)
cannam@127	148 /* use alloca if available */
cannam@127	149
cannam@127	150 #ifndef alloca
cannam@127	151 #ifdef __GNUC__
cannam@127	152 # define alloca __builtin_alloca
cannam@127	153 #else
cannam@127	154 # ifdef _MSC_VER
cannam@127	155 # include <malloc.h>
cannam@127	156 # define alloca _alloca
cannam@127	157 # else
cannam@127	158 # if HAVE_ALLOCA_H
cannam@127	159 # include <alloca.h>
cannam@127	160 # else
cannam@127	161 # ifdef _AIX
cannam@127	162 #pragma alloca
cannam@127	163 # else
cannam@127	164 # ifndef alloca /* predefined by HP cc +Olibcalls */
cannam@127	165 void *alloca(size_t);
cannam@127	166 # endif
cannam@127	167 # endif
cannam@127	168 # endif
cannam@127	169 # endif
cannam@127	170 #endif
cannam@127	171 #endif
cannam@127	172
cannam@127	173 # ifdef MIN_ALIGNMENT
cannam@127	174 # define STACK_MALLOC(T, p, n) \
cannam@127	175 { \
cannam@127	176 p = (T)alloca((n) + MIN_ALIGNMENT); \
cannam@127	177 p = (T)(((uintptr_t)p + (MIN_ALIGNMENT - 1)) & \
cannam@127	178 (~(uintptr_t)(MIN_ALIGNMENT - 1))); \
cannam@127	179 }
cannam@127	180 # define STACK_FREE(n)
cannam@127	181 # else /* HAVE_ALLOCA && !defined(MIN_ALIGNMENT) */
cannam@127	182 # define STACK_MALLOC(T, p, n) p = (T)alloca(n)
cannam@127	183 # define STACK_FREE(n)
cannam@127	184 # endif
cannam@127	185
cannam@127	186 #else /* ! HAVE_ALLOCA */
cannam@127	187 /* use malloc instead of alloca */
cannam@127	188 # define STACK_MALLOC(T, p, n) p = (T)MALLOC(n, OTHER)
cannam@127	189 # define STACK_FREE(n) X(ifree)(n)
cannam@127	190 #endif /* ! HAVE_ALLOCA */
cannam@127	191
cannam@127	192 /* allocation of buffers. If these grow too large use malloc(), else
cannam@127	193 use STACK_MALLOC (hopefully reducing to alloca()). */
cannam@127	194
cannam@127	195 /* 64KiB ought to be enough for anybody */
cannam@127	196 #define MAX_STACK_ALLOC ((size_t)64 * 1024)
cannam@127	197
cannam@127	198 #define BUF_ALLOC(T, p, n) \
cannam@127	199 { \
cannam@127	200 if (n < MAX_STACK_ALLOC) { \
cannam@127	201 STACK_MALLOC(T, p, n); \
cannam@127	202 } else { \
cannam@127	203 p = (T)MALLOC(n, BUFFERS); \
cannam@127	204 } \
cannam@127	205 }
cannam@127	206
cannam@127	207 #define BUF_FREE(p, n) \
cannam@127	208 { \
cannam@127	209 if (n < MAX_STACK_ALLOC) { \
cannam@127	210 STACK_FREE(p); \
cannam@127	211 } else { \
cannam@127	212 X(ifree)(p); \
cannam@127	213 } \
cannam@127	214 }
cannam@127	215
cannam@127	216 /*-----------------------------------------------------------------------*/
cannam@127	217 /* define uintptr_t if it is not already defined */
cannam@127	218
cannam@127	219 #ifndef HAVE_UINTPTR_T
cannam@127	220 # if SIZEOF_VOID_P == 0
cannam@127	221 # error sizeof void* is unknown!
cannam@127	222 # elif SIZEOF_UNSIGNED_INT == SIZEOF_VOID_P
cannam@127	223 typedef unsigned int uintptr_t;
cannam@127	224 # elif SIZEOF_UNSIGNED_LONG == SIZEOF_VOID_P
cannam@127	225 typedef unsigned long uintptr_t;
cannam@127	226 # elif SIZEOF_UNSIGNED_LONG_LONG == SIZEOF_VOID_P
cannam@127	227 typedef unsigned long long uintptr_t;
cannam@127	228 # else
cannam@127	229 # error no unsigned integer type matches void* sizeof!
cannam@127	230 # endif
cannam@127	231 #endif
cannam@127	232
cannam@127	233 /*-----------------------------------------------------------------------*/
cannam@127	234 /* We can do an optimization for copying pairs of (aligned) floats
cannam@127	235 when in single precision if 2*float = double. */
cannam@127	236
cannam@127	237 #define FFTW_2R_IS_DOUBLE (defined(FFTW_SINGLE) \
cannam@127	238 && SIZEOF_FLOAT != 0 \
cannam@127	239 && SIZEOF_DOUBLE == 2*SIZEOF_FLOAT)
cannam@127	240
cannam@127	241 #define DOUBLE_ALIGNED(p) ((((uintptr_t)(p)) % sizeof(double)) == 0)
cannam@127	242
cannam@127	243 /*-----------------------------------------------------------------------*/
cannam@127	244 /* assert.c: */
cannam@127	245 IFFTW_EXTERN void X(assertion_failed)(const char *s,
cannam@127	246 int line, const char *file);
cannam@127	247
cannam@127	248 /* always check */
cannam@127	249 #define CK(ex) \
cannam@127	250 (void)((ex) || (X(assertion_failed)(#ex, __LINE__, __FILE__), 0))
cannam@127	251
cannam@127	252 #ifdef FFTW_DEBUG
cannam@127	253 /* check only if debug enabled */
cannam@127	254 #define A(ex) \
cannam@127	255 (void)((ex) || (X(assertion_failed)(#ex, __LINE__, __FILE__), 0))
cannam@127	256 #else
cannam@127	257 #define A(ex) /* nothing */
cannam@127	258 #endif
cannam@127	259
cannam@127	260 extern void X(debug)(const char *format, ...);
cannam@127	261 #define D X(debug)
cannam@127	262
cannam@127	263 /*-----------------------------------------------------------------------*/
cannam@127	264 /* kalloc.c: */
cannam@127	265 extern void *X(kernel_malloc)(size_t n);
cannam@127	266 extern void X(kernel_free)(void *p);
cannam@127	267
cannam@127	268 /*-----------------------------------------------------------------------*/
cannam@127	269 /* alloc.c: */
cannam@127	270
cannam@127	271 /* objects allocated by malloc, for statistical purposes */
cannam@127	272 enum malloc_tag {
cannam@127	273 EVERYTHING,
cannam@127	274 PLANS,
cannam@127	275 SOLVERS,
cannam@127	276 PROBLEMS,
cannam@127	277 BUFFERS,
cannam@127	278 HASHT,
cannam@127	279 TENSORS,
cannam@127	280 PLANNERS,
cannam@127	281 SLVDESCS,
cannam@127	282 TWIDDLES,
cannam@127	283 STRIDES,
cannam@127	284 OTHER,
cannam@127	285 MALLOC_WHAT_LAST /* must be last */
cannam@127	286 };
cannam@127	287
cannam@127	288 IFFTW_EXTERN void X(ifree)(void *ptr);
cannam@127	289 extern void X(ifree0)(void *ptr);
cannam@127	290
cannam@127	291 #ifdef FFTW_DEBUG_MALLOC
cannam@127	292
cannam@127	293 IFFTW_EXTERN void *X(malloc_debug)(size_t n, enum malloc_tag what,
cannam@127	294 const char *file, int line);
cannam@127	295 #define MALLOC(n, what) X(malloc_debug)(n, what, __FILE__, __LINE__)
cannam@127	296 IFFTW_EXTERN void X(malloc_print_minfo)(int vrbose);
cannam@127	297
cannam@127	298 #else /* ! FFTW_DEBUG_MALLOC */
cannam@127	299
cannam@127	300 IFFTW_EXTERN void *X(malloc_plain)(size_t sz);
cannam@127	301 #define MALLOC(n, what) X(malloc_plain)(n)
cannam@127	302
cannam@127	303 #endif
cannam@127	304
cannam@127	305 #if defined(FFTW_DEBUG) && defined(FFTW_DEBUG_MALLOC) && (defined(HAVE_THREADS) || defined(HAVE_OPENMP))
cannam@127	306 extern int X(in_thread);
cannam@127	307 # define IN_THREAD X(in_thread)
cannam@127	308 # define THREAD_ON { int in_thread_save = X(in_thread); X(in_thread) = 1
cannam@127	309 # define THREAD_OFF X(in_thread) = in_thread_save; }
cannam@127	310 #else
cannam@127	311 # define IN_THREAD 0
cannam@127	312 # define THREAD_ON
cannam@127	313 # define THREAD_OFF
cannam@127	314 #endif
cannam@127	315
cannam@127	316 /*-----------------------------------------------------------------------*/
cannam@127	317 /* low-resolution clock */
cannam@127	318
cannam@127	319 #ifdef FAKE_CRUDE_TIME
cannam@127	320 typedef int crude_time;
cannam@127	321 #else
cannam@127	322 # if TIME_WITH_SYS_TIME
cannam@127	323 # include <sys/time.h>
cannam@127	324 # include <time.h>
cannam@127	325 # else
cannam@127	326 # if HAVE_SYS_TIME_H
cannam@127	327 # include <sys/time.h>
cannam@127	328 # else
cannam@127	329 # include <time.h>
cannam@127	330 # endif
cannam@127	331 # endif
cannam@127	332
cannam@127	333 # ifdef HAVE_BSDGETTIMEOFDAY
cannam@127	334 # ifndef HAVE_GETTIMEOFDAY
cannam@127	335 # define gettimeofday BSDgettimeofday
cannam@127	336 # define HAVE_GETTIMEOFDAY 1
cannam@127	337 # endif
cannam@127	338 # endif
cannam@127	339
cannam@127	340 # if defined(HAVE_GETTIMEOFDAY)
cannam@127	341 typedef struct timeval crude_time;
cannam@127	342 # else
cannam@127	343 typedef clock_t crude_time;
cannam@127	344 # endif
cannam@127	345 #endif /* else FAKE_CRUDE_TIME */
cannam@127	346
cannam@127	347 crude_time X(get_crude_time)(void);
cannam@127	348 double X(elapsed_since)(const planner *plnr, const problem *p,
cannam@127	349 crude_time t0); /* time in seconds since t0 */
cannam@127	350
cannam@127	351 /*-----------------------------------------------------------------------*/
cannam@127	352 /* ops.c: */
cannam@127	353 /*
cannam@127	354 * ops counter. The total number of additions is add + fma
cannam@127	355 * and the total number of multiplications is mul + fma.
cannam@127	356 * Total flops = add + mul + 2 * fma
cannam@127	357 */
cannam@127	358 typedef struct {
cannam@127	359 double add;
cannam@127	360 double mul;
cannam@127	361 double fma;
cannam@127	362 double other;
cannam@127	363 } opcnt;
cannam@127	364
cannam@127	365 void X(ops_zero)(opcnt *dst);
cannam@127	366 void X(ops_other)(INT o, opcnt *dst);
cannam@127	367 void X(ops_cpy)(const opcnt *src, opcnt *dst);
cannam@127	368
cannam@127	369 void X(ops_add)(const opcnt *a, const opcnt *b, opcnt *dst);
cannam@127	370 void X(ops_add2)(const opcnt *a, opcnt *dst);
cannam@127	371
cannam@127	372 /* dst = m * a + b */
cannam@127	373 void X(ops_madd)(INT m, const opcnt *a, const opcnt *b, opcnt *dst);
cannam@127	374
cannam@127	375 /* dst += m * a */
cannam@127	376 void X(ops_madd2)(INT m, const opcnt *a, opcnt *dst);
cannam@127	377
cannam@127	378
cannam@127	379 /*-----------------------------------------------------------------------*/
cannam@127	380 /* minmax.c: */
cannam@127	381 INT X(imax)(INT a, INT b);
cannam@127	382 INT X(imin)(INT a, INT b);
cannam@127	383
cannam@127	384 /*-----------------------------------------------------------------------*/
cannam@127	385 /* iabs.c: */
cannam@127	386 INT X(iabs)(INT a);
cannam@127	387
cannam@127	388 /* inline version */
cannam@127	389 #define IABS(x) (((x) < 0) ? (0 - (x)) : (x))
cannam@127	390
cannam@127	391 /*-----------------------------------------------------------------------*/
cannam@127	392 /* md5.c */
cannam@127	393
cannam@127	394 #if SIZEOF_UNSIGNED_INT >= 4
cannam@127	395 typedef unsigned int md5uint;
cannam@127	396 #else
cannam@127	397 typedef unsigned long md5uint; /* at least 32 bits as per C standard */
cannam@127	398 #endif
cannam@127	399
cannam@127	400 typedef md5uint md5sig[4];
cannam@127	401
cannam@127	402 typedef struct {
cannam@127	403 md5sig s; /* state and signature */
cannam@127	404
cannam@127	405 /* fields not meant to be used outside md5.c: */
cannam@127	406 unsigned char c[64]; /* stuff not yet processed */
cannam@127	407 unsigned l; /* total length. Should be 64 bits long, but this is
cannam@127	408 good enough for us */
cannam@127	409 } md5;
cannam@127	410
cannam@127	411 void X(md5begin)(md5 *p);
cannam@127	412 void X(md5putb)(md5 *p, const void *d_, size_t len);
cannam@127	413 void X(md5puts)(md5 *p, const char *s);
cannam@127	414 void X(md5putc)(md5 *p, unsigned char c);
cannam@127	415 void X(md5int)(md5 *p, int i);
cannam@127	416 void X(md5INT)(md5 *p, INT i);
cannam@127	417 void X(md5unsigned)(md5 *p, unsigned i);
cannam@127	418 void X(md5end)(md5 *p);
cannam@127	419
cannam@127	420 /*-----------------------------------------------------------------------*/
cannam@127	421 /* tensor.c: */
cannam@127	422 #define STRUCT_HACK_KR
cannam@127	423 #undef STRUCT_HACK_C99
cannam@127	424
cannam@127	425 typedef struct {
cannam@127	426 INT n;
cannam@127	427 INT is; /* input stride */
cannam@127	428 INT os; /* output stride */
cannam@127	429 } iodim;
cannam@127	430
cannam@127	431 typedef struct {
cannam@127	432 int rnk;
cannam@127	433 #if defined(STRUCT_HACK_KR)
cannam@127	434 iodim dims[1];
cannam@127	435 #elif defined(STRUCT_HACK_C99)
cannam@127	436 iodim dims[];
cannam@127	437 #else
cannam@127	438 iodim *dims;
cannam@127	439 #endif
cannam@127	440 } tensor;
cannam@127	441
cannam@127	442 /*
cannam@127	443 Definition of rank -infinity.
cannam@127	444 This definition has the property that if you want rank 0 or 1,
cannam@127	445 you can simply test for rank <= 1. This is a common case.
cannam@127	446
cannam@127	447 A tensor of rank -infinity has size 0.
cannam@127	448 */
cannam@127	449 #define RNK_MINFTY INT_MAX
cannam@127	450 #define FINITE_RNK(rnk) ((rnk) != RNK_MINFTY)
cannam@127	451
cannam@127	452 typedef enum { INPLACE_IS, INPLACE_OS } inplace_kind;
cannam@127	453
cannam@127	454 tensor *X(mktensor)(int rnk);
cannam@127	455 tensor *X(mktensor_0d)(void);
cannam@127	456 tensor *X(mktensor_1d)(INT n, INT is, INT os);
cannam@127	457 tensor *X(mktensor_2d)(INT n0, INT is0, INT os0,
cannam@127	458 INT n1, INT is1, INT os1);
cannam@127	459 tensor *X(mktensor_3d)(INT n0, INT is0, INT os0,
cannam@127	460 INT n1, INT is1, INT os1,
cannam@127	461 INT n2, INT is2, INT os2);
cannam@127	462 tensor *X(mktensor_4d)(INT n0, INT is0, INT os0,
cannam@127	463 INT n1, INT is1, INT os1,
cannam@127	464 INT n2, INT is2, INT os2,
cannam@127	465 INT n3, INT is3, INT os3);
cannam@127	466 tensor *X(mktensor_5d)(INT n0, INT is0, INT os0,
cannam@127	467 INT n1, INT is1, INT os1,
cannam@127	468 INT n2, INT is2, INT os2,
cannam@127	469 INT n3, INT is3, INT os3,
cannam@127	470 INT n4, INT is4, INT os4);
cannam@127	471 INT X(tensor_sz)(const tensor *sz);
cannam@127	472 void X(tensor_md5)(md5 p, const tensor t);
cannam@127	473 INT X(tensor_max_index)(const tensor *sz);
cannam@127	474 INT X(tensor_min_istride)(const tensor *sz);
cannam@127	475 INT X(tensor_min_ostride)(const tensor *sz);
cannam@127	476 INT X(tensor_min_stride)(const tensor *sz);
cannam@127	477 int X(tensor_inplace_strides)(const tensor *sz);
cannam@127	478 int X(tensor_inplace_strides2)(const tensor a, const tensor b);
cannam@127	479 int X(tensor_strides_decrease)(const tensor sz, const tensor vecsz,
cannam@127	480 inplace_kind k);
cannam@127	481 tensor X(tensor_copy)(const tensor sz);
cannam@127	482 int X(tensor_kosherp)(const tensor *x);
cannam@127	483
cannam@127	484 tensor X(tensor_copy_inplace)(const tensor sz, inplace_kind k);
cannam@127	485 tensor X(tensor_copy_except)(const tensor sz, int except_dim);
cannam@127	486 tensor X(tensor_copy_sub)(const tensor sz, int start_dim, int rnk);
cannam@127	487 tensor X(tensor_compress)(const tensor sz);
cannam@127	488 tensor X(tensor_compress_contiguous)(const tensor sz);
cannam@127	489 tensor X(tensor_append)(const tensor a, const tensor *b);
cannam@127	490 void X(tensor_split)(const tensor sz, tensor a, int a_rnk, tensor *b);
cannam@127	491 int X(tensor_tornk1)(const tensor t, INT n, INT is, INT os);
cannam@127	492 void X(tensor_destroy)(tensor *sz);
cannam@127	493 void X(tensor_destroy2)(tensor a, tensor b);
cannam@127	494 void X(tensor_destroy4)(tensor a, tensor b, tensor c, tensor d);
cannam@127	495 void X(tensor_print)(const tensor sz, printer p);
cannam@127	496 int X(dimcmp)(const iodim a, const iodim b);
cannam@127	497 int X(tensor_equal)(const tensor a, const tensor b);
cannam@127	498 int X(tensor_inplace_locations)(const tensor sz, const tensor vecsz);
cannam@127	499
cannam@127	500 /*-----------------------------------------------------------------------*/
cannam@127	501 /* problem.c: */
cannam@127	502 enum {
cannam@127	503 /* a problem that cannot be solved */
cannam@127	504 PROBLEM_UNSOLVABLE,
cannam@127	505
cannam@127	506 PROBLEM_DFT,
cannam@127	507 PROBLEM_RDFT,
cannam@127	508 PROBLEM_RDFT2,
cannam@127	509
cannam@127	510 /* for mpi/ subdirectory */
cannam@127	511 PROBLEM_MPI_DFT,
cannam@127	512 PROBLEM_MPI_RDFT,
cannam@127	513 PROBLEM_MPI_RDFT2,
cannam@127	514 PROBLEM_MPI_TRANSPOSE,
cannam@127	515
cannam@127	516 PROBLEM_LAST
cannam@127	517 };
cannam@127	518
cannam@127	519 typedef struct {
cannam@127	520 int problem_kind;
cannam@127	521 void (*hash) (const problem *ego, md5 *p);
cannam@127	522 void (*zero) (const problem *ego);
cannam@127	523 void (*print) (const problem *ego, printer *p);
cannam@127	524 void (*destroy) (problem *ego);
cannam@127	525 } problem_adt;
cannam@127	526
cannam@127	527 struct problem_s {
cannam@127	528 const problem_adt *adt;
cannam@127	529 };
cannam@127	530
cannam@127	531 problem *X(mkproblem)(size_t sz, const problem_adt *adt);
cannam@127	532 void X(problem_destroy)(problem *ego);
cannam@127	533 problem *X(mkproblem_unsolvable)(void);
cannam@127	534
cannam@127	535 /*-----------------------------------------------------------------------*/
cannam@127	536 /* print.c */
cannam@127	537 struct printer_s {
cannam@127	538 void (*print)(printer *p, const char *format, ...);
cannam@127	539 void (*vprint)(printer *p, const char *format, va_list ap);
cannam@127	540 void (*putchr)(printer *p, char c);
cannam@127	541 void (*cleanup)(printer *p);
cannam@127	542 int indent;
cannam@127	543 int indent_incr;
cannam@127	544 };
cannam@127	545
cannam@127	546 printer *X(mkprinter)(size_t size,
cannam@127	547 void (*putchr)(printer *p, char c),
cannam@127	548 void (*cleanup)(printer *p));
cannam@127	549 IFFTW_EXTERN void X(printer_destroy)(printer *p);
cannam@127	550
cannam@127	551 /*-----------------------------------------------------------------------*/
cannam@127	552 /* scan.c */
cannam@127	553 struct scanner_s {
cannam@127	554 int (*scan)(scanner *sc, const char *format, ...);
cannam@127	555 int (*vscan)(scanner *sc, const char *format, va_list ap);
cannam@127	556 int (*getchr)(scanner *sc);
cannam@127	557 int ungotc;
cannam@127	558 };
cannam@127	559
cannam@127	560 scanner *X(mkscanner)(size_t size, int (*getchr)(scanner *sc));
cannam@127	561 void X(scanner_destroy)(scanner *sc);
cannam@127	562
cannam@127	563 /*-----------------------------------------------------------------------*/
cannam@127	564 /* plan.c: */
cannam@127	565
cannam@127	566 enum wakefulness {
cannam@127	567 SLEEPY,
cannam@127	568 AWAKE_ZERO,
cannam@127	569 AWAKE_SQRTN_TABLE,
cannam@127	570 AWAKE_SINCOS
cannam@127	571 };
cannam@127	572
cannam@127	573 typedef struct {
cannam@127	574 void (*solve)(const plan *ego, const problem *p);
cannam@127	575 void (*awake)(plan *ego, enum wakefulness wakefulness);
cannam@127	576 void (*print)(const plan *ego, printer *p);
cannam@127	577 void (*destroy)(plan *ego);
cannam@127	578 } plan_adt;
cannam@127	579
cannam@127	580 struct plan_s {
cannam@127	581 const plan_adt *adt;
cannam@127	582 opcnt ops;
cannam@127	583 double pcost;
cannam@127	584 enum wakefulness wakefulness; /* used for debugging only */
cannam@127	585 int could_prune_now_p;
cannam@127	586 };
cannam@127	587
cannam@127	588 plan *X(mkplan)(size_t size, const plan_adt *adt);
cannam@127	589 void X(plan_destroy_internal)(plan *ego);
cannam@127	590 IFFTW_EXTERN void X(plan_awake)(plan *ego, enum wakefulness wakefulness);
cannam@127	591 void X(plan_null_destroy)(plan *ego);
cannam@127	592
cannam@127	593 /*-----------------------------------------------------------------------*/
cannam@127	594 /* solver.c: */
cannam@127	595 typedef struct {
cannam@127	596 int problem_kind;
cannam@127	597 plan *(*mkplan)(const solver *ego, const problem *p, planner *plnr);
cannam@127	598 void (*destroy)(solver *ego);
cannam@127	599 } solver_adt;
cannam@127	600
cannam@127	601 struct solver_s {
cannam@127	602 const solver_adt *adt;
cannam@127	603 int refcnt;
cannam@127	604 };
cannam@127	605
cannam@127	606 solver *X(mksolver)(size_t size, const solver_adt *adt);
cannam@127	607 void X(solver_use)(solver *ego);
cannam@127	608 void X(solver_destroy)(solver *ego);
cannam@127	609 void X(solver_register)(planner *plnr, solver *s);
cannam@127	610
cannam@127	611 /* shorthand */
cannam@127	612 #define MKSOLVER(type, adt) (type *)X(mksolver)(sizeof(type), adt)
cannam@127	613
cannam@127	614 /*-----------------------------------------------------------------------*/
cannam@127	615 /* planner.c */
cannam@127	616
cannam@127	617 typedef struct slvdesc_s {
cannam@127	618 solver *slv;
cannam@127	619 const char *reg_nam;
cannam@127	620 unsigned nam_hash;
cannam@127	621 int reg_id;
cannam@127	622 int next_for_same_problem_kind;
cannam@127	623 } slvdesc;
cannam@127	624
cannam@127	625 typedef struct solution_s solution; /* opaque */
cannam@127	626
cannam@127	627 /* interpretation of L and U:
cannam@127	628
cannam@127	629 - if it returns a plan, the planner guarantees that all applicable
cannam@127	630 plans at least as impatient as U have been tried, and that each
cannam@127	631 plan in the solution is at least as impatient as L.
cannam@127	632
cannam@127	633 - if it returns 0, the planner guarantees to have tried all solvers
cannam@127	634 at least as impatient as L, and that none of them was applicable.
cannam@127	635
cannam@127	636 The structure is packed to fit into 64 bits.
cannam@127	637 */
cannam@127	638
cannam@127	639 typedef struct {
cannam@127	640 unsigned l:20;
cannam@127	641 unsigned hash_info:3;
cannam@127	642 # define BITS_FOR_TIMELIMIT 9
cannam@127	643 unsigned timelimit_impatience:BITS_FOR_TIMELIMIT;
cannam@127	644 unsigned u:20;
cannam@127	645
cannam@127	646 /* abstraction break: we store the solver here to pad the
cannam@127	647 structure to 64 bits. Otherwise, the struct is padded to 64
cannam@127	648 bits anyway, and another word is allocated for slvndx. */
cannam@127	649 # define BITS_FOR_SLVNDX 12
cannam@127	650 unsigned slvndx:BITS_FOR_SLVNDX;
cannam@127	651 } flags_t;
cannam@127	652
cannam@127	653 /* impatience flags */
cannam@127	654 enum {
cannam@127	655 BELIEVE_PCOST = 0x0001,
cannam@127	656 ESTIMATE = 0x0002,
cannam@127	657 NO_DFT_R2HC = 0x0004,
cannam@127	658 NO_SLOW = 0x0008,
cannam@127	659 NO_VRECURSE = 0x0010,
cannam@127	660 NO_INDIRECT_OP = 0x0020,
cannam@127	661 NO_LARGE_GENERIC = 0x0040,
cannam@127	662 NO_RANK_SPLITS = 0x0080,
cannam@127	663 NO_VRANK_SPLITS = 0x0100,
cannam@127	664 NO_NONTHREADED = 0x0200,
cannam@127	665 NO_BUFFERING = 0x0400,
cannam@127	666 NO_FIXED_RADIX_LARGE_N = 0x0800,
cannam@127	667 NO_DESTROY_INPUT = 0x1000,
cannam@127	668 NO_SIMD = 0x2000,
cannam@127	669 CONSERVE_MEMORY = 0x4000,
cannam@127	670 NO_DHT_R2HC = 0x8000,
cannam@127	671 NO_UGLY = 0x10000,
cannam@127	672 ALLOW_PRUNING = 0x20000
cannam@127	673 };
cannam@127	674
cannam@127	675 /* hashtable information */
cannam@127	676 enum {
cannam@127	677 BLESSING = 0x1u, /* save this entry */
cannam@127	678 H_VALID = 0x2u, /* valid hashtable entry */
cannam@127	679 H_LIVE = 0x4u /* entry is nonempty, implies H_VALID */
cannam@127	680 };
cannam@127	681
cannam@127	682 #define PLNR_L(plnr) ((plnr)->flags.l)
cannam@127	683 #define PLNR_U(plnr) ((plnr)->flags.u)
cannam@127	684 #define PLNR_TIMELIMIT_IMPATIENCE(plnr) ((plnr)->flags.timelimit_impatience)
cannam@127	685
cannam@127	686 #define ESTIMATEP(plnr) (PLNR_U(plnr) & ESTIMATE)
cannam@127	687 #define BELIEVE_PCOSTP(plnr) (PLNR_U(plnr) & BELIEVE_PCOST)
cannam@127	688 #define ALLOW_PRUNINGP(plnr) (PLNR_U(plnr) & ALLOW_PRUNING)
cannam@127	689
cannam@127	690 #define NO_INDIRECT_OP_P(plnr) (PLNR_L(plnr) & NO_INDIRECT_OP)
cannam@127	691 #define NO_LARGE_GENERICP(plnr) (PLNR_L(plnr) & NO_LARGE_GENERIC)
cannam@127	692 #define NO_RANK_SPLITSP(plnr) (PLNR_L(plnr) & NO_RANK_SPLITS)
cannam@127	693 #define NO_VRANK_SPLITSP(plnr) (PLNR_L(plnr) & NO_VRANK_SPLITS)
cannam@127	694 #define NO_VRECURSEP(plnr) (PLNR_L(plnr) & NO_VRECURSE)
cannam@127	695 #define NO_DFT_R2HCP(plnr) (PLNR_L(plnr) & NO_DFT_R2HC)
cannam@127	696 #define NO_SLOWP(plnr) (PLNR_L(plnr) & NO_SLOW)
cannam@127	697 #define NO_UGLYP(plnr) (PLNR_L(plnr) & NO_UGLY)
cannam@127	698 #define NO_FIXED_RADIX_LARGE_NP(plnr) \
cannam@127	699 (PLNR_L(plnr) & NO_FIXED_RADIX_LARGE_N)
cannam@127	700 #define NO_NONTHREADEDP(plnr) \
cannam@127	701 ((PLNR_L(plnr) & NO_NONTHREADED) && (plnr)->nthr > 1)
cannam@127	702
cannam@127	703 #define NO_DESTROY_INPUTP(plnr) (PLNR_L(plnr) & NO_DESTROY_INPUT)
cannam@127	704 #define NO_SIMDP(plnr) (PLNR_L(plnr) & NO_SIMD)
cannam@127	705 #define CONSERVE_MEMORYP(plnr) (PLNR_L(plnr) & CONSERVE_MEMORY)
cannam@127	706 #define NO_DHT_R2HCP(plnr) (PLNR_L(plnr) & NO_DHT_R2HC)
cannam@127	707 #define NO_BUFFERINGP(plnr) (PLNR_L(plnr) & NO_BUFFERING)
cannam@127	708
cannam@127	709 typedef enum { FORGET_ACCURSED, FORGET_EVERYTHING } amnesia;
cannam@127	710
cannam@127	711 typedef enum {
cannam@127	712 /* WISDOM_NORMAL: planner may or may not use wisdom */
cannam@127	713 WISDOM_NORMAL,
cannam@127	714
cannam@127	715 /* WISDOM_ONLY: planner must use wisdom and must avoid searching */
cannam@127	716 WISDOM_ONLY,
cannam@127	717
cannam@127	718 /* WISDOM_IS_BOGUS: planner must return 0 as quickly as possible */
cannam@127	719 WISDOM_IS_BOGUS,
cannam@127	720
cannam@127	721 /* WISDOM_IGNORE_INFEASIBLE: planner ignores infeasible wisdom */
cannam@127	722 WISDOM_IGNORE_INFEASIBLE,
cannam@127	723
cannam@127	724 /* WISDOM_IGNORE_ALL: planner ignores all */
cannam@127	725 WISDOM_IGNORE_ALL
cannam@127	726 } wisdom_state_t;
cannam@127	727
cannam@127	728 typedef struct {
cannam@127	729 void (*register_solver)(planner *ego, solver *s);
cannam@127	730 plan *(*mkplan)(planner *ego, const problem *p);
cannam@127	731 void (*forget)(planner *ego, amnesia a);
cannam@127	732 void (*exprt)(planner *ego, printer *p); /* ``export'' is a reserved
cannam@127	733 word in C++. */
cannam@127	734 int (*imprt)(planner *ego, scanner *sc);
cannam@127	735 } planner_adt;
cannam@127	736
cannam@127	737 /* hash table of solutions */
cannam@127	738 typedef struct {
cannam@127	739 solution *solutions;
cannam@127	740 unsigned hashsiz, nelem;
cannam@127	741
cannam@127	742 /* statistics */
cannam@127	743 int lookup, succ_lookup, lookup_iter;
cannam@127	744 int insert, insert_iter, insert_unknown;
cannam@127	745 int nrehash;
cannam@127	746 } hashtab;
cannam@127	747
cannam@127	748 typedef enum { COST_SUM, COST_MAX } cost_kind;
cannam@127	749
cannam@127	750 struct planner_s {
cannam@127	751 const planner_adt *adt;
cannam@127	752 void (*hook)(struct planner_s *plnr, plan *pln,
cannam@127	753 const problem *p, int optimalp);
cannam@127	754 double (*cost_hook)(const problem *p, double t, cost_kind k);
cannam@127	755 int (*wisdom_ok_hook)(const problem *p, flags_t flags);
cannam@127	756 void (*nowisdom_hook)(const problem *p);
cannam@127	757 wisdom_state_t (*bogosity_hook)(wisdom_state_t state, const problem *p);
cannam@127	758
cannam@127	759 /* solver descriptors */
cannam@127	760 slvdesc *slvdescs;
cannam@127	761 unsigned nslvdesc, slvdescsiz;
cannam@127	762 const char *cur_reg_nam;
cannam@127	763 int cur_reg_id;
cannam@127	764 int slvdescs_for_problem_kind[PROBLEM_LAST];
cannam@127	765
cannam@127	766 wisdom_state_t wisdom_state;
cannam@127	767
cannam@127	768 hashtab htab_blessed;
cannam@127	769 hashtab htab_unblessed;
cannam@127	770
cannam@127	771 int nthr;
cannam@127	772 flags_t flags;
cannam@127	773
cannam@127	774 crude_time start_time;
cannam@127	775 double timelimit; /* elapsed_since(start_time) at which to bail out */
cannam@127	776 int timed_out; /* whether most recent search timed out */
cannam@127	777 int need_timeout_check;
cannam@127	778
cannam@127	779 /* various statistics */
cannam@127	780 int nplan; /* number of plans evaluated */
cannam@127	781 double pcost, epcost; /* total pcost of measured/estimated plans */
cannam@127	782 int nprob; /* number of problems evaluated */
cannam@127	783 };
cannam@127	784
cannam@127	785 planner *X(mkplanner)(void);
cannam@127	786 void X(planner_destroy)(planner *ego);
cannam@127	787
cannam@127	788 /*
cannam@127	789 Iterate over all solvers. Read:
cannam@127	790
cannam@127	791 @article{ baker93iterators,
cannam@127	792 author = "Henry G. Baker, Jr.",
cannam@127	793 title = "Iterators: Signs of Weakness in Object-Oriented Languages",
cannam@127	794 journal = "{ACM} {OOPS} Messenger",
cannam@127	795 volume = "4",
cannam@127	796 number = "3",
cannam@127	797 pages = "18--25"
cannam@127	798 }
cannam@127	799 */
cannam@127	800 #define FORALL_SOLVERS(ego, s, p, what) \
cannam@127	801 { \
cannam@127	802 unsigned _cnt; \
cannam@127	803 for (_cnt = 0; _cnt < ego->nslvdesc; ++_cnt) { \
cannam@127	804 slvdesc *p = ego->slvdescs + _cnt; \
cannam@127	805 solver *s = p->slv; \
cannam@127	806 what; \
cannam@127	807 } \
cannam@127	808 }
cannam@127	809
cannam@127	810 #define FORALL_SOLVERS_OF_KIND(kind, ego, s, p, what) \
cannam@127	811 { \
cannam@127	812 int _cnt = ego->slvdescs_for_problem_kind[kind]; \
cannam@127	813 while (_cnt >= 0) { \
cannam@127	814 slvdesc *p = ego->slvdescs + _cnt; \
cannam@127	815 solver *s = p->slv; \
cannam@127	816 what; \
cannam@127	817 _cnt = p->next_for_same_problem_kind; \
cannam@127	818 } \
cannam@127	819 }
cannam@127	820
cannam@127	821
cannam@127	822 /* make plan, destroy problem */
cannam@127	823 plan *X(mkplan_d)(planner *ego, problem *p);
cannam@127	824 plan *X(mkplan_f_d)(planner *ego, problem *p,
cannam@127	825 unsigned l_set, unsigned u_set, unsigned u_reset);
cannam@127	826
cannam@127	827 /*-----------------------------------------------------------------------*/
cannam@127	828 /* stride.c: */
cannam@127	829
cannam@127	830 /* If PRECOMPUTE_ARRAY_INDICES is defined, precompute all strides. */
cannam@127	831 #if (defined(__i386__) || defined(__x86_64__) || _M_IX86 >= 500) && !defined(FFTW_LDOUBLE)
cannam@127	832 #define PRECOMPUTE_ARRAY_INDICES
cannam@127	833 #endif
cannam@127	834
cannam@127	835 extern const INT X(an_INT_guaranteed_to_be_zero);
cannam@127	836
cannam@127	837 #ifdef PRECOMPUTE_ARRAY_INDICES
cannam@127	838 typedef INT *stride;
cannam@127	839 #define WS(stride, i) (stride[i])
cannam@127	840 extern stride X(mkstride)(INT n, INT s);
cannam@127	841 void X(stride_destroy)(stride p);
cannam@127	842 /* hackery to prevent the compiler from copying the strides array
cannam@127	843 onto the stack */
cannam@127	844 #define MAKE_VOLATILE_STRIDE(nptr, x) (x) = (x) + X(an_INT_guaranteed_to_be_zero)
cannam@127	845 #else
cannam@127	846
cannam@127	847 typedef INT stride;
cannam@127	848 #define WS(stride, i) (stride * i)
cannam@127	849 #define fftwf_mkstride(n, stride) stride
cannam@127	850 #define fftw_mkstride(n, stride) stride
cannam@127	851 #define fftwl_mkstride(n, stride) stride
cannam@127	852 #define fftwf_stride_destroy(p) ((void) p)
cannam@127	853 #define fftw_stride_destroy(p) ((void) p)
cannam@127	854 #define fftwl_stride_destroy(p) ((void) p)
cannam@127	855
cannam@127	856 /* hackery to prevent the compiler from ``optimizing'' induction
cannam@127	857 variables in codelet loops. The problem is that for each K and for
cannam@127	858 each expression of the form P[I + STRIDE * K] in a loop, most
cannam@127	859 compilers will try to lift an induction variable PK := &P[I + STRIDE * K].
cannam@127	860 For large values of K this behavior overflows the
cannam@127	861 register set, which is likely worse than doing the index computation
cannam@127	862 in the first place.
cannam@127	863
cannam@127	864 If we guess that there are more than
cannam@127	865 ESTIMATED_AVAILABLE_INDEX_REGISTERS such pointers, we deliberately confuse
cannam@127	866 the compiler by setting STRIDE ^= ZERO, where ZERO is a value guaranteed to
cannam@127	867 be 0, but the compiler does not know this.
cannam@127	868
cannam@127	869 16 registers ought to be enough for anybody, or so the amd64 and ARM ISA's
cannam@127	870 seem to imply.
cannam@127	871 */
cannam@127	872 #define ESTIMATED_AVAILABLE_INDEX_REGISTERS 16
cannam@127	873 #define MAKE_VOLATILE_STRIDE(nptr, x) \
cannam@127	874 (nptr <= ESTIMATED_AVAILABLE_INDEX_REGISTERS ? \
cannam@127	875 0 : \
cannam@127	876 ((x) = (x) ^ X(an_INT_guaranteed_to_be_zero)))
cannam@127	877 #endif /* PRECOMPUTE_ARRAY_INDICES */
cannam@127	878
cannam@127	879 /*-----------------------------------------------------------------------*/
cannam@127	880 /* solvtab.c */
cannam@127	881
cannam@127	882 struct solvtab_s { void (*reg)(planner *); const char *reg_nam; };
cannam@127	883 typedef struct solvtab_s solvtab[];
cannam@127	884 void X(solvtab_exec)(const solvtab tbl, planner *p);
cannam@127	885 #define SOLVTAB(s) { s, STRINGIZE(s) }
cannam@127	886 #define SOLVTAB_END { 0, 0 }
cannam@127	887
cannam@127	888 /*-----------------------------------------------------------------------*/
cannam@127	889 /* pickdim.c */
cannam@127	890 int X(pickdim)(int which_dim, const int *buddies, size_t nbuddies,
cannam@127	891 const tensor *sz, int oop, int *dp);
cannam@127	892
cannam@127	893 /*-----------------------------------------------------------------------*/
cannam@127	894 /* twiddle.c */
cannam@127	895 /* little language to express twiddle factors computation */
cannam@127	896 enum { TW_COS = 0, TW_SIN = 1, TW_CEXP = 2, TW_NEXT = 3,
cannam@127	897 TW_FULL = 4, TW_HALF = 5 };
cannam@127	898
cannam@127	899 typedef struct {
cannam@127	900 unsigned char op;
cannam@127	901 signed char v;
cannam@127	902 short i;
cannam@127	903 } tw_instr;
cannam@127	904
cannam@127	905 typedef struct twid_s {
cannam@127	906 R *W; /* array of twiddle factors */
cannam@127	907 INT n, r, m; /* transform order, radix, # twiddle rows */
cannam@127	908 int refcnt;
cannam@127	909 const tw_instr *instr;
cannam@127	910 struct twid_s *cdr;
cannam@127	911 enum wakefulness wakefulness;
cannam@127	912 } twid;
cannam@127	913
cannam@127	914 INT X(twiddle_length)(INT r, const tw_instr *p);
cannam@127	915 void X(twiddle_awake)(enum wakefulness wakefulness,
cannam@127	916 twid **pp, const tw_instr *instr, INT n, INT r, INT m);
cannam@127	917
cannam@127	918 /*-----------------------------------------------------------------------*/
cannam@127	919 /* trig.c */
cannam@127	920 #if defined(TRIGREAL_IS_LONG_DOUBLE)
cannam@127	921 typedef long double trigreal;
cannam@127	922 #elif defined(TRIGREAL_IS_QUAD)
cannam@127	923 typedef __float128 trigreal;
cannam@127	924 #else
cannam@127	925 typedef double trigreal;
cannam@127	926 #endif
cannam@127	927
cannam@127	928 typedef struct triggen_s triggen;
cannam@127	929
cannam@127	930 struct triggen_s {
cannam@127	931 void (*cexp)(triggen *t, INT m, R *result);
cannam@127	932 void (*cexpl)(triggen *t, INT m, trigreal *result);
cannam@127	933 void (*rotate)(triggen *p, INT m, R xr, R xi, R *res);
cannam@127	934
cannam@127	935 INT twshft;
cannam@127	936 INT twradix;
cannam@127	937 INT twmsk;
cannam@127	938 trigreal W0, W1;
cannam@127	939 INT n;
cannam@127	940 };
cannam@127	941
cannam@127	942 triggen *X(mktriggen)(enum wakefulness wakefulness, INT n);
cannam@127	943 void X(triggen_destroy)(triggen *p);
cannam@127	944
cannam@127	945 /*-----------------------------------------------------------------------*/
cannam@127	946 /* primes.c: */
cannam@127	947
cannam@127	948 #define MULMOD(x, y, p) \
cannam@127	949 (((x) <= 92681 - (y)) ? ((x) * (y)) % (p) : X(safe_mulmod)(x, y, p))
cannam@127	950
cannam@127	951 INT X(safe_mulmod)(INT x, INT y, INT p);
cannam@127	952 INT X(power_mod)(INT n, INT m, INT p);
cannam@127	953 INT X(find_generator)(INT p);
cannam@127	954 INT X(first_divisor)(INT n);
cannam@127	955 int X(is_prime)(INT n);
cannam@127	956 INT X(next_prime)(INT n);
cannam@127	957 int X(factors_into)(INT n, const INT *primes);
cannam@127	958 int X(factors_into_small_primes)(INT n);
cannam@127	959 INT X(choose_radix)(INT r, INT n);
cannam@127	960 INT X(isqrt)(INT n);
cannam@127	961 INT X(modulo)(INT a, INT n);
cannam@127	962
cannam@127	963 #define GENERIC_MIN_BAD 173 /* min prime for which generic becomes bad */
cannam@127	964
cannam@127	965 /* thresholds below which certain solvers are considered SLOW. These are guesses
cannam@127	966 believed to be conservative */
cannam@127	967 #define GENERIC_MAX_SLOW 16
cannam@127	968 #define RADER_MAX_SLOW 32
cannam@127	969 #define BLUESTEIN_MAX_SLOW 24
cannam@127	970
cannam@127	971 /*-----------------------------------------------------------------------*/
cannam@127	972 /* rader.c: */
cannam@127	973 typedef struct rader_tls rader_tl;
cannam@127	974
cannam@127	975 void X(rader_tl_insert)(INT k1, INT k2, INT k3, R *W, rader_tl **tl);
cannam@127	976 R *X(rader_tl_find)(INT k1, INT k2, INT k3, rader_tl *t);
cannam@127	977 void X(rader_tl_delete)(R *W, rader_tl **tl);
cannam@127	978
cannam@127	979 /*-----------------------------------------------------------------------*/
cannam@127	980 /* copy/transposition routines */
cannam@127	981
cannam@127	982 /* lower bound to the cache size, for tiled routines */
cannam@127	983 #define CACHESIZE 8192
cannam@127	984
cannam@127	985 INT X(compute_tilesz)(INT vl, int how_many_tiles_in_cache);
cannam@127	986
cannam@127	987 void X(tile2d)(INT n0l, INT n0u, INT n1l, INT n1u, INT tilesz,
cannam@127	988 void (*f)(INT n0l, INT n0u, INT n1l, INT n1u, void *args),
cannam@127	989 void *args);
cannam@127	990 void X(cpy1d)(R *I, R *O, INT n0, INT is0, INT os0, INT vl);
cannam@127	991 void X(zero1d_pair)(R *O0, R *O1, INT n0, INT os0);
cannam@127	992 void X(cpy2d)(R *I, R *O,
cannam@127	993 INT n0, INT is0, INT os0,
cannam@127	994 INT n1, INT is1, INT os1,
cannam@127	995 INT vl);
cannam@127	996 void X(cpy2d_ci)(R *I, R *O,
cannam@127	997 INT n0, INT is0, INT os0,
cannam@127	998 INT n1, INT is1, INT os1,
cannam@127	999 INT vl);
cannam@127	1000 void X(cpy2d_co)(R *I, R *O,
cannam@127	1001 INT n0, INT is0, INT os0,
cannam@127	1002 INT n1, INT is1, INT os1,
cannam@127	1003 INT vl);
cannam@127	1004 void X(cpy2d_tiled)(R *I, R *O,
cannam@127	1005 INT n0, INT is0, INT os0,
cannam@127	1006 INT n1, INT is1, INT os1,
cannam@127	1007 INT vl);
cannam@127	1008 void X(cpy2d_tiledbuf)(R *I, R *O,
cannam@127	1009 INT n0, INT is0, INT os0,
cannam@127	1010 INT n1, INT is1, INT os1,
cannam@127	1011 INT vl);
cannam@127	1012 void X(cpy2d_pair)(R *I0, R *I1, R *O0, R *O1,
cannam@127	1013 INT n0, INT is0, INT os0,
cannam@127	1014 INT n1, INT is1, INT os1);
cannam@127	1015 void X(cpy2d_pair_ci)(R *I0, R *I1, R *O0, R *O1,
cannam@127	1016 INT n0, INT is0, INT os0,
cannam@127	1017 INT n1, INT is1, INT os1);
cannam@127	1018 void X(cpy2d_pair_co)(R *I0, R *I1, R *O0, R *O1,
cannam@127	1019 INT n0, INT is0, INT os0,
cannam@127	1020 INT n1, INT is1, INT os1);
cannam@127	1021
cannam@127	1022 void X(transpose)(R *I, INT n, INT s0, INT s1, INT vl);
cannam@127	1023 void X(transpose_tiled)(R *I, INT n, INT s0, INT s1, INT vl);
cannam@127	1024 void X(transpose_tiledbuf)(R *I, INT n, INT s0, INT s1, INT vl);
cannam@127	1025
cannam@127	1026 typedef void (*transpose_func)(R *I, INT n, INT s0, INT s1, INT vl);
cannam@127	1027 typedef void (*cpy2d_func)(R *I, R *O,
cannam@127	1028 INT n0, INT is0, INT os0,
cannam@127	1029 INT n1, INT is1, INT os1,
cannam@127	1030 INT vl);
cannam@127	1031
cannam@127	1032 /*-----------------------------------------------------------------------*/
cannam@127	1033 /* misc stuff */
cannam@127	1034 void X(null_awake)(plan *ego, enum wakefulness wakefulness);
cannam@127	1035 double X(iestimate_cost)(const planner *, const plan *, const problem *);
cannam@127	1036
cannam@127	1037 #ifdef FFTW_RANDOM_ESTIMATOR
cannam@127	1038 extern unsigned X(random_estimate_seed);
cannam@127	1039 #endif
cannam@127	1040
cannam@127	1041 double X(measure_execution_time)(const planner *plnr,
cannam@127	1042 plan *pln, const problem *p);
cannam@127	1043 IFFTW_EXTERN int X(ialignment_of)(R *p);
cannam@127	1044 unsigned X(hash)(const char *s);
cannam@127	1045 INT X(nbuf)(INT n, INT vl, INT maxnbuf);
cannam@127	1046 int X(nbuf_redundant)(INT n, INT vl, size_t which,
cannam@127	1047 const INT *maxnbuf, size_t nmaxnbuf);
cannam@127	1048 INT X(bufdist)(INT n, INT vl);
cannam@127	1049 int X(toobig)(INT n);
cannam@127	1050 int X(ct_uglyp)(INT min_n, INT v, INT n, INT r);
cannam@127	1051
cannam@127	1052 #if HAVE_SIMD
cannam@127	1053 R *X(taint)(R *p, INT s);
cannam@127	1054 R *X(join_taint)(R *p1, R *p2);
cannam@127	1055 #define TAINT(p, s) X(taint)(p, s)
cannam@127	1056 #define UNTAINT(p) ((R *) (((uintptr_t) (p)) & ~(uintptr_t)3))
cannam@127	1057 #define TAINTOF(p) (((uintptr_t)(p)) & 3)
cannam@127	1058 #define JOIN_TAINT(p1, p2) X(join_taint)(p1, p2)
cannam@127	1059 #else
cannam@127	1060 #define TAINT(p, s) (p)
cannam@127	1061 #define UNTAINT(p) (p)
cannam@127	1062 #define TAINTOF(p) 0
cannam@127	1063 #define JOIN_TAINT(p1, p2) p1
cannam@127	1064 #endif
cannam@127	1065
cannam@127	1066 #ifdef FFTW_DEBUG_ALIGNMENT
cannam@127	1067 # define ASSERT_ALIGNED_DOUBLE { \
cannam@127	1068 double __foo; \
cannam@127	1069 CK(!(((uintptr_t) &__foo) & 0x7)); \
cannam@127	1070 }
cannam@127	1071 #else
cannam@127	1072 # define ASSERT_ALIGNED_DOUBLE
cannam@127	1073 #endif /* FFTW_DEBUG_ALIGNMENT */
cannam@127	1074
cannam@127	1075
cannam@127	1076
cannam@127	1077 /*-----------------------------------------------------------------------*/
cannam@127	1078 /* macros used in codelets to reduce source code size */
cannam@127	1079
cannam@127	1080 typedef R E; /* internal precision of codelets. */
cannam@127	1081
cannam@127	1082 #if defined(FFTW_LDOUBLE)
cannam@127	1083 # define K(x) ((E) x##L)
cannam@127	1084 #elif defined(FFTW_QUAD)
cannam@127	1085 # define K(x) ((E) x##Q)
cannam@127	1086 #else
cannam@127	1087 # define K(x) ((E) x)
cannam@127	1088 #endif
cannam@127	1089 #define DK(name, value) const E name = K(value)
cannam@127	1090
cannam@127	1091 /* FMA macros */
cannam@127	1092
cannam@127	1093 #if defined(__GNUC__) && (defined(__powerpc__) || defined(__ppc__) || defined(_POWER))
cannam@127	1094 /* The obvious expression a * b + c does not work. If both x = a * b
cannam@127	1095 + c and y = a * b - c appear in the source, gcc computes t = a * b,
cannam@127	1096 x = t + c, y = t - c, thus destroying the fma.
cannam@127	1097
cannam@127	1098 This peculiar coding seems to do the right thing on all of
cannam@127	1099 gcc-2.95, gcc-3.1, gcc-3.2, and gcc-3.3. It does the right thing
cannam@127	1100 on gcc-3.4 -fno-web (because the ``web'' pass splits the variable
cannam@127	1101 `x' for the single-assignment form).
cannam@127	1102
cannam@127	1103 However, gcc-4.0 is a formidable adversary which succeeds in
cannam@127	1104 pessimizing two fma's into one multiplication and two additions.
cannam@127	1105 It does it very early in the game---before the optimization passes
cannam@127	1106 even start. The only real workaround seems to use fake inline asm
cannam@127	1107 such as
cannam@127	1108
cannam@127	1109 asm ("# confuse gcc %0" : "=f"(a) : "0"(a));
cannam@127	1110 return a * b + c;
cannam@127	1111
cannam@127	1112 in each of the FMA, FMS, FNMA, and FNMS functions. However, this
cannam@127	1113 does not solve the problem either, because two equal asm statements
cannam@127	1114 count as a common subexpression! One must use different fake asm
cannam@127	1115 statements:
cannam@127	1116
cannam@127	1117 in FMA:
cannam@127	1118 asm ("# confuse gcc for fma %0" : "=f"(a) : "0"(a));
cannam@127	1119
cannam@127	1120 in FMS:
cannam@127	1121 asm ("# confuse gcc for fms %0" : "=f"(a) : "0"(a));
cannam@127	1122
cannam@127	1123 etc.
cannam@127	1124
cannam@127	1125 After these changes, gcc recalcitrantly generates the fma that was
cannam@127	1126 in the source to begin with. However, the extra asm() cruft
cannam@127	1127 confuses other passes of gcc, notably the instruction scheduler.
cannam@127	1128 (Of course, one could also generate the fma directly via inline
cannam@127	1129 asm, but this confuses the scheduler even more.)
cannam@127	1130
cannam@127	1131 Steven and I have submitted more than one bug report to the gcc
cannam@127	1132 mailing list over the past few years, to no effect. Thus, I give
cannam@127	1133 up. gcc-4.0 can go to hell. I'll wait at least until gcc-4.3 is
cannam@127	1134 out before touching this crap again.
cannam@127	1135 */
cannam@127	1136 static __inline__ E FMA(E a, E b, E c)
cannam@127	1137 {
cannam@127	1138 E x = a * b;
cannam@127	1139 x = x + c;
cannam@127	1140 return x;
cannam@127	1141 }
cannam@127	1142
cannam@127	1143 static __inline__ E FMS(E a, E b, E c)
cannam@127	1144 {
cannam@127	1145 E x = a * b;
cannam@127	1146 x = x - c;
cannam@127	1147 return x;
cannam@127	1148 }
cannam@127	1149
cannam@127	1150 static __inline__ E FNMA(E a, E b, E c)
cannam@127	1151 {
cannam@127	1152 E x = a * b;
cannam@127	1153 x = - (x + c);
cannam@127	1154 return x;
cannam@127	1155 }
cannam@127	1156
cannam@127	1157 static __inline__ E FNMS(E a, E b, E c)
cannam@127	1158 {
cannam@127	1159 E x = a * b;
cannam@127	1160 x = - (x - c);
cannam@127	1161 return x;
cannam@127	1162 }
cannam@127	1163 #else
cannam@127	1164 #define FMA(a, b, c) (((a) * (b)) + (c))
cannam@127	1165 #define FMS(a, b, c) (((a) * (b)) - (c))
cannam@127	1166 #define FNMA(a, b, c) (- (((a) * (b)) + (c)))
cannam@127	1167 #define FNMS(a, b, c) ((c) - ((a) * (b)))
cannam@127	1168 #endif
cannam@127	1169
cannam@127	1170 #ifdef __cplusplus
cannam@127	1171 } /* extern "C" */
cannam@127	1172 #endif /* __cplusplus */
cannam@127	1173
cannam@127	1174 #endif /* __IFFTW_H__ */

Mercurial > hg > sv-dependency-builds

annotate src/fftw-3.3.5/kernel/ifftw.h @ 169:223a55898ab9 tip default