annotate src/fftw-3.3.5/kernel/ifftw.h @ 84:08ae793730bd

Add null config files
author Chris Cannam
date Mon, 02 Mar 2020 14:03:47 +0000
parents 2cd0e3b3e1fd
children
rev   line source
Chris@42 1 /*
Chris@42 2 * Copyright (c) 2003, 2007-14 Matteo Frigo
Chris@42 3 * Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
Chris@42 4 *
Chris@42 5 * This program is free software; you can redistribute it and/or modify
Chris@42 6 * it under the terms of the GNU General Public License as published by
Chris@42 7 * the Free Software Foundation; either version 2 of the License, or
Chris@42 8 * (at your option) any later version.
Chris@42 9 *
Chris@42 10 * This program is distributed in the hope that it will be useful,
Chris@42 11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
Chris@42 12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
Chris@42 13 * GNU General Public License for more details.
Chris@42 14 *
Chris@42 15 * You should have received a copy of the GNU General Public License
Chris@42 16 * along with this program; if not, write to the Free Software
Chris@42 17 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
Chris@42 18 *
Chris@42 19 */
Chris@42 20
Chris@42 21
Chris@42 22 /* FFTW internal header file */
Chris@42 23 #ifndef __IFFTW_H__
Chris@42 24 #define __IFFTW_H__
Chris@42 25
Chris@42 26 #include "config.h"
Chris@42 27
Chris@42 28 #include <stdlib.h> /* size_t */
Chris@42 29 #include <stdarg.h> /* va_list */
Chris@42 30 #include <stddef.h> /* ptrdiff_t */
Chris@42 31 #include <limits.h> /* INT_MAX */
Chris@42 32
Chris@42 33 #if HAVE_SYS_TYPES_H
Chris@42 34 # include <sys/types.h>
Chris@42 35 #endif
Chris@42 36
Chris@42 37 #if HAVE_STDINT_H
Chris@42 38 # include <stdint.h> /* uintptr_t, maybe */
Chris@42 39 #endif
Chris@42 40
Chris@42 41 #if HAVE_INTTYPES_H
Chris@42 42 # include <inttypes.h> /* uintptr_t, maybe */
Chris@42 43 #endif
Chris@42 44
Chris@42 45 #ifdef __cplusplus
Chris@42 46 extern "C"
Chris@42 47 {
Chris@42 48 #endif /* __cplusplus */
Chris@42 49
Chris@42 50 /* Windows annoyances -- since tests/hook.c uses some internal
Chris@42 51 FFTW functions, we need to give them the dllexport attribute
Chris@42 52 under Windows when compiling as a DLL (see api/fftw3.h). */
Chris@42 53 #if defined(FFTW_EXTERN)
Chris@42 54 # define IFFTW_EXTERN FFTW_EXTERN
Chris@42 55 #elif (defined(FFTW_DLL) || defined(DLL_EXPORT)) \
Chris@42 56 && (defined(_WIN32) || defined(__WIN32__))
Chris@42 57 # define IFFTW_EXTERN extern __declspec(dllexport)
Chris@42 58 #else
Chris@42 59 # define IFFTW_EXTERN extern
Chris@42 60 #endif
Chris@42 61
Chris@42 62 /* determine precision and name-mangling scheme */
Chris@42 63 #define CONCAT(prefix, name) prefix ## name
Chris@42 64 #if defined(FFTW_SINGLE)
Chris@42 65 typedef float R;
Chris@42 66 # define X(name) CONCAT(fftwf_, name)
Chris@42 67 #elif defined(FFTW_LDOUBLE)
Chris@42 68 typedef long double R;
Chris@42 69 # define X(name) CONCAT(fftwl_, name)
Chris@42 70 # define TRIGREAL_IS_LONG_DOUBLE
Chris@42 71 #elif defined(FFTW_QUAD)
Chris@42 72 typedef __float128 R;
Chris@42 73 # define X(name) CONCAT(fftwq_, name)
Chris@42 74 # define TRIGREAL_IS_QUAD
Chris@42 75 #else
Chris@42 76 typedef double R;
Chris@42 77 # define X(name) CONCAT(fftw_, name)
Chris@42 78 #endif
Chris@42 79
Chris@42 80 /*
Chris@42 81 integral type large enough to contain a stride (what ``int'' should
Chris@42 82 have been in the first place).
Chris@42 83 */
Chris@42 84 typedef ptrdiff_t INT;
Chris@42 85
/* Dummy use of an otherwise-unused parameter to silence compiler
   warnings.  The argument is parenthesized so that a compound
   expression such as UNUSED(a + b) is discarded as a whole instead of
   expanding to (void)a + b. */
#define UNUSED(x) (void)(x)
Chris@42 88
Chris@42 89 #define NELEM(array) ((sizeof(array) / sizeof((array)[0])))
Chris@42 90
Chris@42 91 #define FFT_SIGN (-1) /* sign convention for forward transforms */
Chris@42 92 extern void X(extract_reim)(int sign, R *c, R **r, R **i);
Chris@42 93
Chris@42 94 #define REGISTER_SOLVER(p, s) X(solver_register)(p, s)
Chris@42 95
Chris@42 96 #define STRINGIZEx(x) #x
Chris@42 97 #define STRINGIZE(x) STRINGIZEx(x)
Chris@42 98 #define CIMPLIES(ante, post) (!(ante) || (post))
Chris@42 99
Chris@42 100 /* define HAVE_SIMD if any simd extensions are supported */
Chris@42 101 #if defined(HAVE_SSE) || defined(HAVE_SSE2) || \
Chris@42 102 defined(HAVE_AVX) || defined(HAVE_AVX_128_FMA) || \
Chris@42 103 defined(HAVE_AVX2) || defined(HAVE_AVX512) || \
Chris@42 104 defined(HAVE_KCVI) || \
Chris@42 105 defined(HAVE_ALTIVEC) || defined(HAVE_VSX) || \
Chris@42 106 defined(HAVE_MIPS_PS) || \
Chris@42 107 defined(HAVE_GENERIC_SIMD128) || defined(HAVE_GENERIC_SIMD256)
Chris@42 108 #define HAVE_SIMD 1
Chris@42 109 #else
Chris@42 110 #define HAVE_SIMD 0
Chris@42 111 #endif
Chris@42 112
Chris@42 113 extern int X(have_simd_sse2)(void);
Chris@42 114 extern int X(have_simd_avx)(void);
Chris@42 115 extern int X(have_simd_avx_128_fma)(void);
Chris@42 116 extern int X(have_simd_avx2)(void);
Chris@42 117 extern int X(have_simd_avx2_128)(void);
Chris@42 118 extern int X(have_simd_avx512)(void);
Chris@42 119 extern int X(have_simd_altivec)(void);
Chris@42 120 extern int X(have_simd_vsx)(void);
Chris@42 121 extern int X(have_simd_neon)(void);
Chris@42 122
Chris@42 123 /* forward declarations */
Chris@42 124 typedef struct problem_s problem;
Chris@42 125 typedef struct plan_s plan;
Chris@42 126 typedef struct solver_s solver;
Chris@42 127 typedef struct planner_s planner;
Chris@42 128 typedef struct printer_s printer;
Chris@42 129 typedef struct scanner_s scanner;
Chris@42 130
Chris@42 131 /*-----------------------------------------------------------------------*/
Chris@42 132 /* alloca: */
Chris@42 133 #if HAVE_SIMD
Chris@42 134 # if defined(HAVE_KCVI) || defined(HAVE_AVX512)
Chris@42 135 # define MIN_ALIGNMENT 64
Chris@42 136 # elif defined(HAVE_AVX) || defined(HAVE_AVX2) || defined(HAVE_GENERIC_SIMD256)
Chris@42 137 # define MIN_ALIGNMENT 32 /* best alignment for AVX, conservative for
Chris@42 138 * everything else */
Chris@42 139 # else
Chris@42 140 /* Note that we cannot use 32-byte alignment for all SIMD. For
Chris@42 141 example, MacOS X malloc is 16-byte aligned, but there was no
Chris@42 142 posix_memalign in MacOS X until version 10.6. */
Chris@42 143 # define MIN_ALIGNMENT 16
Chris@42 144 # endif
Chris@42 145 #endif
Chris@42 146
Chris@42 147 #if defined(HAVE_ALLOCA) && defined(FFTW_ENABLE_ALLOCA)
Chris@42 148 /* use alloca if available */
Chris@42 149
Chris@42 150 #ifndef alloca
Chris@42 151 #ifdef __GNUC__
Chris@42 152 # define alloca __builtin_alloca
Chris@42 153 #else
Chris@42 154 # ifdef _MSC_VER
Chris@42 155 # include <malloc.h>
Chris@42 156 # define alloca _alloca
Chris@42 157 # else
Chris@42 158 # if HAVE_ALLOCA_H
Chris@42 159 # include <alloca.h>
Chris@42 160 # else
Chris@42 161 # ifdef _AIX
Chris@42 162 #pragma alloca
Chris@42 163 # else
Chris@42 164 # ifndef alloca /* predefined by HP cc +Olibcalls */
Chris@42 165 void *alloca(size_t);
Chris@42 166 # endif
Chris@42 167 # endif
Chris@42 168 # endif
Chris@42 169 # endif
Chris@42 170 #endif
Chris@42 171 #endif
Chris@42 172
Chris@42 173 # ifdef MIN_ALIGNMENT
Chris@42 174 # define STACK_MALLOC(T, p, n) \
Chris@42 175 { \
Chris@42 176 p = (T)alloca((n) + MIN_ALIGNMENT); \
Chris@42 177 p = (T)(((uintptr_t)p + (MIN_ALIGNMENT - 1)) & \
Chris@42 178 (~(uintptr_t)(MIN_ALIGNMENT - 1))); \
Chris@42 179 }
Chris@42 180 # define STACK_FREE(n)
Chris@42 181 # else /* HAVE_ALLOCA && !defined(MIN_ALIGNMENT) */
Chris@42 182 # define STACK_MALLOC(T, p, n) p = (T)alloca(n)
Chris@42 183 # define STACK_FREE(n)
Chris@42 184 # endif
Chris@42 185
Chris@42 186 #else /* ! HAVE_ALLOCA */
Chris@42 187 /* use malloc instead of alloca */
Chris@42 188 # define STACK_MALLOC(T, p, n) p = (T)MALLOC(n, OTHER)
Chris@42 189 # define STACK_FREE(n) X(ifree)(n)
Chris@42 190 #endif /* ! HAVE_ALLOCA */
Chris@42 191
/* Allocation of temporary buffers.  Requests smaller than
   MAX_STACK_ALLOC bytes go through STACK_MALLOC (hopefully reducing
   to alloca()); larger requests use MALLOC. */

/* threshold, in bytes, below which STACK_MALLOC is used (64 KiB) */
#define MAX_STACK_ALLOC ((size_t)64 * 1024)

/* Allocate N bytes and store the result, cast to type T, in P.
   N is parenthesized in the size test so that an expression argument
   (e.g. a conditional expression) is compared as a whole.  N appears
   more than once in the expansion, so it must not have side effects. */
#define BUF_ALLOC(T, p, n)                      \
{                                               \
     if ((n) < MAX_STACK_ALLOC) {               \
          STACK_MALLOC(T, p, n);                \
     } else {                                   \
          p = (T)MALLOC(n, BUFFERS);            \
     }                                          \
}

/* Release a buffer obtained with BUF_ALLOC(T, p, n).  N must be the
   size that was passed to BUF_ALLOC, because it selects which of
   STACK_FREE / X(ifree) is used. */
#define BUF_FREE(p, n)                          \
{                                               \
     if ((n) < MAX_STACK_ALLOC) {               \
          STACK_FREE(p);                        \
     } else {                                   \
          X(ifree)(p);                          \
     }                                          \
}
Chris@42 215
Chris@42 216 /*-----------------------------------------------------------------------*/
Chris@42 217 /* define uintptr_t if it is not already defined */
Chris@42 218
Chris@42 219 #ifndef HAVE_UINTPTR_T
Chris@42 220 # if SIZEOF_VOID_P == 0
Chris@42 221 # error sizeof void* is unknown!
Chris@42 222 # elif SIZEOF_UNSIGNED_INT == SIZEOF_VOID_P
Chris@42 223 typedef unsigned int uintptr_t;
Chris@42 224 # elif SIZEOF_UNSIGNED_LONG == SIZEOF_VOID_P
Chris@42 225 typedef unsigned long uintptr_t;
Chris@42 226 # elif SIZEOF_UNSIGNED_LONG_LONG == SIZEOF_VOID_P
Chris@42 227 typedef unsigned long long uintptr_t;
Chris@42 228 # else
Chris@42 229 # error no unsigned integer type matches void* sizeof!
Chris@42 230 # endif
Chris@42 231 #endif
Chris@42 232
Chris@42 233 /*-----------------------------------------------------------------------*/
/* We can do an optimization for copying pairs of (aligned) floats
   when in single precision if 2*float = double. */

/* FFTW_2R_IS_DOUBLE is evaluated here, once, to a plain 0 or 1.
   A macro whose expansion contains `defined' has undefined behavior
   when it is used in an #if, and it cannot be used in an ordinary C
   expression at all; a literal 0/1 works in both contexts. */
#if defined(FFTW_SINGLE) \
    && SIZEOF_FLOAT != 0 \
    && SIZEOF_DOUBLE == 2*SIZEOF_FLOAT
# define FFTW_2R_IS_DOUBLE 1
#else
# define FFTW_2R_IS_DOUBLE 0
#endif
Chris@42 240
Chris@42 241 #define DOUBLE_ALIGNED(p) ((((uintptr_t)(p)) % sizeof(double)) == 0)
Chris@42 242
Chris@42 243 /*-----------------------------------------------------------------------*/
Chris@42 244 /* assert.c: */
Chris@42 245 IFFTW_EXTERN void X(assertion_failed)(const char *s,
Chris@42 246 int line, const char *file);
Chris@42 247
Chris@42 248 /* always check */
Chris@42 249 #define CK(ex) \
Chris@42 250 (void)((ex) || (X(assertion_failed)(#ex, __LINE__, __FILE__), 0))
Chris@42 251
Chris@42 252 #ifdef FFTW_DEBUG
Chris@42 253 /* check only if debug enabled */
Chris@42 254 #define A(ex) \
Chris@42 255 (void)((ex) || (X(assertion_failed)(#ex, __LINE__, __FILE__), 0))
Chris@42 256 #else
Chris@42 257 #define A(ex) /* nothing */
Chris@42 258 #endif
Chris@42 259
Chris@42 260 extern void X(debug)(const char *format, ...);
Chris@42 261 #define D X(debug)
Chris@42 262
Chris@42 263 /*-----------------------------------------------------------------------*/
Chris@42 264 /* kalloc.c: */
Chris@42 265 extern void *X(kernel_malloc)(size_t n);
Chris@42 266 extern void X(kernel_free)(void *p);
Chris@42 267
Chris@42 268 /*-----------------------------------------------------------------------*/
Chris@42 269 /* alloc.c: */
Chris@42 270
Chris@42 271 /* objects allocated by malloc, for statistical purposes */
Chris@42 272 enum malloc_tag {
Chris@42 273 EVERYTHING,
Chris@42 274 PLANS,
Chris@42 275 SOLVERS,
Chris@42 276 PROBLEMS,
Chris@42 277 BUFFERS,
Chris@42 278 HASHT,
Chris@42 279 TENSORS,
Chris@42 280 PLANNERS,
Chris@42 281 SLVDESCS,
Chris@42 282 TWIDDLES,
Chris@42 283 STRIDES,
Chris@42 284 OTHER,
Chris@42 285 MALLOC_WHAT_LAST /* must be last */
Chris@42 286 };
Chris@42 287
Chris@42 288 IFFTW_EXTERN void X(ifree)(void *ptr);
Chris@42 289 extern void X(ifree0)(void *ptr);
Chris@42 290
Chris@42 291 #ifdef FFTW_DEBUG_MALLOC
Chris@42 292
Chris@42 293 IFFTW_EXTERN void *X(malloc_debug)(size_t n, enum malloc_tag what,
Chris@42 294 const char *file, int line);
Chris@42 295 #define MALLOC(n, what) X(malloc_debug)(n, what, __FILE__, __LINE__)
Chris@42 296 IFFTW_EXTERN void X(malloc_print_minfo)(int vrbose);
Chris@42 297
Chris@42 298 #else /* ! FFTW_DEBUG_MALLOC */
Chris@42 299
Chris@42 300 IFFTW_EXTERN void *X(malloc_plain)(size_t sz);
Chris@42 301 #define MALLOC(n, what) X(malloc_plain)(n)
Chris@42 302
Chris@42 303 #endif
Chris@42 304
Chris@42 305 #if defined(FFTW_DEBUG) && defined(FFTW_DEBUG_MALLOC) && (defined(HAVE_THREADS) || defined(HAVE_OPENMP))
Chris@42 306 extern int X(in_thread);
Chris@42 307 # define IN_THREAD X(in_thread)
Chris@42 308 # define THREAD_ON { int in_thread_save = X(in_thread); X(in_thread) = 1
Chris@42 309 # define THREAD_OFF X(in_thread) = in_thread_save; }
Chris@42 310 #else
Chris@42 311 # define IN_THREAD 0
Chris@42 312 # define THREAD_ON
Chris@42 313 # define THREAD_OFF
Chris@42 314 #endif
Chris@42 315
Chris@42 316 /*-----------------------------------------------------------------------*/
Chris@42 317 /* low-resolution clock */
Chris@42 318
Chris@42 319 #ifdef FAKE_CRUDE_TIME
Chris@42 320 typedef int crude_time;
Chris@42 321 #else
Chris@42 322 # if TIME_WITH_SYS_TIME
Chris@42 323 # include <sys/time.h>
Chris@42 324 # include <time.h>
Chris@42 325 # else
Chris@42 326 # if HAVE_SYS_TIME_H
Chris@42 327 # include <sys/time.h>
Chris@42 328 # else
Chris@42 329 # include <time.h>
Chris@42 330 # endif
Chris@42 331 # endif
Chris@42 332
Chris@42 333 # ifdef HAVE_BSDGETTIMEOFDAY
Chris@42 334 # ifndef HAVE_GETTIMEOFDAY
Chris@42 335 # define gettimeofday BSDgettimeofday
Chris@42 336 # define HAVE_GETTIMEOFDAY 1
Chris@42 337 # endif
Chris@42 338 # endif
Chris@42 339
Chris@42 340 # if defined(HAVE_GETTIMEOFDAY)
Chris@42 341 typedef struct timeval crude_time;
Chris@42 342 # else
Chris@42 343 typedef clock_t crude_time;
Chris@42 344 # endif
Chris@42 345 #endif /* else FAKE_CRUDE_TIME */
Chris@42 346
Chris@42 347 crude_time X(get_crude_time)(void);
Chris@42 348 double X(elapsed_since)(const planner *plnr, const problem *p,
Chris@42 349 crude_time t0); /* time in seconds since t0 */
Chris@42 350
Chris@42 351 /*-----------------------------------------------------------------------*/
Chris@42 352 /* ops.c: */
Chris@42 353 /*
Chris@42 354 * ops counter. The total number of additions is add + fma
Chris@42 355 * and the total number of multiplications is mul + fma.
Chris@42 356 * Total flops = add + mul + 2 * fma
Chris@42 357 */
Chris@42 358 typedef struct {
Chris@42 359 double add;
Chris@42 360 double mul;
Chris@42 361 double fma;
Chris@42 362 double other;
Chris@42 363 } opcnt;
Chris@42 364
Chris@42 365 void X(ops_zero)(opcnt *dst);
Chris@42 366 void X(ops_other)(INT o, opcnt *dst);
Chris@42 367 void X(ops_cpy)(const opcnt *src, opcnt *dst);
Chris@42 368
Chris@42 369 void X(ops_add)(const opcnt *a, const opcnt *b, opcnt *dst);
Chris@42 370 void X(ops_add2)(const opcnt *a, opcnt *dst);
Chris@42 371
Chris@42 372 /* dst = m * a + b */
Chris@42 373 void X(ops_madd)(INT m, const opcnt *a, const opcnt *b, opcnt *dst);
Chris@42 374
Chris@42 375 /* dst += m * a */
Chris@42 376 void X(ops_madd2)(INT m, const opcnt *a, opcnt *dst);
Chris@42 377
Chris@42 378
Chris@42 379 /*-----------------------------------------------------------------------*/
Chris@42 380 /* minmax.c: */
Chris@42 381 INT X(imax)(INT a, INT b);
Chris@42 382 INT X(imin)(INT a, INT b);
Chris@42 383
Chris@42 384 /*-----------------------------------------------------------------------*/
Chris@42 385 /* iabs.c: */
Chris@42 386 INT X(iabs)(INT a);
Chris@42 387
Chris@42 388 /* inline version */
Chris@42 389 #define IABS(x) (((x) < 0) ? (0 - (x)) : (x))
Chris@42 390
Chris@42 391 /*-----------------------------------------------------------------------*/
Chris@42 392 /* md5.c */
Chris@42 393
Chris@42 394 #if SIZEOF_UNSIGNED_INT >= 4
Chris@42 395 typedef unsigned int md5uint;
Chris@42 396 #else
Chris@42 397 typedef unsigned long md5uint; /* at least 32 bits as per C standard */
Chris@42 398 #endif
Chris@42 399
Chris@42 400 typedef md5uint md5sig[4];
Chris@42 401
Chris@42 402 typedef struct {
Chris@42 403 md5sig s; /* state and signature */
Chris@42 404
Chris@42 405 /* fields not meant to be used outside md5.c: */
Chris@42 406 unsigned char c[64]; /* stuff not yet processed */
Chris@42 407 unsigned l; /* total length. Should be 64 bits long, but this is
Chris@42 408 good enough for us */
Chris@42 409 } md5;
Chris@42 410
Chris@42 411 void X(md5begin)(md5 *p);
Chris@42 412 void X(md5putb)(md5 *p, const void *d_, size_t len);
Chris@42 413 void X(md5puts)(md5 *p, const char *s);
Chris@42 414 void X(md5putc)(md5 *p, unsigned char c);
Chris@42 415 void X(md5int)(md5 *p, int i);
Chris@42 416 void X(md5INT)(md5 *p, INT i);
Chris@42 417 void X(md5unsigned)(md5 *p, unsigned i);
Chris@42 418 void X(md5end)(md5 *p);
Chris@42 419
Chris@42 420 /*-----------------------------------------------------------------------*/
Chris@42 421 /* tensor.c: */
Chris@42 422 #define STRUCT_HACK_KR
Chris@42 423 #undef STRUCT_HACK_C99
Chris@42 424
Chris@42 425 typedef struct {
Chris@42 426 INT n;
Chris@42 427 INT is; /* input stride */
Chris@42 428 INT os; /* output stride */
Chris@42 429 } iodim;
Chris@42 430
Chris@42 431 typedef struct {
Chris@42 432 int rnk;
Chris@42 433 #if defined(STRUCT_HACK_KR)
Chris@42 434 iodim dims[1];
Chris@42 435 #elif defined(STRUCT_HACK_C99)
Chris@42 436 iodim dims[];
Chris@42 437 #else
Chris@42 438 iodim *dims;
Chris@42 439 #endif
Chris@42 440 } tensor;
Chris@42 441
Chris@42 442 /*
Chris@42 443 Definition of rank -infinity.
Chris@42 444 This definition has the property that if you want rank 0 or 1,
Chris@42 445 you can simply test for rank <= 1. This is a common case.
Chris@42 446
Chris@42 447 A tensor of rank -infinity has size 0.
Chris@42 448 */
Chris@42 449 #define RNK_MINFTY INT_MAX
Chris@42 450 #define FINITE_RNK(rnk) ((rnk) != RNK_MINFTY)
Chris@42 451
Chris@42 452 typedef enum { INPLACE_IS, INPLACE_OS } inplace_kind;
Chris@42 453
Chris@42 454 tensor *X(mktensor)(int rnk);
Chris@42 455 tensor *X(mktensor_0d)(void);
Chris@42 456 tensor *X(mktensor_1d)(INT n, INT is, INT os);
Chris@42 457 tensor *X(mktensor_2d)(INT n0, INT is0, INT os0,
Chris@42 458 INT n1, INT is1, INT os1);
Chris@42 459 tensor *X(mktensor_3d)(INT n0, INT is0, INT os0,
Chris@42 460 INT n1, INT is1, INT os1,
Chris@42 461 INT n2, INT is2, INT os2);
Chris@42 462 tensor *X(mktensor_4d)(INT n0, INT is0, INT os0,
Chris@42 463 INT n1, INT is1, INT os1,
Chris@42 464 INT n2, INT is2, INT os2,
Chris@42 465 INT n3, INT is3, INT os3);
Chris@42 466 tensor *X(mktensor_5d)(INT n0, INT is0, INT os0,
Chris@42 467 INT n1, INT is1, INT os1,
Chris@42 468 INT n2, INT is2, INT os2,
Chris@42 469 INT n3, INT is3, INT os3,
Chris@42 470 INT n4, INT is4, INT os4);
Chris@42 471 INT X(tensor_sz)(const tensor *sz);
Chris@42 472 void X(tensor_md5)(md5 *p, const tensor *t);
Chris@42 473 INT X(tensor_max_index)(const tensor *sz);
Chris@42 474 INT X(tensor_min_istride)(const tensor *sz);
Chris@42 475 INT X(tensor_min_ostride)(const tensor *sz);
Chris@42 476 INT X(tensor_min_stride)(const tensor *sz);
Chris@42 477 int X(tensor_inplace_strides)(const tensor *sz);
Chris@42 478 int X(tensor_inplace_strides2)(const tensor *a, const tensor *b);
Chris@42 479 int X(tensor_strides_decrease)(const tensor *sz, const tensor *vecsz,
Chris@42 480 inplace_kind k);
Chris@42 481 tensor *X(tensor_copy)(const tensor *sz);
Chris@42 482 int X(tensor_kosherp)(const tensor *x);
Chris@42 483
Chris@42 484 tensor *X(tensor_copy_inplace)(const tensor *sz, inplace_kind k);
Chris@42 485 tensor *X(tensor_copy_except)(const tensor *sz, int except_dim);
Chris@42 486 tensor *X(tensor_copy_sub)(const tensor *sz, int start_dim, int rnk);
Chris@42 487 tensor *X(tensor_compress)(const tensor *sz);
Chris@42 488 tensor *X(tensor_compress_contiguous)(const tensor *sz);
Chris@42 489 tensor *X(tensor_append)(const tensor *a, const tensor *b);
Chris@42 490 void X(tensor_split)(const tensor *sz, tensor **a, int a_rnk, tensor **b);
Chris@42 491 int X(tensor_tornk1)(const tensor *t, INT *n, INT *is, INT *os);
Chris@42 492 void X(tensor_destroy)(tensor *sz);
Chris@42 493 void X(tensor_destroy2)(tensor *a, tensor *b);
Chris@42 494 void X(tensor_destroy4)(tensor *a, tensor *b, tensor *c, tensor *d);
Chris@42 495 void X(tensor_print)(const tensor *sz, printer *p);
Chris@42 496 int X(dimcmp)(const iodim *a, const iodim *b);
Chris@42 497 int X(tensor_equal)(const tensor *a, const tensor *b);
Chris@42 498 int X(tensor_inplace_locations)(const tensor *sz, const tensor *vecsz);
Chris@42 499
Chris@42 500 /*-----------------------------------------------------------------------*/
Chris@42 501 /* problem.c: */
Chris@42 502 enum {
Chris@42 503 /* a problem that cannot be solved */
Chris@42 504 PROBLEM_UNSOLVABLE,
Chris@42 505
Chris@42 506 PROBLEM_DFT,
Chris@42 507 PROBLEM_RDFT,
Chris@42 508 PROBLEM_RDFT2,
Chris@42 509
Chris@42 510 /* for mpi/ subdirectory */
Chris@42 511 PROBLEM_MPI_DFT,
Chris@42 512 PROBLEM_MPI_RDFT,
Chris@42 513 PROBLEM_MPI_RDFT2,
Chris@42 514 PROBLEM_MPI_TRANSPOSE,
Chris@42 515
Chris@42 516 PROBLEM_LAST
Chris@42 517 };
Chris@42 518
Chris@42 519 typedef struct {
Chris@42 520 int problem_kind;
Chris@42 521 void (*hash) (const problem *ego, md5 *p);
Chris@42 522 void (*zero) (const problem *ego);
Chris@42 523 void (*print) (const problem *ego, printer *p);
Chris@42 524 void (*destroy) (problem *ego);
Chris@42 525 } problem_adt;
Chris@42 526
Chris@42 527 struct problem_s {
Chris@42 528 const problem_adt *adt;
Chris@42 529 };
Chris@42 530
Chris@42 531 problem *X(mkproblem)(size_t sz, const problem_adt *adt);
Chris@42 532 void X(problem_destroy)(problem *ego);
Chris@42 533 problem *X(mkproblem_unsolvable)(void);
Chris@42 534
Chris@42 535 /*-----------------------------------------------------------------------*/
Chris@42 536 /* print.c */
Chris@42 537 struct printer_s {
Chris@42 538 void (*print)(printer *p, const char *format, ...);
Chris@42 539 void (*vprint)(printer *p, const char *format, va_list ap);
Chris@42 540 void (*putchr)(printer *p, char c);
Chris@42 541 void (*cleanup)(printer *p);
Chris@42 542 int indent;
Chris@42 543 int indent_incr;
Chris@42 544 };
Chris@42 545
Chris@42 546 printer *X(mkprinter)(size_t size,
Chris@42 547 void (*putchr)(printer *p, char c),
Chris@42 548 void (*cleanup)(printer *p));
Chris@42 549 IFFTW_EXTERN void X(printer_destroy)(printer *p);
Chris@42 550
Chris@42 551 /*-----------------------------------------------------------------------*/
Chris@42 552 /* scan.c */
Chris@42 553 struct scanner_s {
Chris@42 554 int (*scan)(scanner *sc, const char *format, ...);
Chris@42 555 int (*vscan)(scanner *sc, const char *format, va_list ap);
Chris@42 556 int (*getchr)(scanner *sc);
Chris@42 557 int ungotc;
Chris@42 558 };
Chris@42 559
Chris@42 560 scanner *X(mkscanner)(size_t size, int (*getchr)(scanner *sc));
Chris@42 561 void X(scanner_destroy)(scanner *sc);
Chris@42 562
Chris@42 563 /*-----------------------------------------------------------------------*/
Chris@42 564 /* plan.c: */
Chris@42 565
Chris@42 566 enum wakefulness {
Chris@42 567 SLEEPY,
Chris@42 568 AWAKE_ZERO,
Chris@42 569 AWAKE_SQRTN_TABLE,
Chris@42 570 AWAKE_SINCOS
Chris@42 571 };
Chris@42 572
Chris@42 573 typedef struct {
Chris@42 574 void (*solve)(const plan *ego, const problem *p);
Chris@42 575 void (*awake)(plan *ego, enum wakefulness wakefulness);
Chris@42 576 void (*print)(const plan *ego, printer *p);
Chris@42 577 void (*destroy)(plan *ego);
Chris@42 578 } plan_adt;
Chris@42 579
Chris@42 580 struct plan_s {
Chris@42 581 const plan_adt *adt;
Chris@42 582 opcnt ops;
Chris@42 583 double pcost;
Chris@42 584 enum wakefulness wakefulness; /* used for debugging only */
Chris@42 585 int could_prune_now_p;
Chris@42 586 };
Chris@42 587
Chris@42 588 plan *X(mkplan)(size_t size, const plan_adt *adt);
Chris@42 589 void X(plan_destroy_internal)(plan *ego);
Chris@42 590 IFFTW_EXTERN void X(plan_awake)(plan *ego, enum wakefulness wakefulness);
Chris@42 591 void X(plan_null_destroy)(plan *ego);
Chris@42 592
Chris@42 593 /*-----------------------------------------------------------------------*/
Chris@42 594 /* solver.c: */
Chris@42 595 typedef struct {
Chris@42 596 int problem_kind;
Chris@42 597 plan *(*mkplan)(const solver *ego, const problem *p, planner *plnr);
Chris@42 598 void (*destroy)(solver *ego);
Chris@42 599 } solver_adt;
Chris@42 600
Chris@42 601 struct solver_s {
Chris@42 602 const solver_adt *adt;
Chris@42 603 int refcnt;
Chris@42 604 };
Chris@42 605
Chris@42 606 solver *X(mksolver)(size_t size, const solver_adt *adt);
Chris@42 607 void X(solver_use)(solver *ego);
Chris@42 608 void X(solver_destroy)(solver *ego);
Chris@42 609 void X(solver_register)(planner *plnr, solver *s);
Chris@42 610
Chris@42 611 /* shorthand */
Chris@42 612 #define MKSOLVER(type, adt) (type *)X(mksolver)(sizeof(type), adt)
Chris@42 613
Chris@42 614 /*-----------------------------------------------------------------------*/
Chris@42 615 /* planner.c */
Chris@42 616
Chris@42 617 typedef struct slvdesc_s {
Chris@42 618 solver *slv;
Chris@42 619 const char *reg_nam;
Chris@42 620 unsigned nam_hash;
Chris@42 621 int reg_id;
Chris@42 622 int next_for_same_problem_kind;
Chris@42 623 } slvdesc;
Chris@42 624
Chris@42 625 typedef struct solution_s solution; /* opaque */
Chris@42 626
Chris@42 627 /* interpretation of L and U:
Chris@42 628
Chris@42 629 - if it returns a plan, the planner guarantees that all applicable
Chris@42 630 plans at least as impatient as U have been tried, and that each
Chris@42 631 plan in the solution is at least as impatient as L.
Chris@42 632
Chris@42 633 - if it returns 0, the planner guarantees to have tried all solvers
Chris@42 634 at least as impatient as L, and that none of them was applicable.
Chris@42 635
Chris@42 636 The structure is packed to fit into 64 bits.
Chris@42 637 */
Chris@42 638
Chris@42 639 typedef struct {
Chris@42 640 unsigned l:20;
Chris@42 641 unsigned hash_info:3;
Chris@42 642 # define BITS_FOR_TIMELIMIT 9
Chris@42 643 unsigned timelimit_impatience:BITS_FOR_TIMELIMIT;
Chris@42 644 unsigned u:20;
Chris@42 645
Chris@42 646 /* abstraction break: we store the solver here to pad the
Chris@42 647 structure to 64 bits. Otherwise, the struct is padded to 64
Chris@42 648 bits anyway, and another word is allocated for slvndx. */
Chris@42 649 # define BITS_FOR_SLVNDX 12
Chris@42 650 unsigned slvndx:BITS_FOR_SLVNDX;
Chris@42 651 } flags_t;
Chris@42 652
Chris@42 653 /* impatience flags */
Chris@42 654 enum {
Chris@42 655 BELIEVE_PCOST = 0x0001,
Chris@42 656 ESTIMATE = 0x0002,
Chris@42 657 NO_DFT_R2HC = 0x0004,
Chris@42 658 NO_SLOW = 0x0008,
Chris@42 659 NO_VRECURSE = 0x0010,
Chris@42 660 NO_INDIRECT_OP = 0x0020,
Chris@42 661 NO_LARGE_GENERIC = 0x0040,
Chris@42 662 NO_RANK_SPLITS = 0x0080,
Chris@42 663 NO_VRANK_SPLITS = 0x0100,
Chris@42 664 NO_NONTHREADED = 0x0200,
Chris@42 665 NO_BUFFERING = 0x0400,
Chris@42 666 NO_FIXED_RADIX_LARGE_N = 0x0800,
Chris@42 667 NO_DESTROY_INPUT = 0x1000,
Chris@42 668 NO_SIMD = 0x2000,
Chris@42 669 CONSERVE_MEMORY = 0x4000,
Chris@42 670 NO_DHT_R2HC = 0x8000,
Chris@42 671 NO_UGLY = 0x10000,
Chris@42 672 ALLOW_PRUNING = 0x20000
Chris@42 673 };
Chris@42 674
Chris@42 675 /* hashtable information */
Chris@42 676 enum {
Chris@42 677 BLESSING = 0x1u, /* save this entry */
Chris@42 678 H_VALID = 0x2u, /* valid hashtable entry */
Chris@42 679 H_LIVE = 0x4u /* entry is nonempty, implies H_VALID */
Chris@42 680 };
Chris@42 681
Chris@42 682 #define PLNR_L(plnr) ((plnr)->flags.l)
Chris@42 683 #define PLNR_U(plnr) ((plnr)->flags.u)
Chris@42 684 #define PLNR_TIMELIMIT_IMPATIENCE(plnr) ((plnr)->flags.timelimit_impatience)
Chris@42 685
Chris@42 686 #define ESTIMATEP(plnr) (PLNR_U(plnr) & ESTIMATE)
Chris@42 687 #define BELIEVE_PCOSTP(plnr) (PLNR_U(plnr) & BELIEVE_PCOST)
Chris@42 688 #define ALLOW_PRUNINGP(plnr) (PLNR_U(plnr) & ALLOW_PRUNING)
Chris@42 689
Chris@42 690 #define NO_INDIRECT_OP_P(plnr) (PLNR_L(plnr) & NO_INDIRECT_OP)
Chris@42 691 #define NO_LARGE_GENERICP(plnr) (PLNR_L(plnr) & NO_LARGE_GENERIC)
Chris@42 692 #define NO_RANK_SPLITSP(plnr) (PLNR_L(plnr) & NO_RANK_SPLITS)
Chris@42 693 #define NO_VRANK_SPLITSP(plnr) (PLNR_L(plnr) & NO_VRANK_SPLITS)
Chris@42 694 #define NO_VRECURSEP(plnr) (PLNR_L(plnr) & NO_VRECURSE)
Chris@42 695 #define NO_DFT_R2HCP(plnr) (PLNR_L(plnr) & NO_DFT_R2HC)
Chris@42 696 #define NO_SLOWP(plnr) (PLNR_L(plnr) & NO_SLOW)
Chris@42 697 #define NO_UGLYP(plnr) (PLNR_L(plnr) & NO_UGLY)
Chris@42 698 #define NO_FIXED_RADIX_LARGE_NP(plnr) \
Chris@42 699 (PLNR_L(plnr) & NO_FIXED_RADIX_LARGE_N)
Chris@42 700 #define NO_NONTHREADEDP(plnr) \
Chris@42 701 ((PLNR_L(plnr) & NO_NONTHREADED) && (plnr)->nthr > 1)
Chris@42 702
Chris@42 703 #define NO_DESTROY_INPUTP(plnr) (PLNR_L(plnr) & NO_DESTROY_INPUT)
Chris@42 704 #define NO_SIMDP(plnr) (PLNR_L(plnr) & NO_SIMD)
Chris@42 705 #define CONSERVE_MEMORYP(plnr) (PLNR_L(plnr) & CONSERVE_MEMORY)
Chris@42 706 #define NO_DHT_R2HCP(plnr) (PLNR_L(plnr) & NO_DHT_R2HC)
Chris@42 707 #define NO_BUFFERINGP(plnr) (PLNR_L(plnr) & NO_BUFFERING)
Chris@42 708
Chris@42 709 typedef enum { FORGET_ACCURSED, FORGET_EVERYTHING } amnesia;
Chris@42 710
Chris@42 711 typedef enum {
Chris@42 712 /* WISDOM_NORMAL: planner may or may not use wisdom */
Chris@42 713 WISDOM_NORMAL,
Chris@42 714
Chris@42 715 /* WISDOM_ONLY: planner must use wisdom and must avoid searching */
Chris@42 716 WISDOM_ONLY,
Chris@42 717
Chris@42 718 /* WISDOM_IS_BOGUS: planner must return 0 as quickly as possible */
Chris@42 719 WISDOM_IS_BOGUS,
Chris@42 720
Chris@42 721 /* WISDOM_IGNORE_INFEASIBLE: planner ignores infeasible wisdom */
Chris@42 722 WISDOM_IGNORE_INFEASIBLE,
Chris@42 723
Chris@42 724 /* WISDOM_IGNORE_ALL: planner ignores all */
Chris@42 725 WISDOM_IGNORE_ALL
Chris@42 726 } wisdom_state_t;
Chris@42 727
Chris@42 728 typedef struct {
Chris@42 729 void (*register_solver)(planner *ego, solver *s);
Chris@42 730 plan *(*mkplan)(planner *ego, const problem *p);
Chris@42 731 void (*forget)(planner *ego, amnesia a);
Chris@42 732 void (*exprt)(planner *ego, printer *p); /* ``export'' is a reserved
Chris@42 733 word in C++. */
Chris@42 734 int (*imprt)(planner *ego, scanner *sc);
Chris@42 735 } planner_adt;
Chris@42 736
Chris@42 737 /* hash table of solutions */
Chris@42 738 typedef struct {
Chris@42 739 solution *solutions;
Chris@42 740 unsigned hashsiz, nelem;
Chris@42 741
Chris@42 742 /* statistics */
Chris@42 743 int lookup, succ_lookup, lookup_iter;
Chris@42 744 int insert, insert_iter, insert_unknown;
Chris@42 745 int nrehash;
Chris@42 746 } hashtab;
Chris@42 747
Chris@42 748 typedef enum { COST_SUM, COST_MAX } cost_kind;
Chris@42 749
Chris@42 750 struct planner_s {
Chris@42 751 const planner_adt *adt;
Chris@42 752 void (*hook)(struct planner_s *plnr, plan *pln,
Chris@42 753 const problem *p, int optimalp);
Chris@42 754 double (*cost_hook)(const problem *p, double t, cost_kind k);
Chris@42 755 int (*wisdom_ok_hook)(const problem *p, flags_t flags);
Chris@42 756 void (*nowisdom_hook)(const problem *p);
Chris@42 757 wisdom_state_t (*bogosity_hook)(wisdom_state_t state, const problem *p);
Chris@42 758
Chris@42 759 /* solver descriptors */
Chris@42 760 slvdesc *slvdescs;
Chris@42 761 unsigned nslvdesc, slvdescsiz;
Chris@42 762 const char *cur_reg_nam;
Chris@42 763 int cur_reg_id;
Chris@42 764 int slvdescs_for_problem_kind[PROBLEM_LAST];
Chris@42 765
Chris@42 766 wisdom_state_t wisdom_state;
Chris@42 767
Chris@42 768 hashtab htab_blessed;
Chris@42 769 hashtab htab_unblessed;
Chris@42 770
Chris@42 771 int nthr;
Chris@42 772 flags_t flags;
Chris@42 773
Chris@42 774 crude_time start_time;
Chris@42 775 double timelimit; /* elapsed_since(start_time) at which to bail out */
Chris@42 776 int timed_out; /* whether most recent search timed out */
Chris@42 777 int need_timeout_check;
Chris@42 778
Chris@42 779 /* various statistics */
Chris@42 780 int nplan; /* number of plans evaluated */
Chris@42 781 double pcost, epcost; /* total pcost of measured/estimated plans */
Chris@42 782 int nprob; /* number of problems evaluated */
Chris@42 783 };
Chris@42 784
Chris@42 785 planner *X(mkplanner)(void);
Chris@42 786 void X(planner_destroy)(planner *ego);
Chris@42 787
Chris@42 788 /*
Chris@42 789 Iterate over all solvers. Read:
Chris@42 790
Chris@42 791 @article{ baker93iterators,
Chris@42 792 author = "Henry G. Baker, Jr.",
Chris@42 793 title = "Iterators: Signs of Weakness in Object-Oriented Languages",
Chris@42 794 journal = "{ACM} {OOPS} Messenger",
Chris@42 795 volume = "4",
Chris@42 796 number = "3",
Chris@42 797 pages = "18--25"
Chris@42 798 }
Chris@42 799 */
/*
 * FORALL_SOLVERS(ego, s, p, what): execute `what' once for each of the
 * ego->nslvdesc entries of ego->slvdescs, in index order.  Within
 * `what', `p' is the current slvdesc * and `s' is its solver (p->slv).
 *
 * `ego' is parenthesized so that any pointer-valued expression (not
 * just a plain identifier) can be passed.  `ego' is evaluated on every
 * iteration, so it should be free of side effects.  The expansion is a
 * brace-enclosed block.
 */
#define FORALL_SOLVERS(ego, s, p, what)                  \
{                                                        \
     unsigned _cnt;                                      \
     for (_cnt = 0; _cnt < (ego)->nslvdesc; ++_cnt) {    \
          slvdesc *p = (ego)->slvdescs + _cnt;           \
          solver *s = p->slv;                            \
          what;                                          \
     }                                                   \
}
Chris@42 809
/*
 * FORALL_SOLVERS_OF_KIND(kind, ego, s, p, what): execute `what' for
 * each solver descriptor on the chain for problem kind `kind'.  The
 * chain starts at index ego->slvdescs_for_problem_kind[kind], follows
 * p->next_for_same_problem_kind, and stops at the first negative index.
 * Within `what', `p' is the current slvdesc * and `s' is p->slv.
 *
 * `ego' is parenthesized so that any pointer-valued expression can be
 * passed.  The next index is read after `what' has run, so `what' must
 * not `continue' (that would skip the advance).
 */
#define FORALL_SOLVERS_OF_KIND(kind, ego, s, p, what)          \
{                                                              \
     int _cnt = (ego)->slvdescs_for_problem_kind[kind];        \
     while (_cnt >= 0) {                                       \
          slvdesc *p = (ego)->slvdescs + _cnt;                 \
          solver *s = p->slv;                                  \
          what;                                                \
          _cnt = p->next_for_same_problem_kind;                \
     }                                                         \
}
Chris@42 820
Chris@42 821
Chris@42 822 /* make plan, destroy problem */
Chris@42 823 plan *X(mkplan_d)(planner *ego, problem *p);
Chris@42 824 plan *X(mkplan_f_d)(planner *ego, problem *p,
Chris@42 825 unsigned l_set, unsigned u_set, unsigned u_reset);
Chris@42 826
Chris@42 827 /*-----------------------------------------------------------------------*/
Chris@42 828 /* stride.c: */
Chris@42 829
/* If PRECOMPUTE_ARRAY_INDICES is defined, precompute all strides.
   It is defined when __i386__ or __x86_64__ is defined, or _M_IX86 is
   at least 500, unless FFTW_LDOUBLE is defined.  _M_IX86 is tested
   with defined() first so that an undefined macro is never evaluated
   in the #if expression. */
#if (defined(__i386__) || defined(__x86_64__) \
     || (defined(_M_IX86) && _M_IX86 >= 500)) \
    && !defined(FFTW_LDOUBLE)
#define PRECOMPUTE_ARRAY_INDICES
#endif
Chris@42 834
Chris@42 835 extern const INT X(an_INT_guaranteed_to_be_zero);
Chris@42 836
Chris@42 837 #ifdef PRECOMPUTE_ARRAY_INDICES
Chris@42 838 typedef INT *stride;
/* WS(stride, i): element i of the stride array.  The array argument is
   parenthesized so that expressions such as `tbl + k' index correctly. */
#define WS(stride, i) ((stride)[i])
Chris@42 840 extern stride X(mkstride)(INT n, INT s);
Chris@42 841 void X(stride_destroy)(stride p);
/* hackery to prevent the compiler from copying the strides array
   onto the stack.  Reassigns x to x plus a value that is zero at run
   time; `nptr' is not used by this definition.  The whole expansion is
   parenthesized so it can be embedded safely in a larger expression. */
#define MAKE_VOLATILE_STRIDE(nptr, x) \
     ((x) = (x) + X(an_INT_guaranteed_to_be_zero))
Chris@42 845 #else
Chris@42 846
Chris@42 847 typedef INT stride;
/* WS(stride, i): offset of element i, i.e. stride times i.  Both
   arguments are parenthesized so that WS(s, a + b) means s * (a + b). */
#define WS(stride, i) ((stride) * (i))
/* With integer strides there is nothing to build or release: mkstride
   yields the stride value itself and stride_destroy merely evaluates
   and discards its argument.  Defined for each of the fftwf_, fftw_
   and fftwl_ name prefixes.  Expansions and arguments are
   parenthesized so expression arguments keep their grouping. */
#define fftwf_mkstride(n, stride) (stride)
#define fftw_mkstride(n, stride) (stride)
#define fftwl_mkstride(n, stride) (stride)
#define fftwf_stride_destroy(p) ((void) (p))
#define fftw_stride_destroy(p) ((void) (p))
#define fftwl_stride_destroy(p) ((void) (p))
Chris@42 855
Chris@42 856 /* hackery to prevent the compiler from ``optimizing'' induction
Chris@42 857 variables in codelet loops. The problem is that for each K and for
Chris@42 858 each expression of the form P[I + STRIDE * K] in a loop, most
Chris@42 859 compilers will try to lift an induction variable PK := &P[I + STRIDE * K].
Chris@42 860 For large values of K this behavior overflows the
Chris@42 861 register set, which is likely worse than doing the index computation
Chris@42 862 in the first place.
Chris@42 863
Chris@42 864 If we guess that there are more than
Chris@42 865 ESTIMATED_AVAILABLE_INDEX_REGISTERS such pointers, we deliberately confuse
Chris@42 866 the compiler by setting STRIDE ^= ZERO, where ZERO is a value guaranteed to
Chris@42 867 be 0, but the compiler does not know this.
Chris@42 868
   16 registers ought to be enough for anybody, or so the amd64 and ARM ISAs
   seem to imply.
Chris@42 871 */
#define ESTIMATED_AVAILABLE_INDEX_REGISTERS 16
/* Yields 0 when nptr does not exceed
   ESTIMATED_AVAILABLE_INDEX_REGISTERS; otherwise reassigns x to
   x ^ ZERO (ZERO being zero at run time) and yields that value.
   `nptr' is parenthesized so that expression arguments are compared as
   a whole. */
#define MAKE_VOLATILE_STRIDE(nptr, x)                     \
     ((nptr) <= ESTIMATED_AVAILABLE_INDEX_REGISTERS ?     \
        0 :                                               \
        ((x) = (x) ^ X(an_INT_guaranteed_to_be_zero)))
Chris@42 877 #endif /* PRECOMPUTE_ARRAY_INDICES */
Chris@42 878
Chris@42 879 /*-----------------------------------------------------------------------*/
Chris@42 880 /* solvtab.c */
Chris@42 881
Chris@42 882 struct solvtab_s { void (*reg)(planner *); const char *reg_nam; };
Chris@42 883 typedef struct solvtab_s solvtab[];
Chris@42 884 void X(solvtab_exec)(const solvtab tbl, planner *p);
Chris@42 885 #define SOLVTAB(s) { s, STRINGIZE(s) }
Chris@42 886 #define SOLVTAB_END { 0, 0 }
Chris@42 887
Chris@42 888 /*-----------------------------------------------------------------------*/
Chris@42 889 /* pickdim.c */
Chris@42 890 int X(pickdim)(int which_dim, const int *buddies, size_t nbuddies,
Chris@42 891 const tensor *sz, int oop, int *dp);
Chris@42 892
Chris@42 893 /*-----------------------------------------------------------------------*/
Chris@42 894 /* twiddle.c */
Chris@42 895 /* little language to express twiddle factors computation */
enum {
     TW_COS = 0,
     TW_SIN = 1,
     TW_CEXP = 2,
     TW_NEXT = 3,
     TW_FULL = 4,
     TW_HALF = 5
};
Chris@42 898
/* One instruction of the twiddle-factor language.
   NOTE(review): `op' presumably holds one of the TW_* codes; the
   meaning of `v' and `i' is defined by twiddle.c -- confirm there. */
typedef struct {
     unsigned char op;
     signed char v;
     short i;
} tw_instr;
Chris@42 904
Chris@42 905 typedef struct twid_s {
Chris@42 906 R *W; /* array of twiddle factors */
Chris@42 907 INT n, r, m; /* transform order, radix, # twiddle rows */
Chris@42 908 int refcnt;
Chris@42 909 const tw_instr *instr;
Chris@42 910 struct twid_s *cdr;
Chris@42 911 enum wakefulness wakefulness;
Chris@42 912 } twid;
Chris@42 913
Chris@42 914 INT X(twiddle_length)(INT r, const tw_instr *p);
Chris@42 915 void X(twiddle_awake)(enum wakefulness wakefulness,
Chris@42 916 twid **pp, const tw_instr *instr, INT n, INT r, INT m);
Chris@42 917
Chris@42 918 /*-----------------------------------------------------------------------*/
Chris@42 919 /* trig.c */
/* trigreal: long double if TRIGREAL_IS_LONG_DOUBLE is defined, else
   __float128 if TRIGREAL_IS_QUAD is defined, else double. */
#ifdef TRIGREAL_IS_LONG_DOUBLE
typedef long double trigreal;
#else
#  ifdef TRIGREAL_IS_QUAD
typedef __float128 trigreal;
#  else
typedef double trigreal;
#  endif
#endif
Chris@42 927
Chris@42 928 typedef struct triggen_s triggen;
Chris@42 929
Chris@42 930 struct triggen_s {
Chris@42 931 void (*cexp)(triggen *t, INT m, R *result);
Chris@42 932 void (*cexpl)(triggen *t, INT m, trigreal *result);
Chris@42 933 void (*rotate)(triggen *p, INT m, R xr, R xi, R *res);
Chris@42 934
Chris@42 935 INT twshft;
Chris@42 936 INT twradix;
Chris@42 937 INT twmsk;
Chris@42 938 trigreal *W0, *W1;
Chris@42 939 INT n;
Chris@42 940 };
Chris@42 941
Chris@42 942 triggen *X(mktriggen)(enum wakefulness wakefulness, INT n);
Chris@42 943 void X(triggen_destroy)(triggen *p);
Chris@42 944
Chris@42 945 /*-----------------------------------------------------------------------*/
Chris@42 946 /* primes.c: */
Chris@42 947
/* MULMOD(x, y, p): (x * y) mod p.  For non-negative x and y with
   x + y <= 92681 the product is at most 46340 * 46341 = 2147441940,
   which is below 2^31, so it is formed directly; any other pair is
   handed to X(safe_mulmod).  Arguments are evaluated more than once. */
#define MULMOD(x, y, p) \
   (((x) > 92681 - (y)) ? X(safe_mulmod)(x, y, p) : ((x) * (y)) % (p))
Chris@42 950
Chris@42 951 INT X(safe_mulmod)(INT x, INT y, INT p);
Chris@42 952 INT X(power_mod)(INT n, INT m, INT p);
Chris@42 953 INT X(find_generator)(INT p);
Chris@42 954 INT X(first_divisor)(INT n);
Chris@42 955 int X(is_prime)(INT n);
Chris@42 956 INT X(next_prime)(INT n);
Chris@42 957 int X(factors_into)(INT n, const INT *primes);
Chris@42 958 int X(factors_into_small_primes)(INT n);
Chris@42 959 INT X(choose_radix)(INT r, INT n);
Chris@42 960 INT X(isqrt)(INT n);
Chris@42 961 INT X(modulo)(INT a, INT n);
Chris@42 962
/* min prime for which generic becomes bad */
#define GENERIC_MIN_BAD 173

/* Thresholds below which certain solvers are considered SLOW.  These
   are guesses believed to be conservative. */
#define GENERIC_MAX_SLOW   16
#define RADER_MAX_SLOW     32
#define BLUESTEIN_MAX_SLOW 24
Chris@42 970
Chris@42 971 /*-----------------------------------------------------------------------*/
Chris@42 972 /* rader.c: */
Chris@42 973 typedef struct rader_tls rader_tl;
Chris@42 974
Chris@42 975 void X(rader_tl_insert)(INT k1, INT k2, INT k3, R *W, rader_tl **tl);
Chris@42 976 R *X(rader_tl_find)(INT k1, INT k2, INT k3, rader_tl *t);
Chris@42 977 void X(rader_tl_delete)(R *W, rader_tl **tl);
Chris@42 978
Chris@42 979 /*-----------------------------------------------------------------------*/
Chris@42 980 /* copy/transposition routines */
Chris@42 981
Chris@42 982 /* lower bound to the cache size, for tiled routines */
Chris@42 983 #define CACHESIZE 8192
Chris@42 984
Chris@42 985 INT X(compute_tilesz)(INT vl, int how_many_tiles_in_cache);
Chris@42 986
Chris@42 987 void X(tile2d)(INT n0l, INT n0u, INT n1l, INT n1u, INT tilesz,
Chris@42 988 void (*f)(INT n0l, INT n0u, INT n1l, INT n1u, void *args),
Chris@42 989 void *args);
Chris@42 990 void X(cpy1d)(R *I, R *O, INT n0, INT is0, INT os0, INT vl);
Chris@42 991 void X(zero1d_pair)(R *O0, R *O1, INT n0, INT os0);
Chris@42 992 void X(cpy2d)(R *I, R *O,
Chris@42 993 INT n0, INT is0, INT os0,
Chris@42 994 INT n1, INT is1, INT os1,
Chris@42 995 INT vl);
Chris@42 996 void X(cpy2d_ci)(R *I, R *O,
Chris@42 997 INT n0, INT is0, INT os0,
Chris@42 998 INT n1, INT is1, INT os1,
Chris@42 999 INT vl);
Chris@42 1000 void X(cpy2d_co)(R *I, R *O,
Chris@42 1001 INT n0, INT is0, INT os0,
Chris@42 1002 INT n1, INT is1, INT os1,
Chris@42 1003 INT vl);
Chris@42 1004 void X(cpy2d_tiled)(R *I, R *O,
Chris@42 1005 INT n0, INT is0, INT os0,
Chris@42 1006 INT n1, INT is1, INT os1,
Chris@42 1007 INT vl);
Chris@42 1008 void X(cpy2d_tiledbuf)(R *I, R *O,
Chris@42 1009 INT n0, INT is0, INT os0,
Chris@42 1010 INT n1, INT is1, INT os1,
Chris@42 1011 INT vl);
Chris@42 1012 void X(cpy2d_pair)(R *I0, R *I1, R *O0, R *O1,
Chris@42 1013 INT n0, INT is0, INT os0,
Chris@42 1014 INT n1, INT is1, INT os1);
Chris@42 1015 void X(cpy2d_pair_ci)(R *I0, R *I1, R *O0, R *O1,
Chris@42 1016 INT n0, INT is0, INT os0,
Chris@42 1017 INT n1, INT is1, INT os1);
Chris@42 1018 void X(cpy2d_pair_co)(R *I0, R *I1, R *O0, R *O1,
Chris@42 1019 INT n0, INT is0, INT os0,
Chris@42 1020 INT n1, INT is1, INT os1);
Chris@42 1021
Chris@42 1022 void X(transpose)(R *I, INT n, INT s0, INT s1, INT vl);
Chris@42 1023 void X(transpose_tiled)(R *I, INT n, INT s0, INT s1, INT vl);
Chris@42 1024 void X(transpose_tiledbuf)(R *I, INT n, INT s0, INT s1, INT vl);
Chris@42 1025
Chris@42 1026 typedef void (*transpose_func)(R *I, INT n, INT s0, INT s1, INT vl);
Chris@42 1027 typedef void (*cpy2d_func)(R *I, R *O,
Chris@42 1028 INT n0, INT is0, INT os0,
Chris@42 1029 INT n1, INT is1, INT os1,
Chris@42 1030 INT vl);
Chris@42 1031
Chris@42 1032 /*-----------------------------------------------------------------------*/
Chris@42 1033 /* misc stuff */
Chris@42 1034 void X(null_awake)(plan *ego, enum wakefulness wakefulness);
Chris@42 1035 double X(iestimate_cost)(const planner *, const plan *, const problem *);
Chris@42 1036
Chris@42 1037 #ifdef FFTW_RANDOM_ESTIMATOR
Chris@42 1038 extern unsigned X(random_estimate_seed);
Chris@42 1039 #endif
Chris@42 1040
Chris@42 1041 double X(measure_execution_time)(const planner *plnr,
Chris@42 1042 plan *pln, const problem *p);
Chris@42 1043 IFFTW_EXTERN int X(ialignment_of)(R *p);
Chris@42 1044 unsigned X(hash)(const char *s);
Chris@42 1045 INT X(nbuf)(INT n, INT vl, INT maxnbuf);
Chris@42 1046 int X(nbuf_redundant)(INT n, INT vl, size_t which,
Chris@42 1047 const INT *maxnbuf, size_t nmaxnbuf);
Chris@42 1048 INT X(bufdist)(INT n, INT vl);
Chris@42 1049 int X(toobig)(INT n);
Chris@42 1050 int X(ct_uglyp)(INT min_n, INT v, INT n, INT r);
Chris@42 1051
Chris@42 1052 #if HAVE_SIMD
Chris@42 1053 R *X(taint)(R *p, INT s);
Chris@42 1054 R *X(join_taint)(R *p1, R *p2);
/* Pointer tainting.  The taint occupies the two low-order bits of the
   pointer value: TAINTOF extracts them (mask 3) and UNTAINT clears
   them.  TAINT and JOIN_TAINT forward to X(taint) and X(join_taint). */
#define TAINT(p, s)        X(taint)(p, s)
#define UNTAINT(p)         ((R *) (((uintptr_t) (p)) & ~(uintptr_t)3))
#define TAINTOF(p)         (((uintptr_t)(p)) & 3)
#define JOIN_TAINT(p1, p2) X(join_taint)(p1, p2)
Chris@42 1059 #else
/* No-op versions of the taint macros: pointers pass through unchanged
   and the taint is always 0.  The `s' and `p2' arguments (and the
   argument of TAINTOF) are not evaluated.  Every pointer expansion is
   parenthesized so that e.g. *JOIN_TAINT(p + 1, q) dereferences
   p + 1. */
#define TAINT(p, s) (p)
#define UNTAINT(p) (p)
#define TAINTOF(p) 0
#define JOIN_TAINT(p1, p2) (p1)
Chris@42 1064 #endif
Chris@42 1065
/* ASSERT_ALIGNED_DOUBLE: when FFTW_DEBUG_ALIGNMENT is defined, declare
   a local double and check with CK that the low three bits of its
   address are zero; otherwise expand to nothing.  The probe variable
   avoids a double-underscore name, which is reserved for the
   implementation. */
#ifdef FFTW_DEBUG_ALIGNMENT
#  define ASSERT_ALIGNED_DOUBLE {                               \
     double aligned_double_probe_;                              \
     CK(!(((uintptr_t) &aligned_double_probe_) & 0x7));         \
}
#else
#  define ASSERT_ALIGNED_DOUBLE
#endif /* FFTW_DEBUG_ALIGNMENT */
Chris@42 1074
Chris@42 1075
Chris@42 1076
Chris@42 1077 /*-----------------------------------------------------------------------*/
Chris@42 1078 /* macros used in codelets to reduce source code size */
Chris@42 1079
Chris@42 1080 typedef R E; /* internal precision of codelets. */
Chris@42 1081
/* K(x): the literal x converted to E.  An `L' suffix is pasted onto x
   when FFTW_LDOUBLE is defined, a `Q' suffix when FFTW_QUAD is defined
   instead, and no suffix otherwise. */
#ifdef FFTW_LDOUBLE
#  define K(x) ((E) x##L)
#else
#  ifdef FFTW_QUAD
#    define K(x) ((E) x##Q)
#  else
#    define K(x) ((E) x)
#  endif
#endif

/* DK(name, value): declare the constant `name' of type E holding
   K(value). */
#define DK(name, value) const E name = K(value)
Chris@42 1090
Chris@42 1091 /* FMA macros */
Chris@42 1092
Chris@42 1093 #if defined(__GNUC__) && (defined(__powerpc__) || defined(__ppc__) || defined(_POWER))
Chris@42 1094 /* The obvious expression a * b + c does not work. If both x = a * b
Chris@42 1095 + c and y = a * b - c appear in the source, gcc computes t = a * b,
Chris@42 1096 x = t + c, y = t - c, thus destroying the fma.
Chris@42 1097
Chris@42 1098 This peculiar coding seems to do the right thing on all of
Chris@42 1099 gcc-2.95, gcc-3.1, gcc-3.2, and gcc-3.3. It does the right thing
Chris@42 1100 on gcc-3.4 -fno-web (because the ``web'' pass splits the variable
Chris@42 1101 `x' for the single-assignment form).
Chris@42 1102
Chris@42 1103 However, gcc-4.0 is a formidable adversary which succeeds in
Chris@42 1104 pessimizing two fma's into one multiplication and two additions.
Chris@42 1105 It does it very early in the game---before the optimization passes
Chris@42 1106 even start. The only real workaround seems to use fake inline asm
Chris@42 1107 such as
Chris@42 1108
Chris@42 1109 asm ("# confuse gcc %0" : "=f"(a) : "0"(a));
Chris@42 1110 return a * b + c;
Chris@42 1111
Chris@42 1112 in each of the FMA, FMS, FNMA, and FNMS functions. However, this
Chris@42 1113 does not solve the problem either, because two equal asm statements
Chris@42 1114 count as a common subexpression! One must use *different* fake asm
Chris@42 1115 statements:
Chris@42 1116
Chris@42 1117 in FMA:
Chris@42 1118 asm ("# confuse gcc for fma %0" : "=f"(a) : "0"(a));
Chris@42 1119
Chris@42 1120 in FMS:
Chris@42 1121 asm ("# confuse gcc for fms %0" : "=f"(a) : "0"(a));
Chris@42 1122
Chris@42 1123 etc.
Chris@42 1124
Chris@42 1125 After these changes, gcc recalcitrantly generates the fma that was
Chris@42 1126 in the source to begin with. However, the extra asm() cruft
Chris@42 1127 confuses other passes of gcc, notably the instruction scheduler.
Chris@42 1128 (Of course, one could also generate the fma directly via inline
Chris@42 1129 asm, but this confuses the scheduler even more.)
Chris@42 1130
Chris@42 1131 Steven and I have submitted more than one bug report to the gcc
Chris@42 1132 mailing list over the past few years, to no effect. Thus, I give
Chris@42 1133 up. gcc-4.0 can go to hell. I'll wait at least until gcc-4.3 is
Chris@42 1134 out before touching this crap again.
Chris@42 1135 */
/* Returns a * b + c.  The product is stored in x and the addition is
   a separate statement; this shape is deliberate (see the discussion
   of gcc's fma handling above), so do not fold it into one
   expression. */
static __inline__ E FMA(E a, E b, E c)
{
     E x = a * b;
     x = x + c;
     return x;
}
Chris@42 1142
/* Returns a * b - c, written as a product followed by a separate
   subtraction statement (same deliberate two-step shape as FMA). */
static __inline__ E FMS(E a, E b, E c)
{
     E x = a * b;
     x = x - c;
     return x;
}
Chris@42 1149
/* Returns -(a * b + c): the product is formed first, then the sum is
   negated in a separate statement. */
static __inline__ E FNMA(E a, E b, E c)
{
     E x = a * b;
     x = - (x + c);
     return x;
}
Chris@42 1156
/* Returns -(a * b - c): the product is formed first, then the
   difference is negated in a separate statement. */
static __inline__ E FNMS(E a, E b, E c)
{
     E x = a * b;
     x = - (x - c);
     return x;
}
Chris@42 1163 #else
/* Plain-expression versions of the multiply-add helpers:
     FMA  = a * b + c       FMS  = a * b - c
     FNMA = -(a * b + c)    FNMS = c - a * b
   Every argument is evaluated exactly once. */
#define FMA(a, b, c)   (((a) * (b)) + (c))
#define FMS(a, b, c)   (((a) * (b)) - (c))
#define FNMA(a, b, c)  (- (((a) * (b)) + (c)))
#define FNMS(a, b, c)  ((c) - ((a) * (b)))
Chris@42 1168 #endif
Chris@42 1169
Chris@42 1170 #ifdef __cplusplus
Chris@42 1171 } /* extern "C" */
Chris@42 1172 #endif /* __cplusplus */
Chris@42 1173
Chris@42 1174 #endif /* __IFFTW_H__ */