annotate src/fftw-3.3.5/kernel/ifftw.h @ 83:ae30d91d2ffe

Replace these with versions built using an older toolset (so as to avoid ABI incompatibilities when linking on Ubuntu 14.04 for packaging purposes)
author Chris Cannam
date Fri, 07 Feb 2020 11:51:13 +0000
parents 2cd0e3b3e1fd
children
rev   line source
Chris@42 1 /*
Chris@42 2 * Copyright (c) 2003, 2007-14 Matteo Frigo
Chris@42 3 * Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
Chris@42 4 *
Chris@42 5 * This program is free software; you can redistribute it and/or modify
Chris@42 6 * it under the terms of the GNU General Public License as published by
Chris@42 7 * the Free Software Foundation; either version 2 of the License, or
Chris@42 8 * (at your option) any later version.
Chris@42 9 *
Chris@42 10 * This program is distributed in the hope that it will be useful,
Chris@42 11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
Chris@42 12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
Chris@42 13 * GNU General Public License for more details.
Chris@42 14 *
Chris@42 15 * You should have received a copy of the GNU General Public License
Chris@42 16 * along with this program; if not, write to the Free Software
Chris@42 17 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
Chris@42 18 *
Chris@42 19 */
Chris@42 20
Chris@42 21
Chris@42 22 /* FFTW internal header file */
Chris@42 23 #ifndef __IFFTW_H__
Chris@42 24 #define __IFFTW_H__
Chris@42 25
Chris@42 26 #include "config.h"
Chris@42 27
Chris@42 28 #include <stdlib.h> /* size_t */
Chris@42 29 #include <stdarg.h> /* va_list */
Chris@42 30 #include <stddef.h> /* ptrdiff_t */
Chris@42 31 #include <limits.h> /* INT_MAX */
Chris@42 32
Chris@42 33 #if HAVE_SYS_TYPES_H
Chris@42 34 # include <sys/types.h>
Chris@42 35 #endif
Chris@42 36
Chris@42 37 #if HAVE_STDINT_H
Chris@42 38 # include <stdint.h> /* uintptr_t, maybe */
Chris@42 39 #endif
Chris@42 40
Chris@42 41 #if HAVE_INTTYPES_H
Chris@42 42 # include <inttypes.h> /* uintptr_t, maybe */
Chris@42 43 #endif
Chris@42 44
Chris@42 45 #ifdef __cplusplus
Chris@42 46 extern "C"
Chris@42 47 {
Chris@42 48 #endif /* __cplusplus */
Chris@42 49
Chris@42 50 /* Windows annoyances -- since tests/hook.c uses some internal
Chris@42 51 FFTW functions, we need to give them the dllexport attribute
Chris@42 52 under Windows when compiling as a DLL (see api/fftw3.h). */
Chris@42 53 #if defined(FFTW_EXTERN)
Chris@42 54 # define IFFTW_EXTERN FFTW_EXTERN
Chris@42 55 #elif (defined(FFTW_DLL) || defined(DLL_EXPORT)) \
Chris@42 56 && (defined(_WIN32) || defined(__WIN32__))
Chris@42 57 # define IFFTW_EXTERN extern __declspec(dllexport)
Chris@42 58 #else
Chris@42 59 # define IFFTW_EXTERN extern
Chris@42 60 #endif
Chris@42 61
Chris@42 62 /* determine precision and name-mangling scheme */
Chris@42 63 #define CONCAT(prefix, name) prefix ## name
Chris@42 64 #if defined(FFTW_SINGLE)
Chris@42 65 typedef float R;
Chris@42 66 # define X(name) CONCAT(fftwf_, name)
Chris@42 67 #elif defined(FFTW_LDOUBLE)
Chris@42 68 typedef long double R;
Chris@42 69 # define X(name) CONCAT(fftwl_, name)
Chris@42 70 # define TRIGREAL_IS_LONG_DOUBLE
Chris@42 71 #elif defined(FFTW_QUAD)
Chris@42 72 typedef __float128 R;
Chris@42 73 # define X(name) CONCAT(fftwq_, name)
Chris@42 74 # define TRIGREAL_IS_QUAD
Chris@42 75 #else
Chris@42 76 typedef double R;
Chris@42 77 # define X(name) CONCAT(fftw_, name)
Chris@42 78 #endif
Chris@42 79
Chris@42 80 /*
Chris@42 81 integral type large enough to contain a stride (what ``int'' should
Chris@42 82 have been in the first place).
Chris@42 83 */
Chris@42 84 typedef ptrdiff_t INT;
Chris@42 85
/* Dummy use of an otherwise unused parameter to silence compiler warnings.
   The argument is parenthesized so that the cast applies to the whole
   argument expression rather than only to its first operand. */
#define UNUSED(x) (void)(x)
Chris@42 88
Chris@42 89 #define NELEM(array) ((sizeof(array) / sizeof((array)[0])))
Chris@42 90
Chris@42 91 #define FFT_SIGN (-1) /* sign convention for forward transforms */
Chris@42 92 extern void X(extract_reim)(int sign, R *c, R **r, R **i);
Chris@42 93
Chris@42 94 #define REGISTER_SOLVER(p, s) X(solver_register)(p, s)
Chris@42 95
Chris@42 96 #define STRINGIZEx(x) #x
Chris@42 97 #define STRINGIZE(x) STRINGIZEx(x)
Chris@42 98 #define CIMPLIES(ante, post) (!(ante) || (post))
Chris@42 99
Chris@42 100 /* define HAVE_SIMD if any simd extensions are supported */
Chris@42 101 #if defined(HAVE_SSE) || defined(HAVE_SSE2) || \
Chris@42 102 defined(HAVE_AVX) || defined(HAVE_AVX_128_FMA) || \
Chris@42 103 defined(HAVE_AVX2) || defined(HAVE_AVX512) || \
Chris@42 104 defined(HAVE_KCVI) || \
Chris@42 105 defined(HAVE_ALTIVEC) || defined(HAVE_VSX) || \
Chris@42 106 defined(HAVE_MIPS_PS) || \
Chris@42 107 defined(HAVE_GENERIC_SIMD128) || defined(HAVE_GENERIC_SIMD256)
Chris@42 108 #define HAVE_SIMD 1
Chris@42 109 #else
Chris@42 110 #define HAVE_SIMD 0
Chris@42 111 #endif
Chris@42 112
Chris@42 113 extern int X(have_simd_sse2)(void);
Chris@42 114 extern int X(have_simd_avx)(void);
Chris@42 115 extern int X(have_simd_avx_128_fma)(void);
Chris@42 116 extern int X(have_simd_avx2)(void);
Chris@42 117 extern int X(have_simd_avx2_128)(void);
Chris@42 118 extern int X(have_simd_avx512)(void);
Chris@42 119 extern int X(have_simd_altivec)(void);
Chris@42 120 extern int X(have_simd_vsx)(void);
Chris@42 121 extern int X(have_simd_neon)(void);
Chris@42 122
Chris@42 123 /* forward declarations */
Chris@42 124 typedef struct problem_s problem;
Chris@42 125 typedef struct plan_s plan;
Chris@42 126 typedef struct solver_s solver;
Chris@42 127 typedef struct planner_s planner;
Chris@42 128 typedef struct printer_s printer;
Chris@42 129 typedef struct scanner_s scanner;
Chris@42 130
Chris@42 131 /*-----------------------------------------------------------------------*/
Chris@42 132 /* alloca: */
Chris@42 133 #if HAVE_SIMD
Chris@42 134 # if defined(HAVE_KCVI) || defined(HAVE_AVX512)
Chris@42 135 # define MIN_ALIGNMENT 64
Chris@42 136 # elif defined(HAVE_AVX) || defined(HAVE_AVX2) || defined(HAVE_GENERIC_SIMD256)
Chris@42 137 # define MIN_ALIGNMENT 32 /* best alignment for AVX, conservative for
Chris@42 138 * everything else */
Chris@42 139 # else
Chris@42 140 /* Note that we cannot use 32-byte alignment for all SIMD. For
Chris@42 141 example, MacOS X malloc is 16-byte aligned, but there was no
Chris@42 142 posix_memalign in MacOS X until version 10.6. */
Chris@42 143 # define MIN_ALIGNMENT 16
Chris@42 144 # endif
Chris@42 145 #endif
Chris@42 146
Chris@42 147 #if defined(HAVE_ALLOCA) && defined(FFTW_ENABLE_ALLOCA)
Chris@42 148 /* use alloca if available and enabled through FFTW_ENABLE_ALLOCA */
Chris@42 149
Chris@42 150 #ifndef alloca
Chris@42 151 #ifdef __GNUC__
Chris@42 152 # define alloca __builtin_alloca
Chris@42 153 #else
Chris@42 154 # ifdef _MSC_VER
Chris@42 155 # include <malloc.h>
Chris@42 156 # define alloca _alloca
Chris@42 157 # else
Chris@42 158 # if HAVE_ALLOCA_H
Chris@42 159 # include <alloca.h>
Chris@42 160 # else
Chris@42 161 # ifdef _AIX
Chris@42 162 #pragma alloca
Chris@42 163 # else
Chris@42 164 # ifndef alloca /* predefined by HP cc +Olibcalls */
Chris@42 165 void *alloca(size_t);
Chris@42 166 # endif
Chris@42 167 # endif
Chris@42 168 # endif
Chris@42 169 # endif
Chris@42 170 #endif
Chris@42 171 #endif
Chris@42 172
Chris@42 173 # ifdef MIN_ALIGNMENT /* over-allocate by MIN_ALIGNMENT bytes, then round p up to a MIN_ALIGNMENT boundary */
Chris@42 174 # define STACK_MALLOC(T, p, n) \
Chris@42 175 { \
Chris@42 176 p = (T)alloca((n) + MIN_ALIGNMENT); \
Chris@42 177 p = (T)(((uintptr_t)p + (MIN_ALIGNMENT - 1)) & \
Chris@42 178 (~(uintptr_t)(MIN_ALIGNMENT - 1))); \
Chris@42 179 }
Chris@42 180 # define STACK_FREE(n) /* expands to nothing in the alloca case */
Chris@42 181 # else /* alloca in use && !defined(MIN_ALIGNMENT): no alignment fix-up */
Chris@42 182 # define STACK_MALLOC(T, p, n) p = (T)alloca(n)
Chris@42 183 # define STACK_FREE(n) /* expands to nothing in the alloca case */
Chris@42 184 # endif
Chris@42 185
Chris@42 186 #else /* ! (HAVE_ALLOCA && FFTW_ENABLE_ALLOCA) */
Chris@42 187 /* use malloc instead of alloca; the argument of STACK_FREE is the pointer to release */
Chris@42 188 # define STACK_MALLOC(T, p, n) p = (T)MALLOC(n, OTHER)
Chris@42 189 # define STACK_FREE(n) X(ifree)(n)
Chris@42 190 #endif /* ! (HAVE_ALLOCA && FFTW_ENABLE_ALLOCA) */
Chris@42 191
/* Allocation of temporary buffers.  Requests smaller than MAX_STACK_ALLOC
   go through STACK_MALLOC (hopefully reducing to alloca()); anything
   larger goes through MALLOC with the BUFFERS tag. */

/* 64KiB ought to be enough for anybody */
#define MAX_STACK_ALLOC ((size_t)64 * 1024)

/* BUF_ALLOC(T, p, n): set p, of pointer type T, to a buffer of n bytes.
   n is parenthesized in the size test so that expression arguments
   (e.g. a conditional expression) are compared as a whole against
   MAX_STACK_ALLOC.  Note that n is evaluated more than once. */
#define BUF_ALLOC(T, p, n)                      \
{                                               \
     if ((n) < MAX_STACK_ALLOC) {               \
          STACK_MALLOC(T, p, n);                \
     } else {                                   \
          p = (T)MALLOC(n, BUFFERS);            \
     }                                          \
}

/* BUF_FREE(p, n): release a buffer obtained from BUF_ALLOC.  n must be
   the same size that was passed to BUF_ALLOC, because it selects which
   of the two release paths is taken. */
#define BUF_FREE(p, n)                          \
{                                               \
     if ((n) < MAX_STACK_ALLOC) {               \
          STACK_FREE(p);                        \
     } else {                                   \
          X(ifree)(p);                          \
     }                                          \
}
Chris@42 215
Chris@42 216 /*-----------------------------------------------------------------------*/
Chris@42 217 /* define uintptr_t if it is not already defined */
Chris@42 218
Chris@42 219 #ifndef HAVE_UINTPTR_T
Chris@42 220 # if SIZEOF_VOID_P == 0
Chris@42 221 # error sizeof void* is unknown!
Chris@42 222 # elif SIZEOF_UNSIGNED_INT == SIZEOF_VOID_P
Chris@42 223 typedef unsigned int uintptr_t;
Chris@42 224 # elif SIZEOF_UNSIGNED_LONG == SIZEOF_VOID_P
Chris@42 225 typedef unsigned long uintptr_t;
Chris@42 226 # elif SIZEOF_UNSIGNED_LONG_LONG == SIZEOF_VOID_P
Chris@42 227 typedef unsigned long long uintptr_t;
Chris@42 228 # else
Chris@42 229 # error no unsigned integer type matches void* sizeof!
Chris@42 230 # endif
Chris@42 231 #endif
Chris@42 232
Chris@42 233 /*-----------------------------------------------------------------------*/
/* We can do an optimization for copying pairs of (aligned) floats
   when in single precision if 2*float = double.

   FFTW_2R_IS_DOUBLE is evaluated here, once, and defined as a plain
   0/1 constant.  A macro whose expansion contains `defined' has
   undefined behavior when used in an #if directive in ISO C, so the
   test must not be deferred to the point of use. */
#if defined(FFTW_SINGLE) \
    && SIZEOF_FLOAT != 0 \
    && SIZEOF_DOUBLE == 2*SIZEOF_FLOAT
# define FFTW_2R_IS_DOUBLE 1
#else
# define FFTW_2R_IS_DOUBLE 0
#endif
Chris@42 240
Chris@42 241 #define DOUBLE_ALIGNED(p) ((((uintptr_t)(p)) % sizeof(double)) == 0)
Chris@42 242
Chris@42 243 /*-----------------------------------------------------------------------*/
Chris@42 244 /* assert.c: */
Chris@42 245 IFFTW_EXTERN void X(assertion_failed)(const char *s,
Chris@42 246 int line, const char *file);
Chris@42 247
Chris@42 248 /* always check */
Chris@42 249 #define CK(ex) \
Chris@42 250 (void)((ex) || (X(assertion_failed)(#ex, __LINE__, __FILE__), 0))
Chris@42 251
Chris@42 252 #ifdef FFTW_DEBUG
Chris@42 253 /* check only if debug enabled */
Chris@42 254 #define A(ex) \
Chris@42 255 (void)((ex) || (X(assertion_failed)(#ex, __LINE__, __FILE__), 0))
Chris@42 256 #else
Chris@42 257 #define A(ex) /* nothing */
Chris@42 258 #endif
Chris@42 259
Chris@42 260 extern void X(debug)(const char *format, ...);
Chris@42 261 #define D X(debug)
Chris@42 262
Chris@42 263 /*-----------------------------------------------------------------------*/
Chris@42 264 /* kalloc.c: */
Chris@42 265 extern void *X(kernel_malloc)(size_t n);
Chris@42 266 extern void X(kernel_free)(void *p);
Chris@42 267
Chris@42 268 /*-----------------------------------------------------------------------*/
Chris@42 269 /* alloc.c: */
Chris@42 270
Chris@42 271 /* objects allocated by malloc, for statistical purposes */
Chris@42 272 enum malloc_tag {
Chris@42 273 EVERYTHING,
Chris@42 274 PLANS,
Chris@42 275 SOLVERS,
Chris@42 276 PROBLEMS,
Chris@42 277 BUFFERS,
Chris@42 278 HASHT,
Chris@42 279 TENSORS,
Chris@42 280 PLANNERS,
Chris@42 281 SLVDESCS,
Chris@42 282 TWIDDLES,
Chris@42 283 STRIDES,
Chris@42 284 OTHER,
Chris@42 285 MALLOC_WHAT_LAST /* must be last */
Chris@42 286 };
Chris@42 287
Chris@42 288 IFFTW_EXTERN void X(ifree)(void *ptr);
Chris@42 289 extern void X(ifree0)(void *ptr);
Chris@42 290
Chris@42 291 #ifdef FFTW_DEBUG_MALLOC
Chris@42 292
Chris@42 293 IFFTW_EXTERN void *X(malloc_debug)(size_t n, enum malloc_tag what,
Chris@42 294 const char *file, int line);
Chris@42 295 #define MALLOC(n, what) X(malloc_debug)(n, what, __FILE__, __LINE__)
Chris@42 296 IFFTW_EXTERN void X(malloc_print_minfo)(int vrbose);
Chris@42 297
Chris@42 298 #else /* ! FFTW_DEBUG_MALLOC */
Chris@42 299
Chris@42 300 IFFTW_EXTERN void *X(malloc_plain)(size_t sz);
Chris@42 301 #define MALLOC(n, what) X(malloc_plain)(n)
Chris@42 302
Chris@42 303 #endif
Chris@42 304
Chris@42 305 #if defined(FFTW_DEBUG) && defined(FFTW_DEBUG_MALLOC) && (defined(HAVE_THREADS) || defined(HAVE_OPENMP))
Chris@42 306 extern int X(in_thread);
Chris@42 307 # define IN_THREAD X(in_thread)
Chris@42 308 # define THREAD_ON { int in_thread_save = X(in_thread); X(in_thread) = 1
Chris@42 309 # define THREAD_OFF X(in_thread) = in_thread_save; }
Chris@42 310 #else
Chris@42 311 # define IN_THREAD 0
Chris@42 312 # define THREAD_ON
Chris@42 313 # define THREAD_OFF
Chris@42 314 #endif
Chris@42 315
Chris@42 316 /*-----------------------------------------------------------------------*/
Chris@42 317 /* low-resolution clock */
Chris@42 318
Chris@42 319 #ifdef FAKE_CRUDE_TIME
Chris@42 320 typedef int crude_time;
Chris@42 321 #else
Chris@42 322 # if TIME_WITH_SYS_TIME
Chris@42 323 # include <sys/time.h>
Chris@42 324 # include <time.h>
Chris@42 325 # else
Chris@42 326 # if HAVE_SYS_TIME_H
Chris@42 327 # include <sys/time.h>
Chris@42 328 # else
Chris@42 329 # include <time.h>
Chris@42 330 # endif
Chris@42 331 # endif
Chris@42 332
Chris@42 333 # ifdef HAVE_BSDGETTIMEOFDAY
Chris@42 334 # ifndef HAVE_GETTIMEOFDAY
Chris@42 335 # define gettimeofday BSDgettimeofday
Chris@42 336 # define HAVE_GETTIMEOFDAY 1
Chris@42 337 # endif
Chris@42 338 # endif
Chris@42 339
Chris@42 340 # if defined(HAVE_GETTIMEOFDAY)
Chris@42 341 typedef struct timeval crude_time;
Chris@42 342 # else
Chris@42 343 typedef clock_t crude_time;
Chris@42 344 # endif
Chris@42 345 #endif /* else FAKE_CRUDE_TIME */
Chris@42 346
Chris@42 347 crude_time X(get_crude_time)(void);
Chris@42 348 double X(elapsed_since)(const planner *plnr, const problem *p,
Chris@42 349 crude_time t0); /* time in seconds since t0 */
Chris@42 350
Chris@42 351 /*-----------------------------------------------------------------------*/
Chris@42 352 /* ops.c: */
Chris@42 353 /*
Chris@42 354 * ops counter. The total number of additions is add + fma
Chris@42 355 * and the total number of multiplications is mul + fma.
Chris@42 356 * Total flops = add + mul + 2 * fma
Chris@42 357 */
Chris@42 358 typedef struct {
Chris@42 359 double add;
Chris@42 360 double mul;
Chris@42 361 double fma;
Chris@42 362 double other;
Chris@42 363 } opcnt;
Chris@42 364
Chris@42 365 void X(ops_zero)(opcnt *dst);
Chris@42 366 void X(ops_other)(INT o, opcnt *dst);
Chris@42 367 void X(ops_cpy)(const opcnt *src, opcnt *dst);
Chris@42 368
Chris@42 369 void X(ops_add)(const opcnt *a, const opcnt *b, opcnt *dst);
Chris@42 370 void X(ops_add2)(const opcnt *a, opcnt *dst);
Chris@42 371
Chris@42 372 /* dst = m * a + b */
Chris@42 373 void X(ops_madd)(INT m, const opcnt *a, const opcnt *b, opcnt *dst);
Chris@42 374
Chris@42 375 /* dst += m * a */
Chris@42 376 void X(ops_madd2)(INT m, const opcnt *a, opcnt *dst);
Chris@42 377
Chris@42 378
Chris@42 379 /*-----------------------------------------------------------------------*/
Chris@42 380 /* minmax.c: */
Chris@42 381 INT X(imax)(INT a, INT b);
Chris@42 382 INT X(imin)(INT a, INT b);
Chris@42 383
Chris@42 384 /*-----------------------------------------------------------------------*/
Chris@42 385 /* iabs.c: */
Chris@42 386 INT X(iabs)(INT a);
Chris@42 387
Chris@42 388 /* inline version */
Chris@42 389 #define IABS(x) (((x) < 0) ? (0 - (x)) : (x))
Chris@42 390
Chris@42 391 /*-----------------------------------------------------------------------*/
Chris@42 392 /* md5.c */
Chris@42 393
Chris@42 394 #if SIZEOF_UNSIGNED_INT >= 4
Chris@42 395 typedef unsigned int md5uint;
Chris@42 396 #else
Chris@42 397 typedef unsigned long md5uint; /* at least 32 bits as per C standard */
Chris@42 398 #endif
Chris@42 399
Chris@42 400 typedef md5uint md5sig[4];
Chris@42 401
Chris@42 402 typedef struct {
Chris@42 403 md5sig s; /* state and signature */
Chris@42 404
Chris@42 405 /* fields not meant to be used outside md5.c: */
Chris@42 406 unsigned char c[64]; /* stuff not yet processed */
Chris@42 407 unsigned l; /* total length. Should be 64 bits long, but this is
Chris@42 408 good enough for us */
Chris@42 409 } md5;
Chris@42 410
Chris@42 411 void X(md5begin)(md5 *p);
Chris@42 412 void X(md5putb)(md5 *p, const void *d_, size_t len);
Chris@42 413 void X(md5puts)(md5 *p, const char *s);
Chris@42 414 void X(md5putc)(md5 *p, unsigned char c);
Chris@42 415 void X(md5int)(md5 *p, int i);
Chris@42 416 void X(md5INT)(md5 *p, INT i);
Chris@42 417 void X(md5unsigned)(md5 *p, unsigned i);
Chris@42 418 void X(md5end)(md5 *p);
Chris@42 419
Chris@42 420 /*-----------------------------------------------------------------------*/
Chris@42 421 /* tensor.c: */
Chris@42 422 #define STRUCT_HACK_KR /* selects the dims[1] layout of tensor below */
Chris@42 423 #undef STRUCT_HACK_C99 /* the C99 flexible-array layout is not selected */
Chris@42 424
Chris@42 425 typedef struct {
Chris@42 426 INT n; /* presumably the length of this dimension -- confirm in tensor.c */
Chris@42 427 INT is; /* input stride */
Chris@42 428 INT os; /* output stride */
Chris@42 429 } iodim;
Chris@42 430
Chris@42 431 typedef struct {
Chris@42 432 int rnk; /* rank; may hold RNK_MINFTY, test with FINITE_RNK() */
Chris@42 433 #if defined(STRUCT_HACK_KR) /* pre-C99 "struct hack" */
Chris@42 434 iodim dims[1]; /* presumably over-allocated to hold rnk entries -- confirm in tensor.c */
Chris@42 435 #elif defined(STRUCT_HACK_C99)
Chris@42 436 iodim dims[]; /* C99 flexible array member */
Chris@42 437 #else
Chris@42 438 iodim *dims; /* dimension array reached through a pointer */
Chris@42 439 #endif
Chris@42 440 } tensor;
Chris@42 441
Chris@42 442 /*
Chris@42 443 Definition of rank -infinity.
Chris@42 444 This definition has the property that if you want rank 0 or 1,
Chris@42 445 you can simply test for rank <= 1. This is a common case.
Chris@42 446
Chris@42 447 A tensor of rank -infinity has size 0.
Chris@42 448 */
Chris@42 449 #define RNK_MINFTY INT_MAX
Chris@42 450 #define FINITE_RNK(rnk) ((rnk) != RNK_MINFTY)
Chris@42 451
Chris@42 452 typedef enum { INPLACE_IS, INPLACE_OS } inplace_kind;
Chris@42 453
Chris@42 454 tensor *X(mktensor)(int rnk);
Chris@42 455 tensor *X(mktensor_0d)(void);
Chris@42 456 tensor *X(mktensor_1d)(INT n, INT is, INT os);
Chris@42 457 tensor *X(mktensor_2d)(INT n0, INT is0, INT os0,
Chris@42 458 INT n1, INT is1, INT os1);
Chris@42 459 tensor *X(mktensor_3d)(INT n0, INT is0, INT os0,
Chris@42 460 INT n1, INT is1, INT os1,
Chris@42 461 INT n2, INT is2, INT os2);
Chris@42 462 tensor *X(mktensor_4d)(INT n0, INT is0, INT os0,
Chris@42 463 INT n1, INT is1, INT os1,
Chris@42 464 INT n2, INT is2, INT os2,
Chris@42 465 INT n3, INT is3, INT os3);
Chris@42 466 tensor *X(mktensor_5d)(INT n0, INT is0, INT os0,
Chris@42 467 INT n1, INT is1, INT os1,
Chris@42 468 INT n2, INT is2, INT os2,
Chris@42 469 INT n3, INT is3, INT os3,
Chris@42 470 INT n4, INT is4, INT os4);
Chris@42 471 INT X(tensor_sz)(const tensor *sz);
Chris@42 472 void X(tensor_md5)(md5 *p, const tensor *t);
Chris@42 473 INT X(tensor_max_index)(const tensor *sz);
Chris@42 474 INT X(tensor_min_istride)(const tensor *sz);
Chris@42 475 INT X(tensor_min_ostride)(const tensor *sz);
Chris@42 476 INT X(tensor_min_stride)(const tensor *sz);
Chris@42 477 int X(tensor_inplace_strides)(const tensor *sz);
Chris@42 478 int X(tensor_inplace_strides2)(const tensor *a, const tensor *b);
Chris@42 479 int X(tensor_strides_decrease)(const tensor *sz, const tensor *vecsz,
Chris@42 480 inplace_kind k);
Chris@42 481 tensor *X(tensor_copy)(const tensor *sz);
Chris@42 482 int X(tensor_kosherp)(const tensor *x);
Chris@42 483
Chris@42 484 tensor *X(tensor_copy_inplace)(const tensor *sz, inplace_kind k);
Chris@42 485 tensor *X(tensor_copy_except)(const tensor *sz, int except_dim);
Chris@42 486 tensor *X(tensor_copy_sub)(const tensor *sz, int start_dim, int rnk);
Chris@42 487 tensor *X(tensor_compress)(const tensor *sz);
Chris@42 488 tensor *X(tensor_compress_contiguous)(const tensor *sz);
Chris@42 489 tensor *X(tensor_append)(const tensor *a, const tensor *b);
Chris@42 490 void X(tensor_split)(const tensor *sz, tensor **a, int a_rnk, tensor **b);
Chris@42 491 int X(tensor_tornk1)(const tensor *t, INT *n, INT *is, INT *os);
Chris@42 492 void X(tensor_destroy)(tensor *sz);
Chris@42 493 void X(tensor_destroy2)(tensor *a, tensor *b);
Chris@42 494 void X(tensor_destroy4)(tensor *a, tensor *b, tensor *c, tensor *d);
Chris@42 495 void X(tensor_print)(const tensor *sz, printer *p);
Chris@42 496 int X(dimcmp)(const iodim *a, const iodim *b);
Chris@42 497 int X(tensor_equal)(const tensor *a, const tensor *b);
Chris@42 498 int X(tensor_inplace_locations)(const tensor *sz, const tensor *vecsz);
Chris@42 499
Chris@42 500 /*-----------------------------------------------------------------------*/
Chris@42 501 /* problem.c: */
Chris@42 502 enum {
Chris@42 503 /* a problem that cannot be solved */
Chris@42 504 PROBLEM_UNSOLVABLE,
Chris@42 505
Chris@42 506 PROBLEM_DFT,
Chris@42 507 PROBLEM_RDFT,
Chris@42 508 PROBLEM_RDFT2,
Chris@42 509
Chris@42 510 /* for mpi/ subdirectory */
Chris@42 511 PROBLEM_MPI_DFT,
Chris@42 512 PROBLEM_MPI_RDFT,
Chris@42 513 PROBLEM_MPI_RDFT2,
Chris@42 514 PROBLEM_MPI_TRANSPOSE,
Chris@42 515
Chris@42 516 PROBLEM_LAST
Chris@42 517 };
Chris@42 518
Chris@42 519 typedef struct {
Chris@42 520 int problem_kind;
Chris@42 521 void (*hash) (const problem *ego, md5 *p);
Chris@42 522 void (*zero) (const problem *ego);
Chris@42 523 void (*print) (const problem *ego, printer *p);
Chris@42 524 void (*destroy) (problem *ego);
Chris@42 525 } problem_adt;
Chris@42 526
Chris@42 527 struct problem_s {
Chris@42 528 const problem_adt *adt;
Chris@42 529 };
Chris@42 530
Chris@42 531 problem *X(mkproblem)(size_t sz, const problem_adt *adt);
Chris@42 532 void X(problem_destroy)(problem *ego);
Chris@42 533 problem *X(mkproblem_unsolvable)(void);
Chris@42 534
Chris@42 535 /*-----------------------------------------------------------------------*/
Chris@42 536 /* print.c */
Chris@42 537 struct printer_s {
Chris@42 538 void (*print)(printer *p, const char *format, ...);
Chris@42 539 void (*vprint)(printer *p, const char *format, va_list ap);
Chris@42 540 void (*putchr)(printer *p, char c);
Chris@42 541 void (*cleanup)(printer *p);
Chris@42 542 int indent;
Chris@42 543 int indent_incr;
Chris@42 544 };
Chris@42 545
Chris@42 546 printer *X(mkprinter)(size_t size,
Chris@42 547 void (*putchr)(printer *p, char c),
Chris@42 548 void (*cleanup)(printer *p));
Chris@42 549 IFFTW_EXTERN void X(printer_destroy)(printer *p);
Chris@42 550
Chris@42 551 /*-----------------------------------------------------------------------*/
Chris@42 552 /* scan.c */
Chris@42 553 struct scanner_s {
Chris@42 554 int (*scan)(scanner *sc, const char *format, ...);
Chris@42 555 int (*vscan)(scanner *sc, const char *format, va_list ap);
Chris@42 556 int (*getchr)(scanner *sc);
Chris@42 557 int ungotc;
Chris@42 558 };
Chris@42 559
Chris@42 560 scanner *X(mkscanner)(size_t size, int (*getchr)(scanner *sc));
Chris@42 561 void X(scanner_destroy)(scanner *sc);
Chris@42 562
Chris@42 563 /*-----------------------------------------------------------------------*/
Chris@42 564 /* plan.c: */
Chris@42 565
Chris@42 566 enum wakefulness {
Chris@42 567 SLEEPY,
Chris@42 568 AWAKE_ZERO,
Chris@42 569 AWAKE_SQRTN_TABLE,
Chris@42 570 AWAKE_SINCOS
Chris@42 571 };
Chris@42 572
Chris@42 573 typedef struct {
Chris@42 574 void (*solve)(const plan *ego, const problem *p);
Chris@42 575 void (*awake)(plan *ego, enum wakefulness wakefulness);
Chris@42 576 void (*print)(const plan *ego, printer *p);
Chris@42 577 void (*destroy)(plan *ego);
Chris@42 578 } plan_adt;
Chris@42 579
Chris@42 580 struct plan_s {
Chris@42 581 const plan_adt *adt;
Chris@42 582 opcnt ops;
Chris@42 583 double pcost;
Chris@42 584 enum wakefulness wakefulness; /* used for debugging only */
Chris@42 585 int could_prune_now_p;
Chris@42 586 };
Chris@42 587
Chris@42 588 plan *X(mkplan)(size_t size, const plan_adt *adt);
Chris@42 589 void X(plan_destroy_internal)(plan *ego);
Chris@42 590 IFFTW_EXTERN void X(plan_awake)(plan *ego, enum wakefulness wakefulness);
Chris@42 591 void X(plan_null_destroy)(plan *ego);
Chris@42 592
Chris@42 593 /*-----------------------------------------------------------------------*/
Chris@42 594 /* solver.c: */
Chris@42 595 typedef struct {
Chris@42 596 int problem_kind;
Chris@42 597 plan *(*mkplan)(const solver *ego, const problem *p, planner *plnr);
Chris@42 598 void (*destroy)(solver *ego);
Chris@42 599 } solver_adt;
Chris@42 600
Chris@42 601 struct solver_s {
Chris@42 602 const solver_adt *adt;
Chris@42 603 int refcnt;
Chris@42 604 };
Chris@42 605
Chris@42 606 solver *X(mksolver)(size_t size, const solver_adt *adt);
Chris@42 607 void X(solver_use)(solver *ego);
Chris@42 608 void X(solver_destroy)(solver *ego);
Chris@42 609 void X(solver_register)(planner *plnr, solver *s);
Chris@42 610
Chris@42 611 /* shorthand */
Chris@42 612 #define MKSOLVER(type, adt) (type *)X(mksolver)(sizeof(type), adt)
Chris@42 613
Chris@42 614 /*-----------------------------------------------------------------------*/
Chris@42 615 /* planner.c */
Chris@42 616
Chris@42 617 typedef struct slvdesc_s {
Chris@42 618 solver *slv;
Chris@42 619 const char *reg_nam;
Chris@42 620 unsigned nam_hash;
Chris@42 621 int reg_id;
Chris@42 622 int next_for_same_problem_kind;
Chris@42 623 } slvdesc;
Chris@42 624
Chris@42 625 typedef struct solution_s solution; /* opaque */
Chris@42 626
Chris@42 627 /* interpretation of L and U:
Chris@42 628
Chris@42 629 - if it returns a plan, the planner guarantees that all applicable
Chris@42 630 plans at least as impatient as U have been tried, and that each
Chris@42 631 plan in the solution is at least as impatient as L.
Chris@42 632
Chris@42 633 - if it returns 0, the planner guarantees to have tried all solvers
Chris@42 634 at least as impatient as L, and that none of them was applicable.
Chris@42 635
Chris@42 636 The structure is packed to fit into 64 bits.
Chris@42 637 */
Chris@42 638
Chris@42 639 typedef struct { /* bit-field widths: 20 + 3 + 9 + 20 + 12 = 64 */
Chris@42 640 unsigned l:20; /* impatience-flag mask read through PLNR_L() */
Chris@42 641 unsigned hash_info:3; /* presumably the BLESSING/H_VALID/H_LIVE bits -- confirm in planner.c */
Chris@42 642 # define BITS_FOR_TIMELIMIT 9
Chris@42 643 unsigned timelimit_impatience:BITS_FOR_TIMELIMIT; /* read through PLNR_TIMELIMIT_IMPATIENCE() */
Chris@42 644 unsigned u:20; /* impatience-flag mask read through PLNR_U() */
Chris@42 645
Chris@42 646 /* abstraction break: we store the solver here to pad the
Chris@42 647 structure to 64 bits. Otherwise, the struct is padded to 64
Chris@42 648 bits anyway, and another word is allocated for slvndx. */
Chris@42 649 # define BITS_FOR_SLVNDX 12
Chris@42 650 unsigned slvndx:BITS_FOR_SLVNDX; /* solver index, limited to BITS_FOR_SLVNDX bits */
Chris@42 651 } flags_t;
Chris@42 652
Chris@42 653 /* impatience flags */
Chris@42 654 enum {
Chris@42 655 BELIEVE_PCOST = 0x0001,
Chris@42 656 ESTIMATE = 0x0002,
Chris@42 657 NO_DFT_R2HC = 0x0004,
Chris@42 658 NO_SLOW = 0x0008,
Chris@42 659 NO_VRECURSE = 0x0010,
Chris@42 660 NO_INDIRECT_OP = 0x0020,
Chris@42 661 NO_LARGE_GENERIC = 0x0040,
Chris@42 662 NO_RANK_SPLITS = 0x0080,
Chris@42 663 NO_VRANK_SPLITS = 0x0100,
Chris@42 664 NO_NONTHREADED = 0x0200,
Chris@42 665 NO_BUFFERING = 0x0400,
Chris@42 666 NO_FIXED_RADIX_LARGE_N = 0x0800,
Chris@42 667 NO_DESTROY_INPUT = 0x1000,
Chris@42 668 NO_SIMD = 0x2000,
Chris@42 669 CONSERVE_MEMORY = 0x4000,
Chris@42 670 NO_DHT_R2HC = 0x8000,
Chris@42 671 NO_UGLY = 0x10000,
Chris@42 672 ALLOW_PRUNING = 0x20000
Chris@42 673 };
Chris@42 674
Chris@42 675 /* hashtable information: single-bit flags (three bits in total) */
Chris@42 676 enum {
Chris@42 677 BLESSING = 0x1u, /* save this entry */
Chris@42 678 H_VALID = 0x2u, /* valid hashtable entry */
Chris@42 679 H_LIVE = 0x4u /* entry is nonempty, implies H_VALID */
Chris@42 680 };
Chris@42 681
Chris@42 682 #define PLNR_L(plnr) ((plnr)->flags.l)
Chris@42 683 #define PLNR_U(plnr) ((plnr)->flags.u)
Chris@42 684 #define PLNR_TIMELIMIT_IMPATIENCE(plnr) ((plnr)->flags.timelimit_impatience)
Chris@42 685
Chris@42 686 #define ESTIMATEP(plnr) (PLNR_U(plnr) & ESTIMATE)
Chris@42 687 #define BELIEVE_PCOSTP(plnr) (PLNR_U(plnr) & BELIEVE_PCOST)
Chris@42 688 #define ALLOW_PRUNINGP(plnr) (PLNR_U(plnr) & ALLOW_PRUNING)
Chris@42 689
Chris@42 690 #define NO_INDIRECT_OP_P(plnr) (PLNR_L(plnr) & NO_INDIRECT_OP)
Chris@42 691 #define NO_LARGE_GENERICP(plnr) (PLNR_L(plnr) & NO_LARGE_GENERIC)
Chris@42 692 #define NO_RANK_SPLITSP(plnr) (PLNR_L(plnr) & NO_RANK_SPLITS)
Chris@42 693 #define NO_VRANK_SPLITSP(plnr) (PLNR_L(plnr) & NO_VRANK_SPLITS)
Chris@42 694 #define NO_VRECURSEP(plnr) (PLNR_L(plnr) & NO_VRECURSE)
Chris@42 695 #define NO_DFT_R2HCP(plnr) (PLNR_L(plnr) & NO_DFT_R2HC)
Chris@42 696 #define NO_SLOWP(plnr) (PLNR_L(plnr) & NO_SLOW)
Chris@42 697 #define NO_UGLYP(plnr) (PLNR_L(plnr) & NO_UGLY)
Chris@42 698 #define NO_FIXED_RADIX_LARGE_NP(plnr) \
Chris@42 699 (PLNR_L(plnr) & NO_FIXED_RADIX_LARGE_N)
Chris@42 700 #define NO_NONTHREADEDP(plnr) \
Chris@42 701 ((PLNR_L(plnr) & NO_NONTHREADED) && (plnr)->nthr > 1)
Chris@42 702
Chris@42 703 #define NO_DESTROY_INPUTP(plnr) (PLNR_L(plnr) & NO_DESTROY_INPUT)
Chris@42 704 #define NO_SIMDP(plnr) (PLNR_L(plnr) & NO_SIMD)
Chris@42 705 #define CONSERVE_MEMORYP(plnr) (PLNR_L(plnr) & CONSERVE_MEMORY)
Chris@42 706 #define NO_DHT_R2HCP(plnr) (PLNR_L(plnr) & NO_DHT_R2HC)
Chris@42 707 #define NO_BUFFERINGP(plnr) (PLNR_L(plnr) & NO_BUFFERING)
Chris@42 708
Chris@42 709 typedef enum { FORGET_ACCURSED, FORGET_EVERYTHING } amnesia;
Chris@42 710
Chris@42 711 typedef enum { /* how the planner is to treat wisdom */
Chris@42 712 /* WISDOM_NORMAL: planner may or may not use wisdom */
Chris@42 713 WISDOM_NORMAL,
Chris@42 714
Chris@42 715 /* WISDOM_ONLY: planner must use wisdom and must avoid searching */
Chris@42 716 WISDOM_ONLY,
Chris@42 717
Chris@42 718 /* WISDOM_IS_BOGUS: planner must return 0 as quickly as possible */
Chris@42 719 WISDOM_IS_BOGUS,
Chris@42 720
Chris@42 721 /* WISDOM_IGNORE_INFEASIBLE: planner ignores infeasible wisdom */
Chris@42 722 WISDOM_IGNORE_INFEASIBLE,
Chris@42 723
Chris@42 724 /* WISDOM_IGNORE_ALL: planner ignores all wisdom */
Chris@42 725 WISDOM_IGNORE_ALL
Chris@42 726 } wisdom_state_t;
Chris@42 727
Chris@42 728 typedef struct {
Chris@42 729 void (*register_solver)(planner *ego, solver *s);
Chris@42 730 plan *(*mkplan)(planner *ego, const problem *p);
Chris@42 731 void (*forget)(planner *ego, amnesia a);
Chris@42 732 void (*exprt)(planner *ego, printer *p); /* ``export'' is a reserved
Chris@42 733 word in C++. */
Chris@42 734 int (*imprt)(planner *ego, scanner *sc);
Chris@42 735 } planner_adt;
Chris@42 736
Chris@42 737 /* hash table of solutions */
Chris@42 738 typedef struct {
Chris@42 739 solution *solutions;
Chris@42 740 unsigned hashsiz, nelem;
Chris@42 741
Chris@42 742 /* statistics */
Chris@42 743 int lookup, succ_lookup, lookup_iter;
Chris@42 744 int insert, insert_iter, insert_unknown;
Chris@42 745 int nrehash;
Chris@42 746 } hashtab;
Chris@42 747
Chris@42 748 typedef enum { COST_SUM, COST_MAX } cost_kind;
Chris@42 749
Chris@42 750 struct planner_s {
Chris@42 751 const planner_adt *adt;
Chris@42 752 void (*hook)(struct planner_s *plnr, plan *pln,
Chris@42 753 const problem *p, int optimalp);
Chris@42 754 double (*cost_hook)(const problem *p, double t, cost_kind k);
Chris@42 755 int (*wisdom_ok_hook)(const problem *p, flags_t flags);
Chris@42 756 void (*nowisdom_hook)(const problem *p);
Chris@42 757 wisdom_state_t (*bogosity_hook)(wisdom_state_t state, const problem *p);
Chris@42 758
Chris@42 759 /* solver descriptors */
Chris@42 760 slvdesc *slvdescs;
Chris@42 761 unsigned nslvdesc, slvdescsiz;
Chris@42 762 const char *cur_reg_nam;
Chris@42 763 int cur_reg_id;
Chris@42 764 int slvdescs_for_problem_kind[PROBLEM_LAST];
Chris@42 765
Chris@42 766 wisdom_state_t wisdom_state;
Chris@42 767
Chris@42 768 hashtab htab_blessed;
Chris@42 769 hashtab htab_unblessed;
Chris@42 770
Chris@42 771 int nthr;
Chris@42 772 flags_t flags;
Chris@42 773
Chris@42 774 crude_time start_time;
Chris@42 775 double timelimit; /* elapsed_since(start_time) at which to bail out */
Chris@42 776 int timed_out; /* whether most recent search timed out */
Chris@42 777 int need_timeout_check;
Chris@42 778
Chris@42 779 /* various statistics */
Chris@42 780 int nplan; /* number of plans evaluated */
Chris@42 781 double pcost, epcost; /* total pcost of measured/estimated plans */
Chris@42 782 int nprob; /* number of problems evaluated */
Chris@42 783 };
Chris@42 784
Chris@42 785 planner *X(mkplanner)(void);
Chris@42 786 void X(planner_destroy)(planner *ego);
Chris@42 787
Chris@42 788 /*
Chris@42 789 Iterate over all solvers. Read:
Chris@42 790
Chris@42 791 @article{ baker93iterators,
Chris@42 792 author = "Henry G. Baker, Jr.",
Chris@42 793 title = "Iterators: Signs of Weakness in Object-Oriented Languages",
Chris@42 794 journal = "{ACM} {OOPS} Messenger",
Chris@42 795 volume = "4",
Chris@42 796 number = "3",
Chris@42 797 pages = "18--25"
Chris@42 798 }
Chris@42 799 */
Chris@42 800 #define FORALL_SOLVERS(ego, s, p, what) /* run "what" once per solver descriptor of planner ego, with p = the descriptor and s = p->slv; ego is parenthesized so any pointer expression may be passed */ \
Chris@42 801 { \
Chris@42 802 unsigned _cnt; \
Chris@42 803 for (_cnt = 0; _cnt < (ego)->nslvdesc; ++_cnt) { \
Chris@42 804 slvdesc *p = (ego)->slvdescs + _cnt; \
Chris@42 805 solver *s = p->slv; \
Chris@42 806 what; \
Chris@42 807 } \
Chris@42 808 }
Chris@42 809
Chris@42 810 #define FORALL_SOLVERS_OF_KIND(kind, ego, s, p, what) /* like FORALL_SOLVERS, but follows the per-kind chain: starts at slvdescs_for_problem_kind[kind] and advances through next_for_same_problem_kind until the index goes negative; ego is parenthesized so any pointer expression may be passed */ \
Chris@42 811 { \
Chris@42 812 int _cnt = (ego)->slvdescs_for_problem_kind[kind]; \
Chris@42 813 while (_cnt >= 0) { \
Chris@42 814 slvdesc *p = (ego)->slvdescs + _cnt; \
Chris@42 815 solver *s = p->slv; \
Chris@42 816 what; \
Chris@42 817 _cnt = p->next_for_same_problem_kind; \
Chris@42 818 } \
Chris@42 819 }
Chris@42 820
Chris@42 821
Chris@42 822 /* make plan, destroy problem */
Chris@42 823 plan *X(mkplan_d)(planner *ego, problem *p);
Chris@42 824 plan *X(mkplan_f_d)(planner *ego, problem *p,
Chris@42 825 unsigned l_set, unsigned u_set, unsigned u_reset);
Chris@42 826
Chris@42 827 /*-----------------------------------------------------------------------*/
Chris@42 828 /* stride.c: */
Chris@42 829
Chris@42 830 /* If PRECOMPUTE_ARRAY_INDICES is defined, precompute all strides. It is enabled when __i386__ or __x86_64__ is defined, or _M_IX86 >= 500, unless FFTW_LDOUBLE is defined. */
Chris@42 831 #if (defined(__i386__) || defined(__x86_64__) || _M_IX86 >= 500) && !defined(FFTW_LDOUBLE)
Chris@42 832 #define PRECOMPUTE_ARRAY_INDICES
Chris@42 833 #endif
Chris@42 834
Chris@42 835 extern const INT X(an_INT_guaranteed_to_be_zero);
Chris@42 836
Chris@42 837 #ifdef PRECOMPUTE_ARRAY_INDICES
Chris@42 838 typedef INT *stride; /* a stride is a table of INT, indexed by WS */
Chris@42 839 #define WS(stride, i) ((stride)[i])
Chris@42 840 extern stride X(mkstride)(INT n, INT s);
Chris@42 841 void X(stride_destroy)(stride p);
Chris@42 842 /* hackery to prevent the compiler from copying the strides array
Chris@42 843 onto the stack */
Chris@42 844 #define MAKE_VOLATILE_STRIDE(nptr, x) ((x) = (x) + X(an_INT_guaranteed_to_be_zero))
Chris@42 845 #else
Chris@42 846
Chris@42 847 typedef INT stride; /* a stride is a plain INT; WS multiplies it by the index */
Chris@42 848 #define WS(stride, i) ((stride) * (i))
Chris@42 849 #define fftwf_mkstride(n, stride) (stride)
Chris@42 850 #define fftw_mkstride(n, stride) (stride)
Chris@42 851 #define fftwl_mkstride(n, stride) (stride)
Chris@42 852 #define fftwf_stride_destroy(p) ((void) (p))
Chris@42 853 #define fftw_stride_destroy(p) ((void) (p))
Chris@42 854 #define fftwl_stride_destroy(p) ((void) (p))
Chris@42 855
Chris@42 856 /* hackery to prevent the compiler from ``optimizing'' induction
Chris@42 857 variables in codelet loops. The problem is that for each K and for
Chris@42 858 each expression of the form P[I + STRIDE * K] in a loop, most
Chris@42 859 compilers will try to lift an induction variable PK := &P[I + STRIDE * K].
Chris@42 860 For large values of K this behavior overflows the
Chris@42 861 register set, which is likely worse than doing the index computation
Chris@42 862 in the first place.
Chris@42 863
Chris@42 864 If we guess that there are more than
Chris@42 865 ESTIMATED_AVAILABLE_INDEX_REGISTERS such pointers, we deliberately confuse
Chris@42 866 the compiler by setting STRIDE ^= ZERO, where ZERO is a value guaranteed to
Chris@42 867 be 0, but the compiler does not know this.
Chris@42 868
Chris@42 869 16 registers ought to be enough for anybody, or so the amd64 and ARM ISAs
Chris@42 870 seem to imply.
Chris@42 871 */
Chris@42 872 #define ESTIMATED_AVAILABLE_INDEX_REGISTERS 16
Chris@42 873 #define MAKE_VOLATILE_STRIDE(nptr, x) \
Chris@42 874 ((nptr) <= ESTIMATED_AVAILABLE_INDEX_REGISTERS ? \
Chris@42 875 0 : \
Chris@42 876 ((x) = (x) ^ X(an_INT_guaranteed_to_be_zero)))
Chris@42 877 #endif /* PRECOMPUTE_ARRAY_INDICES */
Chris@42 878
Chris@42 879 /*-----------------------------------------------------------------------*/
Chris@42 880 /* solvtab.c */
Chris@42 881
Chris@42 882 struct solvtab_s { void (*reg)(planner *); const char *reg_nam; }; /* one table entry: a function taking a planner, plus a name string */
Chris@42 883 typedef struct solvtab_s solvtab[]; /* a solvtab is an array of such entries */
Chris@42 884 void X(solvtab_exec)(const solvtab tbl, planner *p);
Chris@42 885 #define SOLVTAB(s) { s, STRINGIZE(s) } /* entry initializer: function s, with STRINGIZE(s) as its name */
Chris@42 886 #define SOLVTAB_END { 0, 0 } /* all-zero entry; presumably the table terminator -- confirm in solvtab.c */
Chris@42 887
Chris@42 888 /*-----------------------------------------------------------------------*/
Chris@42 889 /* pickdim.c */
Chris@42 890 int X(pickdim)(int which_dim, const int *buddies, size_t nbuddies,
Chris@42 891 const tensor *sz, int oop, int *dp);
Chris@42 892
Chris@42 893 /*-----------------------------------------------------------------------*/
Chris@42 894 /* twiddle.c */
Chris@42 895 /* little language to express twiddle-factor computation */
Chris@42 896 enum { TW_COS = 0, TW_SIN = 1, TW_CEXP = 2, TW_NEXT = 3,
Chris@42 897 TW_FULL = 4, TW_HALF = 5 }; /* presumably the opcodes stored in tw_instr.op -- confirm in twiddle.c */
Chris@42 898
Chris@42 899 typedef struct { /* one instruction of the twiddle language */
Chris@42 900 unsigned char op;
Chris@42 901 signed char v;
Chris@42 902 short i;
Chris@42 903 } tw_instr;
Chris@42 904
Chris@42 905 typedef struct twid_s { /* twiddle-table record; cdr points to another twid, presumably a list link */
Chris@42 906 R *W; /* array of twiddle factors */
Chris@42 907 INT n, r, m; /* transform order, radix, # twiddle rows */
Chris@42 908 int refcnt; /* presumably a reference count -- TODO confirm */
Chris@42 909 const tw_instr *instr;
Chris@42 910 struct twid_s *cdr;
Chris@42 911 enum wakefulness wakefulness;
Chris@42 912 } twid;
Chris@42 913
Chris@42 914 INT X(twiddle_length)(INT r, const tw_instr *p);
Chris@42 915 void X(twiddle_awake)(enum wakefulness wakefulness,
Chris@42 916 twid **pp, const tw_instr *instr, INT n, INT r, INT m);
Chris@42 917
Chris@42 918 /*-----------------------------------------------------------------------*/
Chris@42 919 /* trig.c */
Chris@42 920 #if defined(TRIGREAL_IS_LONG_DOUBLE) /* trigreal: the floating type of the triggen tables W0/W1 and of the cexpl result below */
Chris@42 921 typedef long double trigreal;
Chris@42 922 #elif defined(TRIGREAL_IS_QUAD)
Chris@42 923 typedef __float128 trigreal;
Chris@42 924 #else
Chris@42 925 typedef double trigreal; /* default when neither TRIGREAL_IS_* macro is defined */
Chris@42 926 #endif
Chris@42 927
Chris@42 928 typedef struct triggen_s triggen;
Chris@42 929
Chris@42 930 struct triggen_s { /* trig generator object: three function pointers plus table data */
Chris@42 931 void (*cexp)(triggen *t, INT m, R *result); /* presumably stores its output through result as R values -- confirm in trig.c */
Chris@42 932 void (*cexpl)(triggen *t, INT m, trigreal *result); /* same parameters as cexp, but result is trigreal */
Chris@42 933 void (*rotate)(triggen *p, INT m, R xr, R xi, R *res);
Chris@42 934
Chris@42 935 INT twshft;
Chris@42 936 INT twradix;
Chris@42 937 INT twmsk;
Chris@42 938 trigreal *W0, *W1; /* two trigreal tables; their layout is defined by the implementation, not visible here */
Chris@42 939 INT n;
Chris@42 940 };
Chris@42 941
Chris@42 942 triggen *X(mktriggen)(enum wakefulness wakefulness, INT n);
Chris@42 943 void X(triggen_destroy)(triggen *p);
Chris@42 944
Chris@42 945 /*-----------------------------------------------------------------------*/
Chris@42 946 /* primes.c: */
Chris@42 947
Chris@42 948 #define MULMOD(x, y, p) /* product of x and y reduced mod p. When x + y <= 92681 (x, y assumed nonnegative -- confirm) the product is at most 46340 * 46341 = 2147441940 < 2^31, so it is computed directly; otherwise the work is handed to safe_mulmod. Arguments may be evaluated more than once. */ \
Chris@42 949 (((x) <= 92681 - (y)) ? ((x) * (y)) % (p) : X(safe_mulmod)(x, y, p))
Chris@42 950
Chris@42 951 INT X(safe_mulmod)(INT x, INT y, INT p);
Chris@42 952 INT X(power_mod)(INT n, INT m, INT p);
Chris@42 953 INT X(find_generator)(INT p);
Chris@42 954 INT X(first_divisor)(INT n);
Chris@42 955 int X(is_prime)(INT n);
Chris@42 956 INT X(next_prime)(INT n);
Chris@42 957 int X(factors_into)(INT n, const INT *primes);
Chris@42 958 int X(factors_into_small_primes)(INT n);
Chris@42 959 INT X(choose_radix)(INT r, INT n);
Chris@42 960 INT X(isqrt)(INT n);
Chris@42 961 INT X(modulo)(INT a, INT n);
Chris@42 962
Chris@42 963 #define GENERIC_MIN_BAD 173 /* min prime for which generic becomes bad */
Chris@42 964
Chris@42 965 /* thresholds below which certain solvers are considered SLOW. These are guesses
Chris@42 966 believed to be conservative */
Chris@42 967 #define GENERIC_MAX_SLOW 16
Chris@42 968 #define RADER_MAX_SLOW 32
Chris@42 969 #define BLUESTEIN_MAX_SLOW 24
Chris@42 970
Chris@42 971 /*-----------------------------------------------------------------------*/
Chris@42 972 /* rader.c: */
Chris@42 973 typedef struct rader_tls rader_tl;
Chris@42 974
Chris@42 975 void X(rader_tl_insert)(INT k1, INT k2, INT k3, R *W, rader_tl **tl);
Chris@42 976 R *X(rader_tl_find)(INT k1, INT k2, INT k3, rader_tl *t);
Chris@42 977 void X(rader_tl_delete)(R *W, rader_tl **tl);
Chris@42 978
Chris@42 979 /*-----------------------------------------------------------------------*/
Chris@42 980 /* copy/transposition routines */
Chris@42 981
Chris@42 982 /* lower bound to the cache size, for tiled routines */
Chris@42 983 #define CACHESIZE 8192
Chris@42 984
Chris@42 985 INT X(compute_tilesz)(INT vl, int how_many_tiles_in_cache);
Chris@42 986
Chris@42 987 void X(tile2d)(INT n0l, INT n0u, INT n1l, INT n1u, INT tilesz,
Chris@42 988 void (*f)(INT n0l, INT n0u, INT n1l, INT n1u, void *args),
Chris@42 989 void *args);
Chris@42 990 void X(cpy1d)(R *I, R *O, INT n0, INT is0, INT os0, INT vl);
Chris@42 991 void X(zero1d_pair)(R *O0, R *O1, INT n0, INT os0);
Chris@42 992 void X(cpy2d)(R *I, R *O,
Chris@42 993 INT n0, INT is0, INT os0,
Chris@42 994 INT n1, INT is1, INT os1,
Chris@42 995 INT vl);
Chris@42 996 void X(cpy2d_ci)(R *I, R *O,
Chris@42 997 INT n0, INT is0, INT os0,
Chris@42 998 INT n1, INT is1, INT os1,
Chris@42 999 INT vl);
Chris@42 1000 void X(cpy2d_co)(R *I, R *O,
Chris@42 1001 INT n0, INT is0, INT os0,
Chris@42 1002 INT n1, INT is1, INT os1,
Chris@42 1003 INT vl);
Chris@42 1004 void X(cpy2d_tiled)(R *I, R *O,
Chris@42 1005 INT n0, INT is0, INT os0,
Chris@42 1006 INT n1, INT is1, INT os1,
Chris@42 1007 INT vl);
Chris@42 1008 void X(cpy2d_tiledbuf)(R *I, R *O,
Chris@42 1009 INT n0, INT is0, INT os0,
Chris@42 1010 INT n1, INT is1, INT os1,
Chris@42 1011 INT vl);
Chris@42 1012 void X(cpy2d_pair)(R *I0, R *I1, R *O0, R *O1,
Chris@42 1013 INT n0, INT is0, INT os0,
Chris@42 1014 INT n1, INT is1, INT os1);
Chris@42 1015 void X(cpy2d_pair_ci)(R *I0, R *I1, R *O0, R *O1,
Chris@42 1016 INT n0, INT is0, INT os0,
Chris@42 1017 INT n1, INT is1, INT os1);
Chris@42 1018 void X(cpy2d_pair_co)(R *I0, R *I1, R *O0, R *O1,
Chris@42 1019 INT n0, INT is0, INT os0,
Chris@42 1020 INT n1, INT is1, INT os1);
Chris@42 1021
Chris@42 1022 void X(transpose)(R *I, INT n, INT s0, INT s1, INT vl);
Chris@42 1023 void X(transpose_tiled)(R *I, INT n, INT s0, INT s1, INT vl);
Chris@42 1024 void X(transpose_tiledbuf)(R *I, INT n, INT s0, INT s1, INT vl);
Chris@42 1025
Chris@42 1026 typedef void (*transpose_func)(R *I, INT n, INT s0, INT s1, INT vl); /* pointer type with the same signature as X(transpose), X(transpose_tiled) and X(transpose_tiledbuf) */
Chris@42 1027 typedef void (*cpy2d_func)(R *I, R *O, /* pointer type with the same signature as X(cpy2d) and its _ci, _co, _tiled and _tiledbuf variants */
Chris@42 1028 INT n0, INT is0, INT os0,
Chris@42 1029 INT n1, INT is1, INT os1,
Chris@42 1030 INT vl);
Chris@42 1031
Chris@42 1032 /*-----------------------------------------------------------------------*/
Chris@42 1033 /* misc stuff */
Chris@42 1034 void X(null_awake)(plan *ego, enum wakefulness wakefulness);
Chris@42 1035 double X(iestimate_cost)(const planner *, const plan *, const problem *);
Chris@42 1036
Chris@42 1037 #ifdef FFTW_RANDOM_ESTIMATOR
Chris@42 1038 extern unsigned X(random_estimate_seed);
Chris@42 1039 #endif
Chris@42 1040
Chris@42 1041 double X(measure_execution_time)(const planner *plnr,
Chris@42 1042 plan *pln, const problem *p);
Chris@42 1043 IFFTW_EXTERN int X(ialignment_of)(R *p);
Chris@42 1044 unsigned X(hash)(const char *s);
Chris@42 1045 INT X(nbuf)(INT n, INT vl, INT maxnbuf);
Chris@42 1046 int X(nbuf_redundant)(INT n, INT vl, size_t which,
Chris@42 1047 const INT *maxnbuf, size_t nmaxnbuf);
Chris@42 1048 INT X(bufdist)(INT n, INT vl);
Chris@42 1049 int X(toobig)(INT n);
Chris@42 1050 int X(ct_uglyp)(INT min_n, INT v, INT n, INT r);
Chris@42 1051
Chris@42 1052 #if HAVE_SIMD
Chris@42 1053 R *X(taint)(R *p, INT s);
Chris@42 1054 R *X(join_taint)(R *p1, R *p2);
Chris@42 1055 #define TAINT(p, s) X(taint)(p, s)
Chris@42 1056 #define UNTAINT(p) ((R *) (((uintptr_t) (p)) & ~(uintptr_t)3)) /* pointer p with its two low bits cleared */
Chris@42 1057 #define TAINTOF(p) (((uintptr_t)(p)) & 3) /* the two low bits of pointer p */
Chris@42 1058 #define JOIN_TAINT(p1, p2) X(join_taint)(p1, p2)
Chris@42 1059 #else /* without SIMD the macros pass pointers through unchanged and TAINTOF is always 0 */
Chris@42 1060 #define TAINT(p, s) (p)
Chris@42 1061 #define UNTAINT(p) (p)
Chris@42 1062 #define TAINTOF(p) 0
Chris@42 1063 #define JOIN_TAINT(p1, p2) (p1)
Chris@42 1064 #endif
Chris@42 1065
Chris@42 1066 #ifdef FFTW_DEBUG_ALIGNMENT
Chris@42 1067 # define ASSERT_ALIGNED_DOUBLE /* declares a local double and passes CK the condition that its address has the low three bits clear, i.e. is a multiple of 8 */ { \
Chris@42 1068 double __foo; \
Chris@42 1069 CK(!(((uintptr_t) &__foo) & 0x7)); \
Chris@42 1070 }
Chris@42 1071 #else
Chris@42 1072 # define ASSERT_ALIGNED_DOUBLE /* expands to nothing unless FFTW_DEBUG_ALIGNMENT is defined */
Chris@42 1073 #endif /* FFTW_DEBUG_ALIGNMENT */
Chris@42 1074
Chris@42 1075
Chris@42 1076
Chris@42 1077 /*-----------------------------------------------------------------------*/
Chris@42 1078 /* macros used in codelets to reduce source code size */
Chris@42 1079
Chris@42 1080 typedef R E; /* internal precision of codelets. */
Chris@42 1081
Chris@42 1082 #if defined(FFTW_LDOUBLE)
Chris@42 1083 # define K(x) ((E) x##L) /* paste an L suffix onto the constant x, then cast to E */
Chris@42 1084 #elif defined(FFTW_QUAD)
Chris@42 1085 # define K(x) ((E) x##Q) /* paste a Q suffix onto the constant x, then cast to E */
Chris@42 1086 #else
Chris@42 1087 # define K(x) ((E) x) /* no suffix: just cast x to E */
Chris@42 1088 #endif
Chris@42 1089 #define DK(name, value) const E name = K(value) /* declares a const E called name, initialized to K(value); the terminating semicolon comes from the use site */
Chris@42 1090
Chris@42 1091 /* FMA macros */
Chris@42 1092
Chris@42 1093 #if defined(__GNUC__) && (defined(__powerpc__) || defined(__ppc__) || defined(_POWER))
Chris@42 1094 /* The obvious expression a * b + c does not work. If both x = a * b
Chris@42 1095 + c and y = a * b - c appear in the source, gcc computes t = a * b,
Chris@42 1096 x = t + c, y = t - c, thus destroying the fma.
Chris@42 1097
Chris@42 1098 This peculiar coding seems to do the right thing on all of
Chris@42 1099 gcc-2.95, gcc-3.1, gcc-3.2, and gcc-3.3. It does the right thing
Chris@42 1100 on gcc-3.4 -fno-web (because the ``web'' pass splits the variable
Chris@42 1101 `x' for the single-assignment form).
Chris@42 1102
Chris@42 1103 However, gcc-4.0 is a formidable adversary which succeeds in
Chris@42 1104 pessimizing two fma's into one multiplication and two additions.
Chris@42 1105 It does it very early in the game---before the optimization passes
Chris@42 1106 even start. The only real workaround seems to use fake inline asm
Chris@42 1107 such as
Chris@42 1108
Chris@42 1109 asm ("# confuse gcc %0" : "=f"(a) : "0"(a));
Chris@42 1110 return a * b + c;
Chris@42 1111
Chris@42 1112 in each of the FMA, FMS, FNMA, and FNMS functions. However, this
Chris@42 1113 does not solve the problem either, because two equal asm statements
Chris@42 1114 count as a common subexpression! One must use *different* fake asm
Chris@42 1115 statements:
Chris@42 1116
Chris@42 1117 in FMA:
Chris@42 1118 asm ("# confuse gcc for fma %0" : "=f"(a) : "0"(a));
Chris@42 1119
Chris@42 1120 in FMS:
Chris@42 1121 asm ("# confuse gcc for fms %0" : "=f"(a) : "0"(a));
Chris@42 1122
Chris@42 1123 etc.
Chris@42 1124
Chris@42 1125 After these changes, gcc recalcitrantly generates the fma that was
Chris@42 1126 in the source to begin with. However, the extra asm() cruft
Chris@42 1127 confuses other passes of gcc, notably the instruction scheduler.
Chris@42 1128 (Of course, one could also generate the fma directly via inline
Chris@42 1129 asm, but this confuses the scheduler even more.)
Chris@42 1130
Chris@42 1131 Steven and I have submitted more than one bug report to the gcc
Chris@42 1132 mailing list over the past few years, to no effect. Thus, I give
Chris@42 1133 up. gcc-4.0 can go to hell. I'll wait at least until gcc-4.3 is
Chris@42 1134 out before touching this crap again.
Chris@42 1135 */
Chris@42 1136 static __inline__ E FMA(E a, E b, E c) /* returns a * b + c; the two-statement form is the deliberate "peculiar coding" described above -- do not fold it */
Chris@42 1137 {
Chris@42 1138 E x = a * b;
Chris@42 1139 x = x + c;
Chris@42 1140 return x;
Chris@42 1141 }
Chris@42 1142
Chris@42 1143 static __inline__ E FMS(E a, E b, E c) /* returns a * b - c */
Chris@42 1144 {
Chris@42 1145 E x = a * b;
Chris@42 1146 x = x - c;
Chris@42 1147 return x;
Chris@42 1148 }
Chris@42 1149
Chris@42 1150 static __inline__ E FNMA(E a, E b, E c) /* returns -(a * b + c) */
Chris@42 1151 {
Chris@42 1152 E x = a * b;
Chris@42 1153 x = - (x + c);
Chris@42 1154 return x;
Chris@42 1155 }
Chris@42 1156
Chris@42 1157 static __inline__ E FNMS(E a, E b, E c) /* returns -(a * b - c), i.e. c - a * b */
Chris@42 1158 {
Chris@42 1159 E x = a * b;
Chris@42 1160 x = - (x - c);
Chris@42 1161 return x;
Chris@42 1162 }
Chris@42 1163 #else /* every other compiler/target: plain expression macros with the same four meanings */
Chris@42 1164 #define FMA(a, b, c) (((a) * (b)) + (c))
Chris@42 1165 #define FMS(a, b, c) (((a) * (b)) - (c))
Chris@42 1166 #define FNMA(a, b, c) (- (((a) * (b)) + (c)))
Chris@42 1167 #define FNMS(a, b, c) ((c) - ((a) * (b)))
Chris@42 1168 #endif
Chris@42 1169
Chris@42 1170 #ifdef __cplusplus
Chris@42 1171 } /* extern "C" */
Chris@42 1172 #endif /* __cplusplus */
Chris@42 1173
Chris@42 1174 #endif /* __IFFTW_H__ */