annotate src/fftw-3.3.3/kernel/ifftw.h @ 83:ae30d91d2ffe

Replace these with versions built using an older toolset (so as to avoid ABI compatibilities when linking on Ubuntu 14.04 for packaging purposes)
author Chris Cannam
date Fri, 07 Feb 2020 11:51:13 +0000
parents 37bf6b4a2645
children
rev   line source
Chris@10 1 /*
Chris@10 2 * Copyright (c) 2003, 2007-11 Matteo Frigo
Chris@10 3 * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
Chris@10 4 *
Chris@10 5 * This program is free software; you can redistribute it and/or modify
Chris@10 6 * it under the terms of the GNU General Public License as published by
Chris@10 7 * the Free Software Foundation; either version 2 of the License, or
Chris@10 8 * (at your option) any later version.
Chris@10 9 *
Chris@10 10 * This program is distributed in the hope that it will be useful,
Chris@10 11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
Chris@10 12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
Chris@10 13 * GNU General Public License for more details.
Chris@10 14 *
Chris@10 15 * You should have received a copy of the GNU General Public License
Chris@10 16 * along with this program; if not, write to the Free Software
Chris@10 17 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
Chris@10 18 *
Chris@10 19 */
Chris@10 20
Chris@10 21
Chris@10 22 /* FFTW internal header file */
Chris@10 23 #ifndef __IFFTW_H__
Chris@10 24 #define __IFFTW_H__
Chris@10 25
Chris@10 26 #include "config.h"
Chris@10 27
Chris@10 28 #include <stdlib.h> /* size_t */
Chris@10 29 #include <stdarg.h> /* va_list */
Chris@10 30 #include <stddef.h> /* ptrdiff_t */
Chris@10 31
Chris@10 32 #if HAVE_SYS_TYPES_H
Chris@10 33 # include <sys/types.h>
Chris@10 34 #endif
Chris@10 35
Chris@10 36 #if HAVE_STDINT_H
Chris@10 37 # include <stdint.h> /* uintptr_t, maybe */
Chris@10 38 #endif
Chris@10 39
Chris@10 40 #if HAVE_INTTYPES_H
Chris@10 41 # include <inttypes.h> /* uintptr_t, maybe */
Chris@10 42 #endif
Chris@10 43
Chris@10 44 #ifdef __cplusplus
Chris@10 45 extern "C"
Chris@10 46 {
Chris@10 47 #endif /* __cplusplus */
Chris@10 48
Chris@10 49 /* Windows annoyances -- since tests/hook.c uses some internal
Chris@10 50 FFTW functions, we need to given them the dllexport attribute
Chris@10 51 under Windows when compiling as a DLL (see api/fftw3.h). */
Chris@10 52 #if defined(FFTW_EXTERN)
Chris@10 53 # define IFFTW_EXTERN FFTW_EXTERN
Chris@10 54 #elif (defined(FFTW_DLL) || defined(DLL_EXPORT)) \
Chris@10 55 && (defined(_WIN32) || defined(__WIN32__))
Chris@10 56 # define IFFTW_EXTERN extern __declspec(dllexport)
Chris@10 57 #else
Chris@10 58 # define IFFTW_EXTERN extern
Chris@10 59 #endif
Chris@10 60
Chris@10 61 /* determine precision and name-mangling scheme */
Chris@10 62 #define CONCAT(prefix, name) prefix ## name
Chris@10 63 #if defined(FFTW_SINGLE)
Chris@10 64 typedef float R;
Chris@10 65 # define X(name) CONCAT(fftwf_, name)
Chris@10 66 #elif defined(FFTW_LDOUBLE)
Chris@10 67 typedef long double R;
Chris@10 68 # define X(name) CONCAT(fftwl_, name)
Chris@10 69 # define TRIGREAL_IS_LONG_DOUBLE
Chris@10 70 #elif defined(FFTW_QUAD)
Chris@10 71 typedef __float128 R;
Chris@10 72 # define X(name) CONCAT(fftwq_, name)
Chris@10 73 # define TRIGREAL_IS_QUAD
Chris@10 74 #else
Chris@10 75 typedef double R;
Chris@10 76 # define X(name) CONCAT(fftw_, name)
Chris@10 77 #endif
Chris@10 78
Chris@10 79 /*
Chris@10 80 integral type large enough to contain a stride (what ``int'' should
Chris@10 81 have been in the first place.
Chris@10 82 */
Chris@10 83 typedef ptrdiff_t INT;
Chris@10 84
Chris@10 85 /* dummy use of unused parameters to silence compiler warnings */
Chris@10 86 #define UNUSED(x) (void)x
Chris@10 87
Chris@10 88 #define NELEM(array) ((int) (sizeof(array) / sizeof((array)[0])))
Chris@10 89
Chris@10 90 #define FFT_SIGN (-1) /* sign convention for forward transforms */
Chris@10 91 extern void X(extract_reim)(int sign, R *c, R **r, R **i);
Chris@10 92
Chris@10 93 #define REGISTER_SOLVER(p, s) X(solver_register)(p, s)
Chris@10 94
Chris@10 95 #define STRINGIZEx(x) #x
Chris@10 96 #define STRINGIZE(x) STRINGIZEx(x)
Chris@10 97 #define CIMPLIES(ante, post) (!(ante) || (post))
Chris@10 98
Chris@10 99 /* define HAVE_SIMD if any simd extensions are supported */
Chris@10 100 #if defined(HAVE_SSE) || defined(HAVE_SSE2) || defined(HAVE_ALTIVEC) || \
Chris@10 101 defined(HAVE_MIPS_PS) || defined(HAVE_AVX)
Chris@10 102 #define HAVE_SIMD 1
Chris@10 103 #else
Chris@10 104 #define HAVE_SIMD 0
Chris@10 105 #endif
Chris@10 106
Chris@10 107 extern int X(have_simd_sse2)(void);
Chris@10 108 extern int X(have_simd_avx)(void);
Chris@10 109 extern int X(have_simd_altivec)(void);
Chris@10 110 extern int X(have_simd_neon)(void);
Chris@10 111
Chris@10 112 /* forward declarations */
Chris@10 113 typedef struct problem_s problem;
Chris@10 114 typedef struct plan_s plan;
Chris@10 115 typedef struct solver_s solver;
Chris@10 116 typedef struct planner_s planner;
Chris@10 117 typedef struct printer_s printer;
Chris@10 118 typedef struct scanner_s scanner;
Chris@10 119
Chris@10 120 /*-----------------------------------------------------------------------*/
Chris@10 121 /* alloca: */
Chris@10 122 #if HAVE_SIMD
Chris@10 123 # ifdef HAVE_AVX
Chris@10 124 # define MIN_ALIGNMENT 32 /* best alignment for AVX, conservative for
Chris@10 125 * everything else */
Chris@10 126 # else
Chris@10 127 /* Note that we cannot use 32-byte alignment for all SIMD. For
Chris@10 128 example, MacOS X malloc is 16-byte aligned, but there was no
Chris@10 129 posix_memalign in MacOS X until version 10.6. */
Chris@10 130 # define MIN_ALIGNMENT 16
Chris@10 131 # endif
Chris@10 132 #endif
Chris@10 133
Chris@10 134 #if defined(HAVE_ALLOCA) && defined(FFTW_ENABLE_ALLOCA)
Chris@10 135 /* use alloca if available */
Chris@10 136
Chris@10 137 #ifndef alloca
Chris@10 138 #ifdef __GNUC__
Chris@10 139 # define alloca __builtin_alloca
Chris@10 140 #else
Chris@10 141 # ifdef _MSC_VER
Chris@10 142 # include <malloc.h>
Chris@10 143 # define alloca _alloca
Chris@10 144 # else
Chris@10 145 # if HAVE_ALLOCA_H
Chris@10 146 # include <alloca.h>
Chris@10 147 # else
Chris@10 148 # ifdef _AIX
Chris@10 149 #pragma alloca
Chris@10 150 # else
Chris@10 151 # ifndef alloca /* predefined by HP cc +Olibcalls */
Chris@10 152 void *alloca(size_t);
Chris@10 153 # endif
Chris@10 154 # endif
Chris@10 155 # endif
Chris@10 156 # endif
Chris@10 157 #endif
Chris@10 158 #endif
Chris@10 159
Chris@10 160 # ifdef MIN_ALIGNMENT
Chris@10 161 # define STACK_MALLOC(T, p, n) \
Chris@10 162 { \
Chris@10 163 p = (T)alloca((n) + MIN_ALIGNMENT); \
Chris@10 164 p = (T)(((uintptr_t)p + (MIN_ALIGNMENT - 1)) & \
Chris@10 165 (~(uintptr_t)(MIN_ALIGNMENT - 1))); \
Chris@10 166 }
Chris@10 167 # define STACK_FREE(n)
Chris@10 168 # else /* HAVE_ALLOCA && !defined(MIN_ALIGNMENT) */
Chris@10 169 # define STACK_MALLOC(T, p, n) p = (T)alloca(n)
Chris@10 170 # define STACK_FREE(n)
Chris@10 171 # endif
Chris@10 172
Chris@10 173 #else /* ! HAVE_ALLOCA */
Chris@10 174 /* use malloc instead of alloca */
Chris@10 175 # define STACK_MALLOC(T, p, n) p = (T)MALLOC(n, OTHER)
Chris@10 176 # define STACK_FREE(n) X(ifree)(n)
Chris@10 177 #endif /* ! HAVE_ALLOCA */
Chris@10 178
Chris@10 179 /* allocation of buffers. If these grow too large use malloc(), else
Chris@10 180 use STACK_MALLOC (hopefully reducing to alloca()). */
Chris@10 181
Chris@10 182 /* 64KiB ought to be enough for anybody */
Chris@10 183 #define MAX_STACK_ALLOC ((size_t)64 * 1024)
Chris@10 184
Chris@10 185 #define BUF_ALLOC(T, p, n) \
Chris@10 186 { \
Chris@10 187 if (n < MAX_STACK_ALLOC) { \
Chris@10 188 STACK_MALLOC(T, p, n); \
Chris@10 189 } else { \
Chris@10 190 p = (T)MALLOC(n, BUFFERS); \
Chris@10 191 } \
Chris@10 192 }
Chris@10 193
Chris@10 194 #define BUF_FREE(p, n) \
Chris@10 195 { \
Chris@10 196 if (n < MAX_STACK_ALLOC) { \
Chris@10 197 STACK_FREE(p); \
Chris@10 198 } else { \
Chris@10 199 X(ifree)(p); \
Chris@10 200 } \
Chris@10 201 }
Chris@10 202
Chris@10 203 /*-----------------------------------------------------------------------*/
Chris@10 204 /* define uintptr_t if it is not already defined */
Chris@10 205
Chris@10 206 #ifndef HAVE_UINTPTR_T
Chris@10 207 # if SIZEOF_VOID_P == 0
Chris@10 208 # error sizeof void* is unknown!
Chris@10 209 # elif SIZEOF_UNSIGNED_INT == SIZEOF_VOID_P
Chris@10 210 typedef unsigned int uintptr_t;
Chris@10 211 # elif SIZEOF_UNSIGNED_LONG == SIZEOF_VOID_P
Chris@10 212 typedef unsigned long uintptr_t;
Chris@10 213 # elif SIZEOF_UNSIGNED_LONG_LONG == SIZEOF_VOID_P
Chris@10 214 typedef unsigned long long uintptr_t;
Chris@10 215 # else
Chris@10 216 # error no unsigned integer type matches void* sizeof!
Chris@10 217 # endif
Chris@10 218 #endif
Chris@10 219
Chris@10 220 /*-----------------------------------------------------------------------*/
Chris@10 221 /* We can do an optimization for copying pairs of (aligned) floats
Chris@10 222 when in single precision if 2*float = double. */
Chris@10 223
Chris@10 224 #define FFTW_2R_IS_DOUBLE (defined(FFTW_SINGLE) \
Chris@10 225 && SIZEOF_FLOAT != 0 \
Chris@10 226 && SIZEOF_DOUBLE == 2*SIZEOF_FLOAT)
Chris@10 227
Chris@10 228 #define DOUBLE_ALIGNED(p) ((((uintptr_t)(p)) % sizeof(double)) == 0)
Chris@10 229
Chris@10 230 /*-----------------------------------------------------------------------*/
Chris@10 231 /* assert.c: */
Chris@10 232 IFFTW_EXTERN void X(assertion_failed)(const char *s,
Chris@10 233 int line, const char *file);
Chris@10 234
Chris@10 235 /* always check */
Chris@10 236 #define CK(ex) \
Chris@10 237 (void)((ex) || (X(assertion_failed)(#ex, __LINE__, __FILE__), 0))
Chris@10 238
Chris@10 239 #ifdef FFTW_DEBUG
Chris@10 240 /* check only if debug enabled */
Chris@10 241 #define A(ex) \
Chris@10 242 (void)((ex) || (X(assertion_failed)(#ex, __LINE__, __FILE__), 0))
Chris@10 243 #else
Chris@10 244 #define A(ex) /* nothing */
Chris@10 245 #endif
Chris@10 246
Chris@10 247 extern void X(debug)(const char *format, ...);
Chris@10 248 #define D X(debug)
Chris@10 249
Chris@10 250 /*-----------------------------------------------------------------------*/
Chris@10 251 /* kalloc.c: */
Chris@10 252 extern void *X(kernel_malloc)(size_t n);
Chris@10 253 extern void X(kernel_free)(void *p);
Chris@10 254
Chris@10 255 /*-----------------------------------------------------------------------*/
Chris@10 256 /* alloc.c: */
Chris@10 257
Chris@10 258 /* objects allocated by malloc, for statistical purposes */
Chris@10 259 enum malloc_tag {
Chris@10 260 EVERYTHING,
Chris@10 261 PLANS,
Chris@10 262 SOLVERS,
Chris@10 263 PROBLEMS,
Chris@10 264 BUFFERS,
Chris@10 265 HASHT,
Chris@10 266 TENSORS,
Chris@10 267 PLANNERS,
Chris@10 268 SLVDESCS,
Chris@10 269 TWIDDLES,
Chris@10 270 STRIDES,
Chris@10 271 OTHER,
Chris@10 272 MALLOC_WHAT_LAST /* must be last */
Chris@10 273 };
Chris@10 274
Chris@10 275 IFFTW_EXTERN void X(ifree)(void *ptr);
Chris@10 276 extern void X(ifree0)(void *ptr);
Chris@10 277
Chris@10 278 #ifdef FFTW_DEBUG_MALLOC
Chris@10 279
Chris@10 280 IFFTW_EXTERN void *X(malloc_debug)(size_t n, enum malloc_tag what,
Chris@10 281 const char *file, int line);
Chris@10 282 #define MALLOC(n, what) X(malloc_debug)(n, what, __FILE__, __LINE__)
Chris@10 283 IFFTW_EXTERN void X(malloc_print_minfo)(int vrbose);
Chris@10 284
Chris@10 285 #else /* ! FFTW_DEBUG_MALLOC */
Chris@10 286
Chris@10 287 IFFTW_EXTERN void *X(malloc_plain)(size_t sz);
Chris@10 288 #define MALLOC(n, what) X(malloc_plain)(n)
Chris@10 289
Chris@10 290 #endif
Chris@10 291
Chris@10 292 #if defined(FFTW_DEBUG) && defined(FFTW_DEBUG_MALLOC) && (defined(HAVE_THREADS) || defined(HAVE_OPENMP))
Chris@10 293 extern int X(in_thread);
Chris@10 294 # define IN_THREAD X(in_thread)
Chris@10 295 # define THREAD_ON { int in_thread_save = X(in_thread); X(in_thread) = 1
Chris@10 296 # define THREAD_OFF X(in_thread) = in_thread_save; }
Chris@10 297 #else
Chris@10 298 # define IN_THREAD 0
Chris@10 299 # define THREAD_ON
Chris@10 300 # define THREAD_OFF
Chris@10 301 #endif
Chris@10 302
Chris@10 303 /*-----------------------------------------------------------------------*/
Chris@10 304 /* low-resolution clock */
Chris@10 305
Chris@10 306 #ifdef FAKE_CRUDE_TIME
Chris@10 307 typedef int crude_time;
Chris@10 308 #else
Chris@10 309 # if TIME_WITH_SYS_TIME
Chris@10 310 # include <sys/time.h>
Chris@10 311 # include <time.h>
Chris@10 312 # else
Chris@10 313 # if HAVE_SYS_TIME_H
Chris@10 314 # include <sys/time.h>
Chris@10 315 # else
Chris@10 316 # include <time.h>
Chris@10 317 # endif
Chris@10 318 # endif
Chris@10 319
Chris@10 320 # ifdef HAVE_BSDGETTIMEOFDAY
Chris@10 321 # ifndef HAVE_GETTIMEOFDAY
Chris@10 322 # define gettimeofday BSDgettimeofday
Chris@10 323 # define HAVE_GETTIMEOFDAY 1
Chris@10 324 # endif
Chris@10 325 # endif
Chris@10 326
Chris@10 327 # if defined(HAVE_GETTIMEOFDAY)
Chris@10 328 typedef struct timeval crude_time;
Chris@10 329 # else
Chris@10 330 typedef clock_t crude_time;
Chris@10 331 # endif
Chris@10 332 #endif /* else FAKE_CRUDE_TIME */
Chris@10 333
Chris@10 334 crude_time X(get_crude_time)(void);
Chris@10 335 double X(elapsed_since)(const planner *plnr, const problem *p,
Chris@10 336 crude_time t0); /* time in seconds since t0 */
Chris@10 337
Chris@10 338 /*-----------------------------------------------------------------------*/
Chris@10 339 /* ops.c: */
Chris@10 340 /*
Chris@10 341 * ops counter. The total number of additions is add + fma
Chris@10 342 * and the total number of multiplications is mul + fma.
Chris@10 343 * Total flops = add + mul + 2 * fma
Chris@10 344 */
Chris@10 345 typedef struct {
Chris@10 346 double add;
Chris@10 347 double mul;
Chris@10 348 double fma;
Chris@10 349 double other;
Chris@10 350 } opcnt;
Chris@10 351
Chris@10 352 void X(ops_zero)(opcnt *dst);
Chris@10 353 void X(ops_other)(INT o, opcnt *dst);
Chris@10 354 void X(ops_cpy)(const opcnt *src, opcnt *dst);
Chris@10 355
Chris@10 356 void X(ops_add)(const opcnt *a, const opcnt *b, opcnt *dst);
Chris@10 357 void X(ops_add2)(const opcnt *a, opcnt *dst);
Chris@10 358
Chris@10 359 /* dst = m * a + b */
Chris@10 360 void X(ops_madd)(INT m, const opcnt *a, const opcnt *b, opcnt *dst);
Chris@10 361
Chris@10 362 /* dst += m * a */
Chris@10 363 void X(ops_madd2)(INT m, const opcnt *a, opcnt *dst);
Chris@10 364
Chris@10 365
Chris@10 366 /*-----------------------------------------------------------------------*/
Chris@10 367 /* minmax.c: */
Chris@10 368 INT X(imax)(INT a, INT b);
Chris@10 369 INT X(imin)(INT a, INT b);
Chris@10 370
Chris@10 371 /*-----------------------------------------------------------------------*/
Chris@10 372 /* iabs.c: */
Chris@10 373 INT X(iabs)(INT a);
Chris@10 374
Chris@10 375 /* inline version */
Chris@10 376 #define IABS(x) (((x) < 0) ? (0 - (x)) : (x))
Chris@10 377
Chris@10 378 /*-----------------------------------------------------------------------*/
Chris@10 379 /* md5.c */
Chris@10 380
Chris@10 381 #if SIZEOF_UNSIGNED_INT >= 4
Chris@10 382 typedef unsigned int md5uint;
Chris@10 383 #else
Chris@10 384 typedef unsigned long md5uint; /* at least 32 bits as per C standard */
Chris@10 385 #endif
Chris@10 386
Chris@10 387 typedef md5uint md5sig[4];
Chris@10 388
Chris@10 389 typedef struct {
Chris@10 390 md5sig s; /* state and signature */
Chris@10 391
Chris@10 392 /* fields not meant to be used outside md5.c: */
Chris@10 393 unsigned char c[64]; /* stuff not yet processed */
Chris@10 394 unsigned l; /* total length. Should be 64 bits long, but this is
Chris@10 395 good enough for us */
Chris@10 396 } md5;
Chris@10 397
Chris@10 398 void X(md5begin)(md5 *p);
Chris@10 399 void X(md5putb)(md5 *p, const void *d_, size_t len);
Chris@10 400 void X(md5puts)(md5 *p, const char *s);
Chris@10 401 void X(md5putc)(md5 *p, unsigned char c);
Chris@10 402 void X(md5int)(md5 *p, int i);
Chris@10 403 void X(md5INT)(md5 *p, INT i);
Chris@10 404 void X(md5unsigned)(md5 *p, unsigned i);
Chris@10 405 void X(md5end)(md5 *p);
Chris@10 406
Chris@10 407 /*-----------------------------------------------------------------------*/
Chris@10 408 /* tensor.c: */
Chris@10 409 #define STRUCT_HACK_KR
Chris@10 410 #undef STRUCT_HACK_C99
Chris@10 411
Chris@10 412 typedef struct {
Chris@10 413 INT n;
Chris@10 414 INT is; /* input stride */
Chris@10 415 INT os; /* output stride */
Chris@10 416 } iodim;
Chris@10 417
Chris@10 418 typedef struct {
Chris@10 419 int rnk;
Chris@10 420 #if defined(STRUCT_HACK_KR)
Chris@10 421 iodim dims[1];
Chris@10 422 #elif defined(STRUCT_HACK_C99)
Chris@10 423 iodim dims[];
Chris@10 424 #else
Chris@10 425 iodim *dims;
Chris@10 426 #endif
Chris@10 427 } tensor;
Chris@10 428
Chris@10 429 /*
Chris@10 430 Definition of rank -infinity.
Chris@10 431 This definition has the property that if you want rank 0 or 1,
Chris@10 432 you can simply test for rank <= 1. This is a common case.
Chris@10 433
Chris@10 434 A tensor of rank -infinity has size 0.
Chris@10 435 */
Chris@10 436 #define RNK_MINFTY ((int)(((unsigned) -1) >> 1))
Chris@10 437 #define FINITE_RNK(rnk) ((rnk) != RNK_MINFTY)
Chris@10 438
Chris@10 439 typedef enum { INPLACE_IS, INPLACE_OS } inplace_kind;
Chris@10 440
Chris@10 441 tensor *X(mktensor)(int rnk);
Chris@10 442 tensor *X(mktensor_0d)(void);
Chris@10 443 tensor *X(mktensor_1d)(INT n, INT is, INT os);
Chris@10 444 tensor *X(mktensor_2d)(INT n0, INT is0, INT os0,
Chris@10 445 INT n1, INT is1, INT os1);
Chris@10 446 tensor *X(mktensor_3d)(INT n0, INT is0, INT os0,
Chris@10 447 INT n1, INT is1, INT os1,
Chris@10 448 INT n2, INT is2, INT os2);
Chris@10 449 tensor *X(mktensor_4d)(INT n0, INT is0, INT os0,
Chris@10 450 INT n1, INT is1, INT os1,
Chris@10 451 INT n2, INT is2, INT os2,
Chris@10 452 INT n3, INT is3, INT os3);
Chris@10 453 tensor *X(mktensor_5d)(INT n0, INT is0, INT os0,
Chris@10 454 INT n1, INT is1, INT os1,
Chris@10 455 INT n2, INT is2, INT os2,
Chris@10 456 INT n3, INT is3, INT os3,
Chris@10 457 INT n4, INT is4, INT os4);
Chris@10 458 INT X(tensor_sz)(const tensor *sz);
Chris@10 459 void X(tensor_md5)(md5 *p, const tensor *t);
Chris@10 460 INT X(tensor_max_index)(const tensor *sz);
Chris@10 461 INT X(tensor_min_istride)(const tensor *sz);
Chris@10 462 INT X(tensor_min_ostride)(const tensor *sz);
Chris@10 463 INT X(tensor_min_stride)(const tensor *sz);
Chris@10 464 int X(tensor_inplace_strides)(const tensor *sz);
Chris@10 465 int X(tensor_inplace_strides2)(const tensor *a, const tensor *b);
Chris@10 466 int X(tensor_strides_decrease)(const tensor *sz, const tensor *vecsz,
Chris@10 467 inplace_kind k);
Chris@10 468 tensor *X(tensor_copy)(const tensor *sz);
Chris@10 469 int X(tensor_kosherp)(const tensor *x);
Chris@10 470
Chris@10 471 tensor *X(tensor_copy_inplace)(const tensor *sz, inplace_kind k);
Chris@10 472 tensor *X(tensor_copy_except)(const tensor *sz, int except_dim);
Chris@10 473 tensor *X(tensor_copy_sub)(const tensor *sz, int start_dim, int rnk);
Chris@10 474 tensor *X(tensor_compress)(const tensor *sz);
Chris@10 475 tensor *X(tensor_compress_contiguous)(const tensor *sz);
Chris@10 476 tensor *X(tensor_append)(const tensor *a, const tensor *b);
Chris@10 477 void X(tensor_split)(const tensor *sz, tensor **a, int a_rnk, tensor **b);
Chris@10 478 int X(tensor_tornk1)(const tensor *t, INT *n, INT *is, INT *os);
Chris@10 479 void X(tensor_destroy)(tensor *sz);
Chris@10 480 void X(tensor_destroy2)(tensor *a, tensor *b);
Chris@10 481 void X(tensor_destroy4)(tensor *a, tensor *b, tensor *c, tensor *d);
Chris@10 482 void X(tensor_print)(const tensor *sz, printer *p);
Chris@10 483 int X(dimcmp)(const iodim *a, const iodim *b);
Chris@10 484 int X(tensor_equal)(const tensor *a, const tensor *b);
Chris@10 485 int X(tensor_inplace_locations)(const tensor *sz, const tensor *vecsz);
Chris@10 486
Chris@10 487 /*-----------------------------------------------------------------------*/
Chris@10 488 /* problem.c: */
Chris@10 489 enum {
Chris@10 490 /* a problem that cannot be solved */
Chris@10 491 PROBLEM_UNSOLVABLE,
Chris@10 492
Chris@10 493 PROBLEM_DFT,
Chris@10 494 PROBLEM_RDFT,
Chris@10 495 PROBLEM_RDFT2,
Chris@10 496
Chris@10 497 /* for mpi/ subdirectory */
Chris@10 498 PROBLEM_MPI_DFT,
Chris@10 499 PROBLEM_MPI_RDFT,
Chris@10 500 PROBLEM_MPI_RDFT2,
Chris@10 501 PROBLEM_MPI_TRANSPOSE,
Chris@10 502
Chris@10 503 PROBLEM_LAST
Chris@10 504 };
Chris@10 505
Chris@10 506 typedef struct {
Chris@10 507 int problem_kind;
Chris@10 508 void (*hash) (const problem *ego, md5 *p);
Chris@10 509 void (*zero) (const problem *ego);
Chris@10 510 void (*print) (const problem *ego, printer *p);
Chris@10 511 void (*destroy) (problem *ego);
Chris@10 512 } problem_adt;
Chris@10 513
Chris@10 514 struct problem_s {
Chris@10 515 const problem_adt *adt;
Chris@10 516 };
Chris@10 517
Chris@10 518 problem *X(mkproblem)(size_t sz, const problem_adt *adt);
Chris@10 519 void X(problem_destroy)(problem *ego);
Chris@10 520 problem *X(mkproblem_unsolvable)(void);
Chris@10 521
Chris@10 522 /*-----------------------------------------------------------------------*/
Chris@10 523 /* print.c */
Chris@10 524 struct printer_s {
Chris@10 525 void (*print)(printer *p, const char *format, ...);
Chris@10 526 void (*vprint)(printer *p, const char *format, va_list ap);
Chris@10 527 void (*putchr)(printer *p, char c);
Chris@10 528 void (*cleanup)(printer *p);
Chris@10 529 int indent;
Chris@10 530 int indent_incr;
Chris@10 531 };
Chris@10 532
Chris@10 533 printer *X(mkprinter)(size_t size,
Chris@10 534 void (*putchr)(printer *p, char c),
Chris@10 535 void (*cleanup)(printer *p));
Chris@10 536 IFFTW_EXTERN void X(printer_destroy)(printer *p);
Chris@10 537
Chris@10 538 /*-----------------------------------------------------------------------*/
Chris@10 539 /* scan.c */
Chris@10 540 struct scanner_s {
Chris@10 541 int (*scan)(scanner *sc, const char *format, ...);
Chris@10 542 int (*vscan)(scanner *sc, const char *format, va_list ap);
Chris@10 543 int (*getchr)(scanner *sc);
Chris@10 544 int ungotc;
Chris@10 545 };
Chris@10 546
Chris@10 547 scanner *X(mkscanner)(size_t size, int (*getchr)(scanner *sc));
Chris@10 548 void X(scanner_destroy)(scanner *sc);
Chris@10 549
Chris@10 550 /*-----------------------------------------------------------------------*/
Chris@10 551 /* plan.c: */
Chris@10 552
Chris@10 553 enum wakefulness {
Chris@10 554 SLEEPY,
Chris@10 555 AWAKE_ZERO,
Chris@10 556 AWAKE_SQRTN_TABLE,
Chris@10 557 AWAKE_SINCOS
Chris@10 558 };
Chris@10 559
Chris@10 560 typedef struct {
Chris@10 561 void (*solve)(const plan *ego, const problem *p);
Chris@10 562 void (*awake)(plan *ego, enum wakefulness wakefulness);
Chris@10 563 void (*print)(const plan *ego, printer *p);
Chris@10 564 void (*destroy)(plan *ego);
Chris@10 565 } plan_adt;
Chris@10 566
Chris@10 567 struct plan_s {
Chris@10 568 const plan_adt *adt;
Chris@10 569 opcnt ops;
Chris@10 570 double pcost;
Chris@10 571 enum wakefulness wakefulness; /* used for debugging only */
Chris@10 572 int could_prune_now_p;
Chris@10 573 };
Chris@10 574
Chris@10 575 plan *X(mkplan)(size_t size, const plan_adt *adt);
Chris@10 576 void X(plan_destroy_internal)(plan *ego);
Chris@10 577 IFFTW_EXTERN void X(plan_awake)(plan *ego, enum wakefulness wakefulness);
Chris@10 578 void X(plan_null_destroy)(plan *ego);
Chris@10 579
Chris@10 580 /*-----------------------------------------------------------------------*/
Chris@10 581 /* solver.c: */
Chris@10 582 typedef struct {
Chris@10 583 int problem_kind;
Chris@10 584 plan *(*mkplan)(const solver *ego, const problem *p, planner *plnr);
Chris@10 585 void (*destroy)(solver *ego);
Chris@10 586 } solver_adt;
Chris@10 587
Chris@10 588 struct solver_s {
Chris@10 589 const solver_adt *adt;
Chris@10 590 int refcnt;
Chris@10 591 };
Chris@10 592
Chris@10 593 solver *X(mksolver)(size_t size, const solver_adt *adt);
Chris@10 594 void X(solver_use)(solver *ego);
Chris@10 595 void X(solver_destroy)(solver *ego);
Chris@10 596 void X(solver_register)(planner *plnr, solver *s);
Chris@10 597
Chris@10 598 /* shorthand */
Chris@10 599 #define MKSOLVER(type, adt) (type *)X(mksolver)(sizeof(type), adt)
Chris@10 600
Chris@10 601 /*-----------------------------------------------------------------------*/
Chris@10 602 /* planner.c */
Chris@10 603
Chris@10 604 typedef struct slvdesc_s {
Chris@10 605 solver *slv;
Chris@10 606 const char *reg_nam;
Chris@10 607 unsigned nam_hash;
Chris@10 608 int reg_id;
Chris@10 609 int next_for_same_problem_kind;
Chris@10 610 } slvdesc;
Chris@10 611
Chris@10 612 typedef struct solution_s solution; /* opaque */
Chris@10 613
Chris@10 614 /* interpretation of L and U:
Chris@10 615
Chris@10 616 - if it returns a plan, the planner guarantees that all applicable
Chris@10 617 plans at least as impatient as U have been tried, and that each
Chris@10 618 plan in the solution is at least as impatient as L.
Chris@10 619
Chris@10 620 - if it returns 0, the planner guarantees to have tried all solvers
Chris@10 621 at least as impatient as L, and that none of them was applicable.
Chris@10 622
Chris@10 623 The structure is packed to fit into 64 bits.
Chris@10 624 */
Chris@10 625
Chris@10 626 typedef struct {
Chris@10 627 unsigned l:20;
Chris@10 628 unsigned hash_info:3;
Chris@10 629 # define BITS_FOR_TIMELIMIT 9
Chris@10 630 unsigned timelimit_impatience:BITS_FOR_TIMELIMIT;
Chris@10 631 unsigned u:20;
Chris@10 632
Chris@10 633 /* abstraction break: we store the solver here to pad the
Chris@10 634 structure to 64 bits. Otherwise, the struct is padded to 64
Chris@10 635 bits anyway, and another word is allocated for slvndx. */
Chris@10 636 # define BITS_FOR_SLVNDX 12
Chris@10 637 unsigned slvndx:BITS_FOR_SLVNDX;
Chris@10 638 } flags_t;
Chris@10 639
Chris@10 640 /* impatience flags */
Chris@10 641 enum {
Chris@10 642 BELIEVE_PCOST = 0x0001,
Chris@10 643 ESTIMATE = 0x0002,
Chris@10 644 NO_DFT_R2HC = 0x0004,
Chris@10 645 NO_SLOW = 0x0008,
Chris@10 646 NO_VRECURSE = 0x0010,
Chris@10 647 NO_INDIRECT_OP = 0x0020,
Chris@10 648 NO_LARGE_GENERIC = 0x0040,
Chris@10 649 NO_RANK_SPLITS = 0x0080,
Chris@10 650 NO_VRANK_SPLITS = 0x0100,
Chris@10 651 NO_NONTHREADED = 0x0200,
Chris@10 652 NO_BUFFERING = 0x0400,
Chris@10 653 NO_FIXED_RADIX_LARGE_N = 0x0800,
Chris@10 654 NO_DESTROY_INPUT = 0x1000,
Chris@10 655 NO_SIMD = 0x2000,
Chris@10 656 CONSERVE_MEMORY = 0x4000,
Chris@10 657 NO_DHT_R2HC = 0x8000,
Chris@10 658 NO_UGLY = 0x10000,
Chris@10 659 ALLOW_PRUNING = 0x20000
Chris@10 660 };
Chris@10 661
Chris@10 662 /* hashtable information */
Chris@10 663 enum {
Chris@10 664 BLESSING = 0x1, /* save this entry */
Chris@10 665 H_VALID = 0x2, /* valid hastable entry */
Chris@10 666 H_LIVE = 0x4 /* entry is nonempty, implies H_VALID */
Chris@10 667 };
Chris@10 668
Chris@10 669 #define PLNR_L(plnr) ((plnr)->flags.l)
Chris@10 670 #define PLNR_U(plnr) ((plnr)->flags.u)
Chris@10 671 #define PLNR_TIMELIMIT_IMPATIENCE(plnr) ((plnr)->flags.timelimit_impatience)
Chris@10 672
Chris@10 673 #define ESTIMATEP(plnr) (PLNR_U(plnr) & ESTIMATE)
Chris@10 674 #define BELIEVE_PCOSTP(plnr) (PLNR_U(plnr) & BELIEVE_PCOST)
Chris@10 675 #define ALLOW_PRUNINGP(plnr) (PLNR_U(plnr) & ALLOW_PRUNING)
Chris@10 676
Chris@10 677 #define NO_INDIRECT_OP_P(plnr) (PLNR_L(plnr) & NO_INDIRECT_OP)
Chris@10 678 #define NO_LARGE_GENERICP(plnr) (PLNR_L(plnr) & NO_LARGE_GENERIC)
Chris@10 679 #define NO_RANK_SPLITSP(plnr) (PLNR_L(plnr) & NO_RANK_SPLITS)
Chris@10 680 #define NO_VRANK_SPLITSP(plnr) (PLNR_L(plnr) & NO_VRANK_SPLITS)
Chris@10 681 #define NO_VRECURSEP(plnr) (PLNR_L(plnr) & NO_VRECURSE)
Chris@10 682 #define NO_DFT_R2HCP(plnr) (PLNR_L(plnr) & NO_DFT_R2HC)
Chris@10 683 #define NO_SLOWP(plnr) (PLNR_L(plnr) & NO_SLOW)
Chris@10 684 #define NO_UGLYP(plnr) (PLNR_L(plnr) & NO_UGLY)
Chris@10 685 #define NO_FIXED_RADIX_LARGE_NP(plnr) \
Chris@10 686 (PLNR_L(plnr) & NO_FIXED_RADIX_LARGE_N)
Chris@10 687 #define NO_NONTHREADEDP(plnr) \
Chris@10 688 ((PLNR_L(plnr) & NO_NONTHREADED) && (plnr)->nthr > 1)
Chris@10 689
Chris@10 690 #define NO_DESTROY_INPUTP(plnr) (PLNR_L(plnr) & NO_DESTROY_INPUT)
Chris@10 691 #define NO_SIMDP(plnr) (PLNR_L(plnr) & NO_SIMD)
Chris@10 692 #define CONSERVE_MEMORYP(plnr) (PLNR_L(plnr) & CONSERVE_MEMORY)
Chris@10 693 #define NO_DHT_R2HCP(plnr) (PLNR_L(plnr) & NO_DHT_R2HC)
Chris@10 694 #define NO_BUFFERINGP(plnr) (PLNR_L(plnr) & NO_BUFFERING)
Chris@10 695
Chris@10 696 typedef enum { FORGET_ACCURSED, FORGET_EVERYTHING } amnesia;
Chris@10 697
Chris@10 698 typedef enum {
Chris@10 699 /* WISDOM_NORMAL: planner may or may not use wisdom */
Chris@10 700 WISDOM_NORMAL,
Chris@10 701
Chris@10 702 /* WISDOM_ONLY: planner must use wisdom and must avoid searching */
Chris@10 703 WISDOM_ONLY,
Chris@10 704
Chris@10 705 /* WISDOM_IS_BOGUS: planner must return 0 as quickly as possible */
Chris@10 706 WISDOM_IS_BOGUS,
Chris@10 707
Chris@10 708 /* WISDOM_IGNORE_INFEASIBLE: planner ignores infeasible wisdom */
Chris@10 709 WISDOM_IGNORE_INFEASIBLE,
Chris@10 710
Chris@10 711 /* WISDOM_IGNORE_ALL: planner ignores all */
Chris@10 712 WISDOM_IGNORE_ALL
Chris@10 713 } wisdom_state_t;
Chris@10 714
Chris@10 715 typedef struct {
Chris@10 716 void (*register_solver)(planner *ego, solver *s);
Chris@10 717 plan *(*mkplan)(planner *ego, const problem *p);
Chris@10 718 void (*forget)(planner *ego, amnesia a);
Chris@10 719 void (*exprt)(planner *ego, printer *p); /* ``export'' is a reserved
Chris@10 720 word in C++. */
Chris@10 721 int (*imprt)(planner *ego, scanner *sc);
Chris@10 722 } planner_adt;
Chris@10 723
Chris@10 724 /* hash table of solutions */
Chris@10 725 typedef struct {
Chris@10 726 solution *solutions;
Chris@10 727 unsigned hashsiz, nelem;
Chris@10 728
Chris@10 729 /* statistics */
Chris@10 730 int lookup, succ_lookup, lookup_iter;
Chris@10 731 int insert, insert_iter, insert_unknown;
Chris@10 732 int nrehash;
Chris@10 733 } hashtab;
Chris@10 734
Chris@10 735 typedef enum { COST_SUM, COST_MAX } cost_kind;
Chris@10 736
Chris@10 737 struct planner_s {
Chris@10 738 const planner_adt *adt;
Chris@10 739 void (*hook)(struct planner_s *plnr, plan *pln,
Chris@10 740 const problem *p, int optimalp);
Chris@10 741 double (*cost_hook)(const problem *p, double t, cost_kind k);
Chris@10 742 int (*wisdom_ok_hook)(const problem *p, flags_t flags);
Chris@10 743 void (*nowisdom_hook)(const problem *p);
Chris@10 744 wisdom_state_t (*bogosity_hook)(wisdom_state_t state, const problem *p);
Chris@10 745
Chris@10 746 /* solver descriptors */
Chris@10 747 slvdesc *slvdescs;
Chris@10 748 unsigned nslvdesc, slvdescsiz;
Chris@10 749 const char *cur_reg_nam;
Chris@10 750 int cur_reg_id;
Chris@10 751 int slvdescs_for_problem_kind[PROBLEM_LAST];
Chris@10 752
Chris@10 753 wisdom_state_t wisdom_state;
Chris@10 754
Chris@10 755 hashtab htab_blessed;
Chris@10 756 hashtab htab_unblessed;
Chris@10 757
Chris@10 758 int nthr;
Chris@10 759 flags_t flags;
Chris@10 760
Chris@10 761 crude_time start_time;
Chris@10 762 double timelimit; /* elapsed_since(start_time) at which to bail out */
Chris@10 763 int timed_out; /* whether most recent search timed out */
Chris@10 764 int need_timeout_check;
Chris@10 765
Chris@10 766 /* various statistics */
Chris@10 767 int nplan; /* number of plans evaluated */
Chris@10 768 double pcost, epcost; /* total pcost of measured/estimated plans */
Chris@10 769 int nprob; /* number of problems evaluated */
Chris@10 770 };
Chris@10 771
Chris@10 772 planner *X(mkplanner)(void);
Chris@10 773 void X(planner_destroy)(planner *ego);
Chris@10 774
Chris@10 775 /*
Chris@10 776 Iterate over all solvers. Read:
Chris@10 777
Chris@10 778 @article{ baker93iterators,
Chris@10 779 author = "Henry G. Baker, Jr.",
Chris@10 780 title = "Iterators: Signs of Weakness in Object-Oriented Languages",
Chris@10 781 journal = "{ACM} {OOPS} Messenger",
Chris@10 782 volume = "4",
Chris@10 783 number = "3",
Chris@10 784 pages = "18--25"
Chris@10 785 }
Chris@10 786 */
Chris@10 787 #define FORALL_SOLVERS(ego, s, p, what) \
Chris@10 788 { \
Chris@10 789 unsigned _cnt; \
Chris@10 790 for (_cnt = 0; _cnt < ego->nslvdesc; ++_cnt) { \
Chris@10 791 slvdesc *p = ego->slvdescs + _cnt; \
Chris@10 792 solver *s = p->slv; \
Chris@10 793 what; \
Chris@10 794 } \
Chris@10 795 }
Chris@10 796
Chris@10 797 #define FORALL_SOLVERS_OF_KIND(kind, ego, s, p, what) \
Chris@10 798 { \
Chris@10 799 int _cnt = ego->slvdescs_for_problem_kind[kind]; \
Chris@10 800 while (_cnt >= 0) { \
Chris@10 801 slvdesc *p = ego->slvdescs + _cnt; \
Chris@10 802 solver *s = p->slv; \
Chris@10 803 what; \
Chris@10 804 _cnt = p->next_for_same_problem_kind; \
Chris@10 805 } \
Chris@10 806 }
Chris@10 807
Chris@10 808
Chris@10 809 /* make plan, destroy problem */
Chris@10 810 plan *X(mkplan_d)(planner *ego, problem *p);
Chris@10 811 plan *X(mkplan_f_d)(planner *ego, problem *p,
Chris@10 812 unsigned l_set, unsigned u_set, unsigned u_reset);
Chris@10 813
Chris@10 814 /*-----------------------------------------------------------------------*/
Chris@10 815 /* stride.c: */
Chris@10 816
Chris@10 817 /* If PRECOMPUTE_ARRAY_INDICES is defined, precompute all strides. */
Chris@10 818 #if (defined(__i386__) || defined(__x86_64__) || _M_IX86 >= 500) && !defined(FFTW_LDOUBLE)
Chris@10 819 #define PRECOMPUTE_ARRAY_INDICES
Chris@10 820 #endif
Chris@10 821
Chris@10 822 extern const INT X(an_INT_guaranteed_to_be_zero);
Chris@10 823
Chris@10 824 #ifdef PRECOMPUTE_ARRAY_INDICES
Chris@10 825 typedef INT *stride;
Chris@10 826 #define WS(stride, i) (stride[i])
Chris@10 827 extern stride X(mkstride)(INT n, INT s);
Chris@10 828 void X(stride_destroy)(stride p);
Chris@10 829 /* hackery to prevent the compiler from copying the strides array
Chris@10 830 onto the stack */
Chris@10 831 #define MAKE_VOLATILE_STRIDE(nptr, x) (x) = (x) + X(an_INT_guaranteed_to_be_zero)
Chris@10 832 #else
Chris@10 833
Chris@10 834 typedef INT stride;
Chris@10 835 #define WS(stride, i) (stride * i)
Chris@10 836 #define fftwf_mkstride(n, stride) stride
Chris@10 837 #define fftw_mkstride(n, stride) stride
Chris@10 838 #define fftwl_mkstride(n, stride) stride
Chris@10 839 #define fftwf_stride_destroy(p) ((void) p)
Chris@10 840 #define fftw_stride_destroy(p) ((void) p)
Chris@10 841 #define fftwl_stride_destroy(p) ((void) p)
Chris@10 842
Chris@10 843 /* hackery to prevent the compiler from ``optimizing'' induction
Chris@10 844 variables in codelet loops. The problem is that for each K and for
Chris@10 845 each expression of the form P[I + STRIDE * K] in a loop, most
Chris@10 846 compilers will try to lift an induction variable PK := &P[I + STRIDE * K].
Chris@10 847 For large values of K this behavior overflows the
Chris@10 848 register set, which is likely worse than doing the index computation
Chris@10 849 in the first place.
Chris@10 850
Chris@10 851 If we guess that there are more than
Chris@10 852 ESTIMATED_AVAILABLE_INDEX_REGISTERS such pointers, we deliberately confuse
Chris@10 853 the compiler by setting STRIDE ^= ZERO, where ZERO is a value guaranteed to
Chris@10 854 be 0, but the compiler does not know this.
Chris@10 855
Chris@10 856 16 registers ought to be enough for anybody, or so the amd64 and ARM ISA's
Chris@10 857 seem to imply.
Chris@10 858 */
Chris@10 859 #define ESTIMATED_AVAILABLE_INDEX_REGISTERS 16
Chris@10 860 #define MAKE_VOLATILE_STRIDE(nptr, x) \
Chris@10 861 (nptr <= ESTIMATED_AVAILABLE_INDEX_REGISTERS ? \
Chris@10 862 0 : \
Chris@10 863 ((x) = (x) ^ X(an_INT_guaranteed_to_be_zero)))
Chris@10 864 #endif /* PRECOMPUTE_ARRAY_INDICES */
Chris@10 865
Chris@10 866 /*-----------------------------------------------------------------------*/
Chris@10 867 /* solvtab.c */
Chris@10 868
Chris@10 869 struct solvtab_s { void (*reg)(planner *); const char *reg_nam; };
Chris@10 870 typedef struct solvtab_s solvtab[];
Chris@10 871 void X(solvtab_exec)(const solvtab tbl, planner *p);
Chris@10 872 #define SOLVTAB(s) { s, STRINGIZE(s) }
Chris@10 873 #define SOLVTAB_END { 0, 0 }
Chris@10 874
Chris@10 875 /*-----------------------------------------------------------------------*/
Chris@10 876 /* pickdim.c */
Chris@10 877 int X(pickdim)(int which_dim, const int *buddies, int nbuddies,
Chris@10 878 const tensor *sz, int oop, int *dp);
Chris@10 879
Chris@10 880 /*-----------------------------------------------------------------------*/
Chris@10 881 /* twiddle.c */
Chris@10 882 /* little language to express twiddle factors computation */
Chris@10 883 enum { TW_COS = 0, TW_SIN = 1, TW_CEXP = 2, TW_NEXT = 3,
Chris@10 884 TW_FULL = 4, TW_HALF = 5 };
Chris@10 885
Chris@10 886 typedef struct {
Chris@10 887 unsigned char op;
Chris@10 888 signed char v;
Chris@10 889 short i;
Chris@10 890 } tw_instr;
Chris@10 891
Chris@10 892 typedef struct twid_s {
Chris@10 893 R *W; /* array of twiddle factors */
Chris@10 894 INT n, r, m; /* transform order, radix, # twiddle rows */
Chris@10 895 int refcnt;
Chris@10 896 const tw_instr *instr;
Chris@10 897 struct twid_s *cdr;
Chris@10 898 enum wakefulness wakefulness;
Chris@10 899 } twid;
Chris@10 900
Chris@10 901 INT X(twiddle_length)(INT r, const tw_instr *p);
Chris@10 902 void X(twiddle_awake)(enum wakefulness wakefulness,
Chris@10 903 twid **pp, const tw_instr *instr, INT n, INT r, INT m);
Chris@10 904
Chris@10 905 /*-----------------------------------------------------------------------*/
Chris@10 906 /* trig.c */
Chris@10 907 #if defined(TRIGREAL_IS_LONG_DOUBLE)
Chris@10 908 typedef long double trigreal;
Chris@10 909 #elif defined(TRIGREAL_IS_QUAD)
Chris@10 910 typedef __float128 trigreal;
Chris@10 911 #else
Chris@10 912 typedef double trigreal;
Chris@10 913 #endif
Chris@10 914
Chris@10 915 typedef struct triggen_s triggen;
Chris@10 916
Chris@10 917 struct triggen_s {
Chris@10 918 void (*cexp)(triggen *t, INT m, R *result);
Chris@10 919 void (*cexpl)(triggen *t, INT m, trigreal *result);
Chris@10 920 void (*rotate)(triggen *p, INT m, R xr, R xi, R *res);
Chris@10 921
Chris@10 922 INT twshft;
Chris@10 923 INT twradix;
Chris@10 924 INT twmsk;
Chris@10 925 trigreal *W0, *W1;
Chris@10 926 INT n;
Chris@10 927 };
Chris@10 928
Chris@10 929 triggen *X(mktriggen)(enum wakefulness wakefulness, INT n);
Chris@10 930 void X(triggen_destroy)(triggen *p);
Chris@10 931
Chris@10 932 /*-----------------------------------------------------------------------*/
Chris@10 933 /* primes.c: */
Chris@10 934
Chris@10 935 #define MULMOD(x, y, p) \
Chris@10 936 (((x) <= 92681 - (y)) ? ((x) * (y)) % (p) : X(safe_mulmod)(x, y, p))
Chris@10 937
Chris@10 938 INT X(safe_mulmod)(INT x, INT y, INT p);
Chris@10 939 INT X(power_mod)(INT n, INT m, INT p);
Chris@10 940 INT X(find_generator)(INT p);
Chris@10 941 INT X(first_divisor)(INT n);
Chris@10 942 int X(is_prime)(INT n);
Chris@10 943 INT X(next_prime)(INT n);
Chris@10 944 int X(factors_into)(INT n, const INT *primes);
Chris@10 945 int X(factors_into_small_primes)(INT n);
Chris@10 946 INT X(choose_radix)(INT r, INT n);
Chris@10 947 INT X(isqrt)(INT n);
Chris@10 948 INT X(modulo)(INT a, INT n);
Chris@10 949
Chris@10 950 #define GENERIC_MIN_BAD 173 /* min prime for which generic becomes bad */
Chris@10 951
Chris@10 952 /* thresholds below which certain solvers are considered SLOW. These are guesses
Chris@10 953 believed to be conservative */
Chris@10 954 #define GENERIC_MAX_SLOW 16
Chris@10 955 #define RADER_MAX_SLOW 32
Chris@10 956 #define BLUESTEIN_MAX_SLOW 24
Chris@10 957
Chris@10 958 /*-----------------------------------------------------------------------*/
Chris@10 959 /* rader.c: */
Chris@10 960 typedef struct rader_tls rader_tl;
Chris@10 961
Chris@10 962 void X(rader_tl_insert)(INT k1, INT k2, INT k3, R *W, rader_tl **tl);
Chris@10 963 R *X(rader_tl_find)(INT k1, INT k2, INT k3, rader_tl *t);
Chris@10 964 void X(rader_tl_delete)(R *W, rader_tl **tl);
Chris@10 965
Chris@10 966 /*-----------------------------------------------------------------------*/
Chris@10 967 /* copy/transposition routines */
Chris@10 968
Chris@10 969 /* lower bound to the cache size, for tiled routines */
Chris@10 970 #define CACHESIZE 8192
Chris@10 971
Chris@10 972 INT X(compute_tilesz)(INT vl, int how_many_tiles_in_cache);
Chris@10 973
Chris@10 974 void X(tile2d)(INT n0l, INT n0u, INT n1l, INT n1u, INT tilesz,
Chris@10 975 void (*f)(INT n0l, INT n0u, INT n1l, INT n1u, void *args),
Chris@10 976 void *args);
Chris@10 977 void X(cpy1d)(R *I, R *O, INT n0, INT is0, INT os0, INT vl);
Chris@10 978 void X(cpy2d)(R *I, R *O,
Chris@10 979 INT n0, INT is0, INT os0,
Chris@10 980 INT n1, INT is1, INT os1,
Chris@10 981 INT vl);
Chris@10 982 void X(cpy2d_ci)(R *I, R *O,
Chris@10 983 INT n0, INT is0, INT os0,
Chris@10 984 INT n1, INT is1, INT os1,
Chris@10 985 INT vl);
Chris@10 986 void X(cpy2d_co)(R *I, R *O,
Chris@10 987 INT n0, INT is0, INT os0,
Chris@10 988 INT n1, INT is1, INT os1,
Chris@10 989 INT vl);
Chris@10 990 void X(cpy2d_tiled)(R *I, R *O,
Chris@10 991 INT n0, INT is0, INT os0,
Chris@10 992 INT n1, INT is1, INT os1,
Chris@10 993 INT vl);
Chris@10 994 void X(cpy2d_tiledbuf)(R *I, R *O,
Chris@10 995 INT n0, INT is0, INT os0,
Chris@10 996 INT n1, INT is1, INT os1,
Chris@10 997 INT vl);
Chris@10 998 void X(cpy2d_pair)(R *I0, R *I1, R *O0, R *O1,
Chris@10 999 INT n0, INT is0, INT os0,
Chris@10 1000 INT n1, INT is1, INT os1);
Chris@10 1001 void X(cpy2d_pair_ci)(R *I0, R *I1, R *O0, R *O1,
Chris@10 1002 INT n0, INT is0, INT os0,
Chris@10 1003 INT n1, INT is1, INT os1);
Chris@10 1004 void X(cpy2d_pair_co)(R *I0, R *I1, R *O0, R *O1,
Chris@10 1005 INT n0, INT is0, INT os0,
Chris@10 1006 INT n1, INT is1, INT os1);
Chris@10 1007
Chris@10 1008 void X(transpose)(R *I, INT n, INT s0, INT s1, INT vl);
Chris@10 1009 void X(transpose_tiled)(R *I, INT n, INT s0, INT s1, INT vl);
Chris@10 1010 void X(transpose_tiledbuf)(R *I, INT n, INT s0, INT s1, INT vl);
Chris@10 1011
Chris@10 1012 typedef void (*transpose_func)(R *I, INT n, INT s0, INT s1, INT vl);
Chris@10 1013 typedef void (*cpy2d_func)(R *I, R *O,
Chris@10 1014 INT n0, INT is0, INT os0,
Chris@10 1015 INT n1, INT is1, INT os1,
Chris@10 1016 INT vl);
Chris@10 1017
Chris@10 1018 /*-----------------------------------------------------------------------*/
Chris@10 1019 /* misc stuff */
Chris@10 1020 void X(null_awake)(plan *ego, enum wakefulness wakefulness);
Chris@10 1021 double X(iestimate_cost)(const planner *, const plan *, const problem *);
Chris@10 1022
Chris@10 1023 #ifdef FFTW_RANDOM_ESTIMATOR
Chris@10 1024 extern unsigned X(random_estimate_seed);
Chris@10 1025 #endif
Chris@10 1026
Chris@10 1027 double X(measure_execution_time)(const planner *plnr,
Chris@10 1028 plan *pln, const problem *p);
Chris@10 1029 int X(alignment_of)(R *p);
Chris@10 1030 unsigned X(hash)(const char *s);
Chris@10 1031 INT X(nbuf)(INT n, INT vl, INT maxnbuf);
Chris@10 1032 int X(nbuf_redundant)(INT n, INT vl, int which,
Chris@10 1033 const INT *maxnbuf, int nmaxnbuf);
Chris@10 1034 INT X(bufdist)(INT n, INT vl);
Chris@10 1035 int X(toobig)(INT n);
Chris@10 1036 int X(ct_uglyp)(INT min_n, INT v, INT n, INT r);
Chris@10 1037
Chris@10 1038 #if HAVE_SIMD
Chris@10 1039 R *X(taint)(R *p, INT s);
Chris@10 1040 R *X(join_taint)(R *p1, R *p2);
Chris@10 1041 #define TAINT(p, s) X(taint)(p, s)
Chris@10 1042 #define UNTAINT(p) ((R *) (((uintptr_t) (p)) & ~(uintptr_t)3))
Chris@10 1043 #define TAINTOF(p) (((uintptr_t)(p)) & 3)
Chris@10 1044 #define JOIN_TAINT(p1, p2) X(join_taint)(p1, p2)
Chris@10 1045 #else
Chris@10 1046 #define TAINT(p, s) (p)
Chris@10 1047 #define UNTAINT(p) (p)
Chris@10 1048 #define TAINTOF(p) 0
Chris@10 1049 #define JOIN_TAINT(p1, p2) p1
Chris@10 1050 #endif
Chris@10 1051
Chris@10 1052 #ifdef FFTW_DEBUG_ALIGNMENT
Chris@10 1053 # define ASSERT_ALIGNED_DOUBLE { \
Chris@10 1054 double __foo; \
Chris@10 1055 CK(!(((uintptr_t) &__foo) & 0x7)); \
Chris@10 1056 }
Chris@10 1057 #else
Chris@10 1058 # define ASSERT_ALIGNED_DOUBLE
Chris@10 1059 #endif /* FFTW_DEBUG_ALIGNMENT */
Chris@10 1060
Chris@10 1061
Chris@10 1062
Chris@10 1063 /*-----------------------------------------------------------------------*/
Chris@10 1064 /* macros used in codelets to reduce source code size */
Chris@10 1065
Chris@10 1066 typedef R E; /* internal precision of codelets. */
Chris@10 1067
Chris@10 1068 #if defined(FFTW_LDOUBLE)
Chris@10 1069 # define K(x) ((E) x##L)
Chris@10 1070 #elif defined(FFTW_QUAD)
Chris@10 1071 # define K(x) ((E) x##Q)
Chris@10 1072 #else
Chris@10 1073 # define K(x) ((E) x)
Chris@10 1074 #endif
Chris@10 1075 #define DK(name, value) const E name = K(value)
Chris@10 1076
Chris@10 1077 /* FMA macros */
Chris@10 1078
Chris@10 1079 #if defined(__GNUC__) && (defined(__powerpc__) || defined(__ppc__) || defined(_POWER))
Chris@10 1080 /* The obvious expression a * b + c does not work. If both x = a * b
Chris@10 1081 + c and y = a * b - c appear in the source, gcc computes t = a * b,
Chris@10 1082 x = t + c, y = t - c, thus destroying the fma.
Chris@10 1083
Chris@10 1084 This peculiar coding seems to do the right thing on all of
Chris@10 1085 gcc-2.95, gcc-3.1, gcc-3.2, and gcc-3.3. It does the right thing
Chris@10 1086 on gcc-3.4 -fno-web (because the ``web'' pass splits the variable
Chris@10 1087 `x' for the single-assignment form).
Chris@10 1088
Chris@10 1089 However, gcc-4.0 is a formidable adversary which succeeds in
Chris@10 1090 pessimizing two fma's into one multiplication and two additions.
Chris@10 1091 It does it very early in the game---before the optimization passes
Chris@10 1092 even start. The only real workaround seems to use fake inline asm
Chris@10 1093 such as
Chris@10 1094
Chris@10 1095 asm ("# confuse gcc %0" : "=f"(a) : "0"(a));
Chris@10 1096 return a * b + c;
Chris@10 1097
Chris@10 1098 in each of the FMA, FMS, FNMA, and FNMS functions. However, this
Chris@10 1099 does not solve the problem either, because two equal asm statements
Chris@10 1100 count as a common subexpression! One must use *different* fake asm
Chris@10 1101 statements:
Chris@10 1102
Chris@10 1103 in FMA:
Chris@10 1104 asm ("# confuse gcc for fma %0" : "=f"(a) : "0"(a));
Chris@10 1105
Chris@10 1106 in FMS:
Chris@10 1107 asm ("# confuse gcc for fms %0" : "=f"(a) : "0"(a));
Chris@10 1108
Chris@10 1109 etc.
Chris@10 1110
Chris@10 1111 After these changes, gcc recalcitrantly generates the fma that was
Chris@10 1112 in the source to begin with. However, the extra asm() cruft
Chris@10 1113 confuses other passes of gcc, notably the instruction scheduler.
Chris@10 1114 (Of course, one could also generate the fma directly via inline
Chris@10 1115 asm, but this confuses the scheduler even more.)
Chris@10 1116
Chris@10 1117 Steven and I have submitted more than one bug report to the gcc
Chris@10 1118 mailing list over the past few years, to no effect. Thus, I give
Chris@10 1119 up. gcc-4.0 can go to hell. I'll wait at least until gcc-4.3 is
Chris@10 1120 out before touching this crap again.
Chris@10 1121 */
Chris@10 1122 static __inline__ E FMA(E a, E b, E c)
Chris@10 1123 {
Chris@10 1124 E x = a * b;
Chris@10 1125 x = x + c;
Chris@10 1126 return x;
Chris@10 1127 }
Chris@10 1128
Chris@10 1129 static __inline__ E FMS(E a, E b, E c)
Chris@10 1130 {
Chris@10 1131 E x = a * b;
Chris@10 1132 x = x - c;
Chris@10 1133 return x;
Chris@10 1134 }
Chris@10 1135
Chris@10 1136 static __inline__ E FNMA(E a, E b, E c)
Chris@10 1137 {
Chris@10 1138 E x = a * b;
Chris@10 1139 x = - (x + c);
Chris@10 1140 return x;
Chris@10 1141 }
Chris@10 1142
Chris@10 1143 static __inline__ E FNMS(E a, E b, E c)
Chris@10 1144 {
Chris@10 1145 E x = a * b;
Chris@10 1146 x = - (x - c);
Chris@10 1147 return x;
Chris@10 1148 }
Chris@10 1149 #else
Chris@10 1150 #define FMA(a, b, c) (((a) * (b)) + (c))
Chris@10 1151 #define FMS(a, b, c) (((a) * (b)) - (c))
Chris@10 1152 #define FNMA(a, b, c) (- (((a) * (b)) + (c)))
Chris@10 1153 #define FNMS(a, b, c) ((c) - ((a) * (b)))
Chris@10 1154 #endif
Chris@10 1155
Chris@10 1156 #ifdef __cplusplus
Chris@10 1157 } /* extern "C" */
Chris@10 1158 #endif /* __cplusplus */
Chris@10 1159
Chris@10 1160 #endif /* __IFFTW_H__ */