annotate src/fftw-3.3.8/kernel/ifftw.h @ 83:ae30d91d2ffe

Replace these with versions built using an older toolset (so as to avoid ABI incompatibilities when linking on Ubuntu 14.04 for packaging purposes)
author Chris Cannam
date Fri, 07 Feb 2020 11:51:13 +0000
parents d0c2a83c1364
children
rev   line source
Chris@82 1 /*
Chris@82 2 * Copyright (c) 2003, 2007-14 Matteo Frigo
Chris@82 3 * Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
Chris@82 4 *
Chris@82 5 * This program is free software; you can redistribute it and/or modify
Chris@82 6 * it under the terms of the GNU General Public License as published by
Chris@82 7 * the Free Software Foundation; either version 2 of the License, or
Chris@82 8 * (at your option) any later version.
Chris@82 9 *
Chris@82 10 * This program is distributed in the hope that it will be useful,
Chris@82 11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
Chris@82 12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
Chris@82 13 * GNU General Public License for more details.
Chris@82 14 *
Chris@82 15 * You should have received a copy of the GNU General Public License
Chris@82 16 * along with this program; if not, write to the Free Software
Chris@82 17 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
Chris@82 18 *
Chris@82 19 */
Chris@82 20
Chris@82 21
Chris@82 22 /* FFTW internal header file */
Chris@82 23 #ifndef __IFFTW_H__
Chris@82 24 #define __IFFTW_H__
Chris@82 25
Chris@82 26 #include "config.h"
Chris@82 27
Chris@82 28 #include <stdlib.h> /* size_t */
Chris@82 29 #include <stdarg.h> /* va_list */
Chris@82 30 #include <stddef.h> /* ptrdiff_t */
Chris@82 31 #include <limits.h> /* INT_MAX */
Chris@82 32
Chris@82 33 #if HAVE_SYS_TYPES_H
Chris@82 34 # include <sys/types.h>
Chris@82 35 #endif
Chris@82 36
Chris@82 37 #if HAVE_STDINT_H
Chris@82 38 # include <stdint.h> /* uintptr_t, maybe */
Chris@82 39 #endif
Chris@82 40
Chris@82 41 #if HAVE_INTTYPES_H
Chris@82 42 # include <inttypes.h> /* uintptr_t, maybe */
Chris@82 43 #endif
Chris@82 44
Chris@82 45 #ifdef __cplusplus
Chris@82 46 extern "C"
Chris@82 47 {
Chris@82 48 #endif /* __cplusplus */
Chris@82 49
Chris@82 50 /* Windows annoyances -- since tests/hook.c uses some internal
Chris@82 51 FFTW functions, we need to give them the dllexport attribute
Chris@82 52 under Windows when compiling as a DLL (see api/fftw3.h). */
Chris@82 53 #if defined(FFTW_EXTERN)
Chris@82 54 # define IFFTW_EXTERN FFTW_EXTERN
Chris@82 55 #elif (defined(FFTW_DLL) || defined(DLL_EXPORT)) \
Chris@82 56 && (defined(_WIN32) || defined(__WIN32__))
Chris@82 57 # define IFFTW_EXTERN extern __declspec(dllexport)
Chris@82 58 #else
Chris@82 59 # define IFFTW_EXTERN extern
Chris@82 60 #endif
Chris@82 61
Chris@82 62 /* determine precision and name-mangling scheme */
Chris@82 63 #define CONCAT(prefix, name) prefix ## name
Chris@82 64 #if defined(FFTW_SINGLE)
Chris@82 65 typedef float R;
Chris@82 66 # define X(name) CONCAT(fftwf_, name)
Chris@82 67 #elif defined(FFTW_LDOUBLE)
Chris@82 68 typedef long double R;
Chris@82 69 # define X(name) CONCAT(fftwl_, name)
Chris@82 70 # define TRIGREAL_IS_LONG_DOUBLE
Chris@82 71 #elif defined(FFTW_QUAD)
Chris@82 72 typedef __float128 R;
Chris@82 73 # define X(name) CONCAT(fftwq_, name)
Chris@82 74 # define TRIGREAL_IS_QUAD
Chris@82 75 #else
Chris@82 76 typedef double R;
Chris@82 77 # define X(name) CONCAT(fftw_, name)
Chris@82 78 #endif
Chris@82 79
Chris@82 80 /*
Chris@82 81 integral type large enough to contain a stride (what ``int'' should
Chris@82 82 have been in the first place).
Chris@82 83 */
Chris@82 84 typedef ptrdiff_t INT;
Chris@82 85
/* Dummy use of an otherwise-unused parameter, to silence compiler
   warnings.  The argument is parenthesized so that an expression
   argument (e.g. UNUSED(a + b)) is cast to void as a whole rather than
   only its first operand. */
#define UNUSED(x) (void)(x)
Chris@82 88
Chris@82 89 #define NELEM(array) ((sizeof(array) / sizeof((array)[0])))
Chris@82 90
Chris@82 91 #define FFT_SIGN (-1) /* sign convention for forward transforms */
Chris@82 92 extern void X(extract_reim)(int sign, R *c, R **r, R **i);
Chris@82 93
Chris@82 94 #define REGISTER_SOLVER(p, s) X(solver_register)(p, s)
Chris@82 95
Chris@82 96 #define STRINGIZEx(x) #x
Chris@82 97 #define STRINGIZE(x) STRINGIZEx(x)
Chris@82 98 #define CIMPLIES(ante, post) (!(ante) || (post))
Chris@82 99
Chris@82 100 /* define HAVE_SIMD if any simd extensions are supported */
Chris@82 101 #if defined(HAVE_SSE) || defined(HAVE_SSE2) || \
Chris@82 102 defined(HAVE_AVX) || defined(HAVE_AVX_128_FMA) || \
Chris@82 103 defined(HAVE_AVX2) || defined(HAVE_AVX512) || \
Chris@82 104 defined(HAVE_KCVI) || \
Chris@82 105 defined(HAVE_ALTIVEC) || defined(HAVE_VSX) || \
Chris@82 106 defined(HAVE_MIPS_PS) || \
Chris@82 107 defined(HAVE_GENERIC_SIMD128) || defined(HAVE_GENERIC_SIMD256)
Chris@82 108 #define HAVE_SIMD 1
Chris@82 109 #else
Chris@82 110 #define HAVE_SIMD 0
Chris@82 111 #endif
Chris@82 112
Chris@82 113 extern int X(have_simd_sse2)(void);
Chris@82 114 extern int X(have_simd_avx)(void);
Chris@82 115 extern int X(have_simd_avx_128_fma)(void);
Chris@82 116 extern int X(have_simd_avx2)(void);
Chris@82 117 extern int X(have_simd_avx2_128)(void);
Chris@82 118 extern int X(have_simd_avx512)(void);
Chris@82 119 extern int X(have_simd_altivec)(void);
Chris@82 120 extern int X(have_simd_vsx)(void);
Chris@82 121 extern int X(have_simd_neon)(void);
Chris@82 122
Chris@82 123 /* forward declarations */
Chris@82 124 typedef struct problem_s problem;
Chris@82 125 typedef struct plan_s plan;
Chris@82 126 typedef struct solver_s solver;
Chris@82 127 typedef struct planner_s planner;
Chris@82 128 typedef struct printer_s printer;
Chris@82 129 typedef struct scanner_s scanner;
Chris@82 130
Chris@82 131 /*-----------------------------------------------------------------------*/
Chris@82 132 /* alloca: */
Chris@82 133 #if HAVE_SIMD
Chris@82 134 # if defined(HAVE_KCVI) || defined(HAVE_AVX512)
Chris@82 135 # define MIN_ALIGNMENT 64
Chris@82 136 # elif defined(HAVE_AVX) || defined(HAVE_AVX2) || defined(HAVE_GENERIC_SIMD256)
Chris@82 137 # define MIN_ALIGNMENT 32 /* best alignment for AVX, conservative for
Chris@82 138 * everything else */
Chris@82 139 # else
Chris@82 140 /* Note that we cannot use 32-byte alignment for all SIMD. For
Chris@82 141 example, MacOS X malloc is 16-byte aligned, but there was no
Chris@82 142 posix_memalign in MacOS X until version 10.6. */
Chris@82 143 # define MIN_ALIGNMENT 16
Chris@82 144 # endif
Chris@82 145 #endif
Chris@82 146
Chris@82 147 #if defined(HAVE_ALLOCA) && defined(FFTW_ENABLE_ALLOCA)
Chris@82 148 /* use alloca if available */
Chris@82 149
Chris@82 150 #ifndef alloca
Chris@82 151 #ifdef __GNUC__
Chris@82 152 # define alloca __builtin_alloca
Chris@82 153 #else
Chris@82 154 # ifdef _MSC_VER
Chris@82 155 # include <malloc.h>
Chris@82 156 # define alloca _alloca
Chris@82 157 # else
Chris@82 158 # if HAVE_ALLOCA_H
Chris@82 159 # include <alloca.h>
Chris@82 160 # else
Chris@82 161 # ifdef _AIX
Chris@82 162 #pragma alloca
Chris@82 163 # else
Chris@82 164 # ifndef alloca /* predefined by HP cc +Olibcalls */
Chris@82 165 void *alloca(size_t);
Chris@82 166 # endif
Chris@82 167 # endif
Chris@82 168 # endif
Chris@82 169 # endif
Chris@82 170 #endif
Chris@82 171 #endif
Chris@82 172
Chris@82 173 # ifdef MIN_ALIGNMENT
Chris@82 174 # define STACK_MALLOC(T, p, n) \
Chris@82 175 { \
Chris@82 176 p = (T)alloca((n) + MIN_ALIGNMENT); \
Chris@82 177 p = (T)(((uintptr_t)p + (MIN_ALIGNMENT - 1)) & \
Chris@82 178 (~(uintptr_t)(MIN_ALIGNMENT - 1))); \
Chris@82 179 }
Chris@82 180 # define STACK_FREE(n)
Chris@82 181 # else /* HAVE_ALLOCA && !defined(MIN_ALIGNMENT) */
Chris@82 182 # define STACK_MALLOC(T, p, n) p = (T)alloca(n)
Chris@82 183 # define STACK_FREE(n)
Chris@82 184 # endif
Chris@82 185
Chris@82 186 #else /* ! HAVE_ALLOCA */
Chris@82 187 /* use malloc instead of alloca */
Chris@82 188 # define STACK_MALLOC(T, p, n) p = (T)MALLOC(n, OTHER)
Chris@82 189 # define STACK_FREE(n) X(ifree)(n)
Chris@82 190 #endif /* ! HAVE_ALLOCA */
Chris@82 191
/* Allocation of temporary buffers.  Requests smaller than
   MAX_STACK_ALLOC go through STACK_MALLOC (hopefully reducing to
   alloca()); larger requests go through MALLOC(). */

/* 64KiB ought to be enough for anybody */
#define MAX_STACK_ALLOC ((size_t)64 * 1024)

/* Set p (cast to type T) to a buffer of n bytes.  The size is
   parenthesized in the comparison so that an expression argument such
   as `a | b' is compared as a whole.  n is expanded more than once, so
   it must not have side effects.  The macro still expands to a brace
   block, so existing call sites are unaffected. */
#define BUF_ALLOC(T, p, n)                      \
{                                               \
     if ((n) < MAX_STACK_ALLOC) {               \
          STACK_MALLOC(T, p, n);                \
     } else {                                   \
          p = (T)MALLOC(n, BUFFERS);            \
     }                                          \
}

/* Release a buffer obtained with BUF_ALLOC.  n must be the same size
   that was given to BUF_ALLOC, because it selects the matching
   release path (STACK_FREE versus X(ifree)). */
#define BUF_FREE(p, n)                          \
{                                               \
     if ((n) < MAX_STACK_ALLOC) {               \
          STACK_FREE(p);                        \
     } else {                                   \
          X(ifree)(p);                          \
     }                                          \
}
Chris@82 215
Chris@82 216 /*-----------------------------------------------------------------------*/
Chris@82 217 /* define uintptr_t if it is not already defined */
Chris@82 218
Chris@82 219 #ifndef HAVE_UINTPTR_T
Chris@82 220 # if SIZEOF_VOID_P == 0
Chris@82 221 # error sizeof void* is unknown!
Chris@82 222 # elif SIZEOF_UNSIGNED_INT == SIZEOF_VOID_P
Chris@82 223 typedef unsigned int uintptr_t;
Chris@82 224 # elif SIZEOF_UNSIGNED_LONG == SIZEOF_VOID_P
Chris@82 225 typedef unsigned long uintptr_t;
Chris@82 226 # elif SIZEOF_UNSIGNED_LONG_LONG == SIZEOF_VOID_P
Chris@82 227 typedef unsigned long long uintptr_t;
Chris@82 228 # else
Chris@82 229 # error no unsigned integer type matches void* sizeof!
Chris@82 230 # endif
Chris@82 231 #endif
Chris@82 232
Chris@82 233 /*-----------------------------------------------------------------------*/
/* We can do an optimization for copying pairs of (aligned) floats
   when in single precision if 2*float = double. */

/* FFTW_2R_IS_DOUBLE is 1 when that optimization applies and 0
   otherwise.  The condition is evaluated here, in a real #if, instead
   of being stored in the macro body: a `defined' operator produced by
   macro expansion inside #if has undefined behavior in ISO C. */
#if defined(FFTW_SINGLE) \
     && SIZEOF_FLOAT != 0 \
     && SIZEOF_DOUBLE == 2*SIZEOF_FLOAT
# define FFTW_2R_IS_DOUBLE 1
#else
# define FFTW_2R_IS_DOUBLE 0
#endif
Chris@82 240
Chris@82 241 #define DOUBLE_ALIGNED(p) ((((uintptr_t)(p)) % sizeof(double)) == 0)
Chris@82 242
Chris@82 243 /*-----------------------------------------------------------------------*/
Chris@82 244 /* assert.c: */
Chris@82 245 IFFTW_EXTERN void X(assertion_failed)(const char *s,
Chris@82 246 int line, const char *file);
Chris@82 247
Chris@82 248 /* always check */
Chris@82 249 #define CK(ex) \
Chris@82 250 (void)((ex) || (X(assertion_failed)(#ex, __LINE__, __FILE__), 0))
Chris@82 251
Chris@82 252 #ifdef FFTW_DEBUG
Chris@82 253 /* check only if debug enabled */
Chris@82 254 #define A(ex) \
Chris@82 255 (void)((ex) || (X(assertion_failed)(#ex, __LINE__, __FILE__), 0))
Chris@82 256 #else
Chris@82 257 #define A(ex) /* nothing */
Chris@82 258 #endif
Chris@82 259
Chris@82 260 extern void X(debug)(const char *format, ...);
Chris@82 261 #define D X(debug)
Chris@82 262
Chris@82 263 /*-----------------------------------------------------------------------*/
Chris@82 264 /* kalloc.c: */
Chris@82 265 extern void *X(kernel_malloc)(size_t n);
Chris@82 266 extern void X(kernel_free)(void *p);
Chris@82 267
Chris@82 268 /*-----------------------------------------------------------------------*/
Chris@82 269 /* alloc.c: */
Chris@82 270
Chris@82 271 /* objects allocated by malloc, for statistical purposes */
Chris@82 272 enum malloc_tag {
Chris@82 273 EVERYTHING,
Chris@82 274 PLANS,
Chris@82 275 SOLVERS,
Chris@82 276 PROBLEMS,
Chris@82 277 BUFFERS,
Chris@82 278 HASHT,
Chris@82 279 TENSORS,
Chris@82 280 PLANNERS,
Chris@82 281 SLVDESCS,
Chris@82 282 TWIDDLES,
Chris@82 283 STRIDES,
Chris@82 284 OTHER,
Chris@82 285 MALLOC_WHAT_LAST /* must be last */
Chris@82 286 };
Chris@82 287
Chris@82 288 IFFTW_EXTERN void X(ifree)(void *ptr);
Chris@82 289 extern void X(ifree0)(void *ptr);
Chris@82 290
Chris@82 291 IFFTW_EXTERN void *X(malloc_plain)(size_t sz);
Chris@82 292 #define MALLOC(n, what) X(malloc_plain)(n)
Chris@82 293
Chris@82 294 /*-----------------------------------------------------------------------*/
Chris@82 295 /* low-resolution clock */
Chris@82 296
Chris@82 297 #ifdef FAKE_CRUDE_TIME
Chris@82 298 typedef int crude_time;
Chris@82 299 #else
Chris@82 300 # if TIME_WITH_SYS_TIME
Chris@82 301 # include <sys/time.h>
Chris@82 302 # include <time.h>
Chris@82 303 # else
Chris@82 304 # if HAVE_SYS_TIME_H
Chris@82 305 # include <sys/time.h>
Chris@82 306 # else
Chris@82 307 # include <time.h>
Chris@82 308 # endif
Chris@82 309 # endif
Chris@82 310
Chris@82 311 # ifdef HAVE_BSDGETTIMEOFDAY
Chris@82 312 # ifndef HAVE_GETTIMEOFDAY
Chris@82 313 # define gettimeofday BSDgettimeofday
Chris@82 314 # define HAVE_GETTIMEOFDAY 1
Chris@82 315 # endif
Chris@82 316 # endif
Chris@82 317
Chris@82 318 # if defined(HAVE_GETTIMEOFDAY)
Chris@82 319 typedef struct timeval crude_time;
Chris@82 320 # else
Chris@82 321 typedef clock_t crude_time;
Chris@82 322 # endif
Chris@82 323 #endif /* else FAKE_CRUDE_TIME */
Chris@82 324
Chris@82 325 crude_time X(get_crude_time)(void);
Chris@82 326 double X(elapsed_since)(const planner *plnr, const problem *p,
Chris@82 327 crude_time t0); /* time in seconds since t0 */
Chris@82 328
Chris@82 329 /*-----------------------------------------------------------------------*/
Chris@82 330 /* ops.c: */
Chris@82 331 /*
Chris@82 332 * ops counter. The total number of additions is add + fma
Chris@82 333 * and the total number of multiplications is mul + fma.
Chris@82 334 * Total flops = add + mul + 2 * fma
Chris@82 335 */
Chris@82 336 typedef struct {
Chris@82 337 double add;
Chris@82 338 double mul;
Chris@82 339 double fma;
Chris@82 340 double other;
Chris@82 341 } opcnt;
Chris@82 342
Chris@82 343 void X(ops_zero)(opcnt *dst);
Chris@82 344 void X(ops_other)(INT o, opcnt *dst);
Chris@82 345 void X(ops_cpy)(const opcnt *src, opcnt *dst);
Chris@82 346
Chris@82 347 void X(ops_add)(const opcnt *a, const opcnt *b, opcnt *dst);
Chris@82 348 void X(ops_add2)(const opcnt *a, opcnt *dst);
Chris@82 349
Chris@82 350 /* dst = m * a + b */
Chris@82 351 void X(ops_madd)(INT m, const opcnt *a, const opcnt *b, opcnt *dst);
Chris@82 352
Chris@82 353 /* dst += m * a */
Chris@82 354 void X(ops_madd2)(INT m, const opcnt *a, opcnt *dst);
Chris@82 355
Chris@82 356
Chris@82 357 /*-----------------------------------------------------------------------*/
Chris@82 358 /* minmax.c: */
Chris@82 359 INT X(imax)(INT a, INT b);
Chris@82 360 INT X(imin)(INT a, INT b);
Chris@82 361
Chris@82 362 /*-----------------------------------------------------------------------*/
Chris@82 363 /* iabs.c: */
Chris@82 364 INT X(iabs)(INT a);
Chris@82 365
Chris@82 366 /* inline version */
Chris@82 367 #define IABS(x) (((x) < 0) ? (0 - (x)) : (x))
Chris@82 368
Chris@82 369 /*-----------------------------------------------------------------------*/
Chris@82 370 /* md5.c */
Chris@82 371
Chris@82 372 #if SIZEOF_UNSIGNED_INT >= 4
Chris@82 373 typedef unsigned int md5uint;
Chris@82 374 #else
Chris@82 375 typedef unsigned long md5uint; /* at least 32 bits as per C standard */
Chris@82 376 #endif
Chris@82 377
Chris@82 378 typedef md5uint md5sig[4];
Chris@82 379
Chris@82 380 typedef struct {
Chris@82 381 md5sig s; /* state and signature */
Chris@82 382
Chris@82 383 /* fields not meant to be used outside md5.c: */
Chris@82 384 unsigned char c[64]; /* stuff not yet processed */
Chris@82 385 unsigned l; /* total length. Should be 64 bits long, but this is
Chris@82 386 good enough for us */
Chris@82 387 } md5;
Chris@82 388
Chris@82 389 void X(md5begin)(md5 *p);
Chris@82 390 void X(md5putb)(md5 *p, const void *d_, size_t len);
Chris@82 391 void X(md5puts)(md5 *p, const char *s);
Chris@82 392 void X(md5putc)(md5 *p, unsigned char c);
Chris@82 393 void X(md5int)(md5 *p, int i);
Chris@82 394 void X(md5INT)(md5 *p, INT i);
Chris@82 395 void X(md5unsigned)(md5 *p, unsigned i);
Chris@82 396 void X(md5end)(md5 *p);
Chris@82 397
Chris@82 398 /*-----------------------------------------------------------------------*/
Chris@82 399 /* tensor.c: */
Chris@82 400 #define STRUCT_HACK_KR
Chris@82 401 #undef STRUCT_HACK_C99
Chris@82 402
Chris@82 403 typedef struct {
Chris@82 404 INT n;
Chris@82 405 INT is; /* input stride */
Chris@82 406 INT os; /* output stride */
Chris@82 407 } iodim;
Chris@82 408
Chris@82 409 typedef struct {
Chris@82 410 int rnk;
Chris@82 411 #if defined(STRUCT_HACK_KR)
Chris@82 412 iodim dims[1];
Chris@82 413 #elif defined(STRUCT_HACK_C99)
Chris@82 414 iodim dims[];
Chris@82 415 #else
Chris@82 416 iodim *dims;
Chris@82 417 #endif
Chris@82 418 } tensor;
Chris@82 419
Chris@82 420 /*
Chris@82 421 Definition of rank -infinity.
Chris@82 422 This definition has the property that if you want rank 0 or 1,
Chris@82 423 you can simply test for rank <= 1. This is a common case.
Chris@82 424
Chris@82 425 A tensor of rank -infinity has size 0.
Chris@82 426 */
Chris@82 427 #define RNK_MINFTY INT_MAX
Chris@82 428 #define FINITE_RNK(rnk) ((rnk) != RNK_MINFTY)
Chris@82 429
Chris@82 430 typedef enum { INPLACE_IS, INPLACE_OS } inplace_kind;
Chris@82 431
Chris@82 432 tensor *X(mktensor)(int rnk);
Chris@82 433 tensor *X(mktensor_0d)(void);
Chris@82 434 tensor *X(mktensor_1d)(INT n, INT is, INT os);
Chris@82 435 tensor *X(mktensor_2d)(INT n0, INT is0, INT os0,
Chris@82 436 INT n1, INT is1, INT os1);
Chris@82 437 tensor *X(mktensor_3d)(INT n0, INT is0, INT os0,
Chris@82 438 INT n1, INT is1, INT os1,
Chris@82 439 INT n2, INT is2, INT os2);
Chris@82 440 tensor *X(mktensor_4d)(INT n0, INT is0, INT os0,
Chris@82 441 INT n1, INT is1, INT os1,
Chris@82 442 INT n2, INT is2, INT os2,
Chris@82 443 INT n3, INT is3, INT os3);
Chris@82 444 tensor *X(mktensor_5d)(INT n0, INT is0, INT os0,
Chris@82 445 INT n1, INT is1, INT os1,
Chris@82 446 INT n2, INT is2, INT os2,
Chris@82 447 INT n3, INT is3, INT os3,
Chris@82 448 INT n4, INT is4, INT os4);
Chris@82 449 INT X(tensor_sz)(const tensor *sz);
Chris@82 450 void X(tensor_md5)(md5 *p, const tensor *t);
Chris@82 451 INT X(tensor_max_index)(const tensor *sz);
Chris@82 452 INT X(tensor_min_istride)(const tensor *sz);
Chris@82 453 INT X(tensor_min_ostride)(const tensor *sz);
Chris@82 454 INT X(tensor_min_stride)(const tensor *sz);
Chris@82 455 int X(tensor_inplace_strides)(const tensor *sz);
Chris@82 456 int X(tensor_inplace_strides2)(const tensor *a, const tensor *b);
Chris@82 457 int X(tensor_strides_decrease)(const tensor *sz, const tensor *vecsz,
Chris@82 458 inplace_kind k);
Chris@82 459 tensor *X(tensor_copy)(const tensor *sz);
Chris@82 460 int X(tensor_kosherp)(const tensor *x);
Chris@82 461
Chris@82 462 tensor *X(tensor_copy_inplace)(const tensor *sz, inplace_kind k);
Chris@82 463 tensor *X(tensor_copy_except)(const tensor *sz, int except_dim);
Chris@82 464 tensor *X(tensor_copy_sub)(const tensor *sz, int start_dim, int rnk);
Chris@82 465 tensor *X(tensor_compress)(const tensor *sz);
Chris@82 466 tensor *X(tensor_compress_contiguous)(const tensor *sz);
Chris@82 467 tensor *X(tensor_append)(const tensor *a, const tensor *b);
Chris@82 468 void X(tensor_split)(const tensor *sz, tensor **a, int a_rnk, tensor **b);
Chris@82 469 int X(tensor_tornk1)(const tensor *t, INT *n, INT *is, INT *os);
Chris@82 470 void X(tensor_destroy)(tensor *sz);
Chris@82 471 void X(tensor_destroy2)(tensor *a, tensor *b);
Chris@82 472 void X(tensor_destroy4)(tensor *a, tensor *b, tensor *c, tensor *d);
Chris@82 473 void X(tensor_print)(const tensor *sz, printer *p);
Chris@82 474 int X(dimcmp)(const iodim *a, const iodim *b);
Chris@82 475 int X(tensor_equal)(const tensor *a, const tensor *b);
Chris@82 476 int X(tensor_inplace_locations)(const tensor *sz, const tensor *vecsz);
Chris@82 477
Chris@82 478 /*-----------------------------------------------------------------------*/
Chris@82 479 /* problem.c: */
Chris@82 480 enum {
Chris@82 481 /* a problem that cannot be solved */
Chris@82 482 PROBLEM_UNSOLVABLE,
Chris@82 483
Chris@82 484 PROBLEM_DFT,
Chris@82 485 PROBLEM_RDFT,
Chris@82 486 PROBLEM_RDFT2,
Chris@82 487
Chris@82 488 /* for mpi/ subdirectory */
Chris@82 489 PROBLEM_MPI_DFT,
Chris@82 490 PROBLEM_MPI_RDFT,
Chris@82 491 PROBLEM_MPI_RDFT2,
Chris@82 492 PROBLEM_MPI_TRANSPOSE,
Chris@82 493
Chris@82 494 PROBLEM_LAST
Chris@82 495 };
Chris@82 496
Chris@82 497 typedef struct {
Chris@82 498 int problem_kind;
Chris@82 499 void (*hash) (const problem *ego, md5 *p);
Chris@82 500 void (*zero) (const problem *ego);
Chris@82 501 void (*print) (const problem *ego, printer *p);
Chris@82 502 void (*destroy) (problem *ego);
Chris@82 503 } problem_adt;
Chris@82 504
Chris@82 505 struct problem_s {
Chris@82 506 const problem_adt *adt;
Chris@82 507 };
Chris@82 508
Chris@82 509 problem *X(mkproblem)(size_t sz, const problem_adt *adt);
Chris@82 510 void X(problem_destroy)(problem *ego);
Chris@82 511 problem *X(mkproblem_unsolvable)(void);
Chris@82 512
Chris@82 513 /*-----------------------------------------------------------------------*/
Chris@82 514 /* print.c */
Chris@82 515 struct printer_s {
Chris@82 516 void (*print)(printer *p, const char *format, ...);
Chris@82 517 void (*vprint)(printer *p, const char *format, va_list ap);
Chris@82 518 void (*putchr)(printer *p, char c);
Chris@82 519 void (*cleanup)(printer *p);
Chris@82 520 int indent;
Chris@82 521 int indent_incr;
Chris@82 522 };
Chris@82 523
Chris@82 524 printer *X(mkprinter)(size_t size,
Chris@82 525 void (*putchr)(printer *p, char c),
Chris@82 526 void (*cleanup)(printer *p));
Chris@82 527 IFFTW_EXTERN void X(printer_destroy)(printer *p);
Chris@82 528
Chris@82 529 /*-----------------------------------------------------------------------*/
Chris@82 530 /* scan.c */
Chris@82 531 struct scanner_s {
Chris@82 532 int (*scan)(scanner *sc, const char *format, ...);
Chris@82 533 int (*vscan)(scanner *sc, const char *format, va_list ap);
Chris@82 534 int (*getchr)(scanner *sc);
Chris@82 535 int ungotc;
Chris@82 536 };
Chris@82 537
Chris@82 538 scanner *X(mkscanner)(size_t size, int (*getchr)(scanner *sc));
Chris@82 539 void X(scanner_destroy)(scanner *sc);
Chris@82 540
Chris@82 541 /*-----------------------------------------------------------------------*/
Chris@82 542 /* plan.c: */
Chris@82 543
Chris@82 544 enum wakefulness {
Chris@82 545 SLEEPY,
Chris@82 546 AWAKE_ZERO,
Chris@82 547 AWAKE_SQRTN_TABLE,
Chris@82 548 AWAKE_SINCOS
Chris@82 549 };
Chris@82 550
Chris@82 551 typedef struct {
Chris@82 552 void (*solve)(const plan *ego, const problem *p);
Chris@82 553 void (*awake)(plan *ego, enum wakefulness wakefulness);
Chris@82 554 void (*print)(const plan *ego, printer *p);
Chris@82 555 void (*destroy)(plan *ego);
Chris@82 556 } plan_adt;
Chris@82 557
Chris@82 558 struct plan_s {
Chris@82 559 const plan_adt *adt;
Chris@82 560 opcnt ops;
Chris@82 561 double pcost;
Chris@82 562 enum wakefulness wakefulness; /* used for debugging only */
Chris@82 563 int could_prune_now_p;
Chris@82 564 };
Chris@82 565
Chris@82 566 plan *X(mkplan)(size_t size, const plan_adt *adt);
Chris@82 567 void X(plan_destroy_internal)(plan *ego);
Chris@82 568 IFFTW_EXTERN void X(plan_awake)(plan *ego, enum wakefulness wakefulness);
Chris@82 569 void X(plan_null_destroy)(plan *ego);
Chris@82 570
Chris@82 571 /*-----------------------------------------------------------------------*/
Chris@82 572 /* solver.c: */
Chris@82 573 typedef struct {
Chris@82 574 int problem_kind;
Chris@82 575 plan *(*mkplan)(const solver *ego, const problem *p, planner *plnr);
Chris@82 576 void (*destroy)(solver *ego);
Chris@82 577 } solver_adt;
Chris@82 578
Chris@82 579 struct solver_s {
Chris@82 580 const solver_adt *adt;
Chris@82 581 int refcnt;
Chris@82 582 };
Chris@82 583
Chris@82 584 solver *X(mksolver)(size_t size, const solver_adt *adt);
Chris@82 585 void X(solver_use)(solver *ego);
Chris@82 586 void X(solver_destroy)(solver *ego);
Chris@82 587 void X(solver_register)(planner *plnr, solver *s);
Chris@82 588
/* Shorthand: call X(mksolver) with sizeof(type) and the given adt, and
   cast the result to `type *'.  The expansion is fully parenthesized so
   that the cast is applied before any operator that follows the macro
   (e.g. MKSOLVER(S, adt)->field). */
#define MKSOLVER(type, adt) ((type *)X(mksolver)(sizeof(type), (adt)))
Chris@82 591
Chris@82 592 /*-----------------------------------------------------------------------*/
Chris@82 593 /* planner.c */
Chris@82 594
Chris@82 595 typedef struct slvdesc_s {
Chris@82 596 solver *slv;
Chris@82 597 const char *reg_nam;
Chris@82 598 unsigned nam_hash;
Chris@82 599 int reg_id;
Chris@82 600 int next_for_same_problem_kind;
Chris@82 601 } slvdesc;
Chris@82 602
Chris@82 603 typedef struct solution_s solution; /* opaque */
Chris@82 604
Chris@82 605 /* interpretation of L and U:
Chris@82 606
Chris@82 607 - if it returns a plan, the planner guarantees that all applicable
Chris@82 608 plans at least as impatient as U have been tried, and that each
Chris@82 609 plan in the solution is at least as impatient as L.
Chris@82 610
Chris@82 611 - if it returns 0, the planner guarantees to have tried all solvers
Chris@82 612 at least as impatient as L, and that none of them was applicable.
Chris@82 613
Chris@82 614 The structure is packed to fit into 64 bits.
Chris@82 615 */
Chris@82 616
Chris@82 617 typedef struct {
Chris@82 618 unsigned l:20;
Chris@82 619 unsigned hash_info:3;
Chris@82 620 # define BITS_FOR_TIMELIMIT 9
Chris@82 621 unsigned timelimit_impatience:BITS_FOR_TIMELIMIT;
Chris@82 622 unsigned u:20;
Chris@82 623
Chris@82 624 /* abstraction break: we store the solver here to pad the
Chris@82 625 structure to 64 bits. Otherwise, the struct is padded to 64
Chris@82 626 bits anyway, and another word is allocated for slvndx. */
Chris@82 627 # define BITS_FOR_SLVNDX 12
Chris@82 628 unsigned slvndx:BITS_FOR_SLVNDX;
Chris@82 629 } flags_t;
Chris@82 630
Chris@82 631 /* impatience flags */
Chris@82 632 enum {
Chris@82 633 BELIEVE_PCOST = 0x0001,
Chris@82 634 ESTIMATE = 0x0002,
Chris@82 635 NO_DFT_R2HC = 0x0004,
Chris@82 636 NO_SLOW = 0x0008,
Chris@82 637 NO_VRECURSE = 0x0010,
Chris@82 638 NO_INDIRECT_OP = 0x0020,
Chris@82 639 NO_LARGE_GENERIC = 0x0040,
Chris@82 640 NO_RANK_SPLITS = 0x0080,
Chris@82 641 NO_VRANK_SPLITS = 0x0100,
Chris@82 642 NO_NONTHREADED = 0x0200,
Chris@82 643 NO_BUFFERING = 0x0400,
Chris@82 644 NO_FIXED_RADIX_LARGE_N = 0x0800,
Chris@82 645 NO_DESTROY_INPUT = 0x1000,
Chris@82 646 NO_SIMD = 0x2000,
Chris@82 647 CONSERVE_MEMORY = 0x4000,
Chris@82 648 NO_DHT_R2HC = 0x8000,
Chris@82 649 NO_UGLY = 0x10000,
Chris@82 650 ALLOW_PRUNING = 0x20000
Chris@82 651 };
Chris@82 652
Chris@82 653 /* hashtable information */
Chris@82 654 enum {
Chris@82 655 BLESSING = 0x1u, /* save this entry */
Chris@82 656 H_VALID = 0x2u, /* valid hashtable entry */
Chris@82 657 H_LIVE = 0x4u /* entry is nonempty, implies H_VALID */
Chris@82 658 };
Chris@82 659
Chris@82 660 #define PLNR_L(plnr) ((plnr)->flags.l)
Chris@82 661 #define PLNR_U(plnr) ((plnr)->flags.u)
Chris@82 662 #define PLNR_TIMELIMIT_IMPATIENCE(plnr) ((plnr)->flags.timelimit_impatience)
Chris@82 663
Chris@82 664 #define ESTIMATEP(plnr) (PLNR_U(plnr) & ESTIMATE)
Chris@82 665 #define BELIEVE_PCOSTP(plnr) (PLNR_U(plnr) & BELIEVE_PCOST)
Chris@82 666 #define ALLOW_PRUNINGP(plnr) (PLNR_U(plnr) & ALLOW_PRUNING)
Chris@82 667
Chris@82 668 #define NO_INDIRECT_OP_P(plnr) (PLNR_L(plnr) & NO_INDIRECT_OP)
Chris@82 669 #define NO_LARGE_GENERICP(plnr) (PLNR_L(plnr) & NO_LARGE_GENERIC)
Chris@82 670 #define NO_RANK_SPLITSP(plnr) (PLNR_L(plnr) & NO_RANK_SPLITS)
Chris@82 671 #define NO_VRANK_SPLITSP(plnr) (PLNR_L(plnr) & NO_VRANK_SPLITS)
Chris@82 672 #define NO_VRECURSEP(plnr) (PLNR_L(plnr) & NO_VRECURSE)
Chris@82 673 #define NO_DFT_R2HCP(plnr) (PLNR_L(plnr) & NO_DFT_R2HC)
Chris@82 674 #define NO_SLOWP(plnr) (PLNR_L(plnr) & NO_SLOW)
Chris@82 675 #define NO_UGLYP(plnr) (PLNR_L(plnr) & NO_UGLY)
Chris@82 676 #define NO_FIXED_RADIX_LARGE_NP(plnr) \
Chris@82 677 (PLNR_L(plnr) & NO_FIXED_RADIX_LARGE_N)
Chris@82 678 #define NO_NONTHREADEDP(plnr) \
Chris@82 679 ((PLNR_L(plnr) & NO_NONTHREADED) && (plnr)->nthr > 1)
Chris@82 680
Chris@82 681 #define NO_DESTROY_INPUTP(plnr) (PLNR_L(plnr) & NO_DESTROY_INPUT)
Chris@82 682 #define NO_SIMDP(plnr) (PLNR_L(plnr) & NO_SIMD)
Chris@82 683 #define CONSERVE_MEMORYP(plnr) (PLNR_L(plnr) & CONSERVE_MEMORY)
Chris@82 684 #define NO_DHT_R2HCP(plnr) (PLNR_L(plnr) & NO_DHT_R2HC)
Chris@82 685 #define NO_BUFFERINGP(plnr) (PLNR_L(plnr) & NO_BUFFERING)
Chris@82 686
Chris@82 687 typedef enum { FORGET_ACCURSED, FORGET_EVERYTHING } amnesia;
Chris@82 688
Chris@82 689 typedef enum {
Chris@82 690 /* WISDOM_NORMAL: planner may or may not use wisdom */
Chris@82 691 WISDOM_NORMAL,
Chris@82 692
Chris@82 693 /* WISDOM_ONLY: planner must use wisdom and must avoid searching */
Chris@82 694 WISDOM_ONLY,
Chris@82 695
Chris@82 696 /* WISDOM_IS_BOGUS: planner must return 0 as quickly as possible */
Chris@82 697 WISDOM_IS_BOGUS,
Chris@82 698
Chris@82 699 /* WISDOM_IGNORE_INFEASIBLE: planner ignores infeasible wisdom */
Chris@82 700 WISDOM_IGNORE_INFEASIBLE,
Chris@82 701
Chris@82 702 /* WISDOM_IGNORE_ALL: planner ignores all */
Chris@82 703 WISDOM_IGNORE_ALL
Chris@82 704 } wisdom_state_t;
Chris@82 705
Chris@82 706 typedef struct {
Chris@82 707 void (*register_solver)(planner *ego, solver *s);
Chris@82 708 plan *(*mkplan)(planner *ego, const problem *p);
Chris@82 709 void (*forget)(planner *ego, amnesia a);
Chris@82 710 void (*exprt)(planner *ego, printer *p); /* ``export'' is a reserved
Chris@82 711 word in C++. */
Chris@82 712 int (*imprt)(planner *ego, scanner *sc);
Chris@82 713 } planner_adt;
Chris@82 714
Chris@82 715 /* hash table of solutions */
Chris@82 716 typedef struct {
Chris@82 717 solution *solutions;
Chris@82 718 unsigned hashsiz, nelem;
Chris@82 719
Chris@82 720 /* statistics */
Chris@82 721 int lookup, succ_lookup, lookup_iter;
Chris@82 722 int insert, insert_iter, insert_unknown;
Chris@82 723 int nrehash;
Chris@82 724 } hashtab;
Chris@82 725
Chris@82 726 typedef enum { COST_SUM, COST_MAX } cost_kind;
Chris@82 727
Chris@82 728 struct planner_s {
Chris@82 729 const planner_adt *adt;
Chris@82 730 void (*hook)(struct planner_s *plnr, plan *pln,
Chris@82 731 const problem *p, int optimalp);
Chris@82 732 double (*cost_hook)(const problem *p, double t, cost_kind k);
Chris@82 733 int (*wisdom_ok_hook)(const problem *p, flags_t flags);
Chris@82 734 void (*nowisdom_hook)(const problem *p);
Chris@82 735 wisdom_state_t (*bogosity_hook)(wisdom_state_t state, const problem *p);
Chris@82 736
Chris@82 737 /* solver descriptors */
Chris@82 738 slvdesc *slvdescs;
Chris@82 739 unsigned nslvdesc, slvdescsiz;
Chris@82 740 const char *cur_reg_nam;
Chris@82 741 int cur_reg_id;
Chris@82 742 int slvdescs_for_problem_kind[PROBLEM_LAST];
Chris@82 743
Chris@82 744 wisdom_state_t wisdom_state;
Chris@82 745
Chris@82 746 hashtab htab_blessed;
Chris@82 747 hashtab htab_unblessed;
Chris@82 748
Chris@82 749 int nthr;
Chris@82 750 flags_t flags;
Chris@82 751
Chris@82 752 crude_time start_time;
Chris@82 753 double timelimit; /* elapsed_since(start_time) at which to bail out */
Chris@82 754 int timed_out; /* whether most recent search timed out */
Chris@82 755 int need_timeout_check;
Chris@82 756
Chris@82 757 /* various statistics */
Chris@82 758 int nplan; /* number of plans evaluated */
Chris@82 759 double pcost, epcost; /* total pcost of measured/estimated plans */
Chris@82 760 int nprob; /* number of problems evaluated */
Chris@82 761 };
Chris@82 762
Chris@82 763 planner *X(mkplanner)(void);
Chris@82 764 void X(planner_destroy)(planner *ego);
Chris@82 765
/*
  Iterate over all solvers.  Read:

  @article{ baker93iterators,
  author = "Henry G. Baker, Jr.",
  title = "Iterators: Signs of Weakness in Object-Oriented Languages",
  journal = "{ACM} {OOPS} Messenger",
  volume = "4",
  number = "3",
  pages = "18--25"
  }
*/

/* FORALL_SOLVERS(ego, s, p, what): execute `what' once for each of the
   ego->nslvdesc entries of ego->slvdescs, in index order, with p
   pointing at the slvdesc entry and s set to that entry's solver
   (p->slv).  ego is expanded more than once; it is parenthesized so
   that any pointer expression (e.g. &planner_object) may be passed. */
#define FORALL_SOLVERS(ego, s, p, what)                 \
{                                                       \
     unsigned _cnt;                                     \
     for (_cnt = 0; _cnt < (ego)->nslvdesc; ++_cnt) {   \
          slvdesc *p = (ego)->slvdescs + _cnt;          \
          solver *s = p->slv;                           \
          what;                                         \
     }                                                  \
}

/* FORALL_SOLVERS_OF_KIND(kind, ego, s, p, what): like FORALL_SOLVERS,
   but visit only the chain of descriptors that starts at index
   ego->slvdescs_for_problem_kind[kind] and is linked through
   next_for_same_problem_kind.  A negative index ends the chain. */
#define FORALL_SOLVERS_OF_KIND(kind, ego, s, p, what)           \
{                                                               \
     int _cnt = (ego)->slvdescs_for_problem_kind[kind];         \
     while (_cnt >= 0) {                                        \
          slvdesc *p = (ego)->slvdescs + _cnt;                  \
          solver *s = p->slv;                                   \
          what;                                                 \
          _cnt = p->next_for_same_problem_kind;                 \
     }                                                          \
}
Chris@82 798
Chris@82 799
Chris@82 800 /* make plan, destroy problem */
/* mkplan_f_d takes three extra unsigned arguments (l_set, u_set, u_reset).
   NOTE(review): the names suggest planner-flag masks applied around the
   planning call -- confirm in the definition. */
Chris@82 801 plan *X(mkplan_d)(planner *ego, problem *p);
Chris@82 802 plan *X(mkplan_f_d)(planner *ego, problem *p,
Chris@82 803 unsigned l_set, unsigned u_set, unsigned u_reset);
Chris@82 804
Chris@82 805 /*-----------------------------------------------------------------------*/
Chris@82 806 /* stride.c: */
Chris@82 807
Chris@82 808 /* If PRECOMPUTE_ARRAY_INDICES is defined, precompute all strides. */
/* The option is switched on when __i386__ or __x86_64__ is defined, or
   _M_IX86 >= 500, and FFTW_LDOUBLE is not defined. */
Chris@82 809 #if (defined(__i386__) || defined(__x86_64__) || _M_IX86 >= 500) && !defined(FFTW_LDOUBLE)
Chris@82 810 #define PRECOMPUTE_ARRAY_INDICES
Chris@82 811 #endif
Chris@82 812
/* Declared extern (no value visible in this header); both variants of
   MAKE_VOLATILE_STRIDE combine a stride with this object. */
Chris@82 813 extern const INT X(an_INT_guaranteed_to_be_zero);
Chris@82 814
Chris@82 815 #ifdef PRECOMPUTE_ARRAY_INDICES
Chris@82 816 typedef INT *stride;
/* WS(stride, i): i-th entry of a precomputed stride table.  `stride` is
   parenthesized so that a pointer expression such as `tab + 1` indexes as
   intended. */
#define WS(stride, i) ((stride)[i])
Chris@82 818 extern stride X(mkstride)(INT n, INT s);
Chris@82 819 void X(stride_destroy)(stride p);
/* hackery to prevent the compiler from copying the strides array
   onto the stack: x is reassigned to itself plus a zero the compiler
   cannot see through.  `nptr` is unused in this variant.  The whole
   replacement list is parenthesized so the macro is a single expression
   wherever it is used. */
#define MAKE_VOLATILE_STRIDE(nptr, x) ((x) = (x) + X(an_INT_guaranteed_to_be_zero))
Chris@82 823 #else
Chris@82 824
Chris@82 825 typedef INT stride;
/* Without PRECOMPUTE_ARRAY_INDICES a stride is a plain INT:
   WS multiplies, mkstride yields its stride argument unchanged, and
   stride_destroy only evaluates its argument.  All macro arguments are
   parenthesized so that expression arguments (e.g. WS(s, i + 1)) keep
   their intended meaning. */
#define WS(stride, i) ((stride) * (i))
#define fftwf_mkstride(n, stride) (stride)
#define fftw_mkstride(n, stride) (stride)
#define fftwl_mkstride(n, stride) (stride)
#define fftwf_stride_destroy(p) ((void) (p))
#define fftw_stride_destroy(p) ((void) (p))
#define fftwl_stride_destroy(p) ((void) (p))
Chris@82 833
/* hackery to prevent the compiler from ``optimizing'' induction
   variables in codelet loops.  The problem is that for each K and for
   each expression of the form P[I + STRIDE * K] in a loop, most
   compilers will try to lift an induction variable PK := &P[I + STRIDE * K].
   For large values of K this behavior overflows the
   register set, which is likely worse than doing the index computation
   in the first place.

   If we guess that there are more than
   ESTIMATED_AVAILABLE_INDEX_REGISTERS such pointers, we deliberately confuse
   the compiler by setting STRIDE ^= ZERO, where ZERO is a value guaranteed to
   be 0, but the compiler does not know this.

   16 registers ought to be enough for anybody, or so the amd64 and ARM ISAs
   seem to imply.

   MAKE_VOLATILE_STRIDE(nptr, x): `nptr` is the estimated pointer count and
   `x` the stride lvalue.  The expression is 0 (x untouched) when
   nptr <= ESTIMATED_AVAILABLE_INDEX_REGISTERS, otherwise it assigns
   x ^ ZERO back to x.  Both arguments are parenthesized so that an
   expression passed as `nptr` is compared as a whole.
*/
#define ESTIMATED_AVAILABLE_INDEX_REGISTERS 16
#define MAKE_VOLATILE_STRIDE(nptr, x)                     \
     ((nptr) <= ESTIMATED_AVAILABLE_INDEX_REGISTERS ?     \
        0 :                                               \
      ((x) = (x) ^ X(an_INT_guaranteed_to_be_zero)))
Chris@82 855 #endif /* PRECOMPUTE_ARRAY_INDICES */
Chris@82 856
Chris@82 857 /*-----------------------------------------------------------------------*/
Chris@82 858 /* solvtab.c */
Chris@82 859
/* One solver-table entry: a function taking a planner, plus a name string. */
Chris@82 860 struct solvtab_s { void (*reg)(planner *); const char *reg_nam; };
/* A solver table is an array of such entries (length not part of the type). */
Chris@82 861 typedef struct solvtab_s solvtab[];
Chris@82 862 void X(solvtab_exec)(const solvtab tbl, planner *p);
/* SOLVTAB(s) builds the initializer for one entry from function s and
   STRINGIZE(s) (STRINGIZE is defined elsewhere; presumably the spelling of
   s as a string).  SOLVTAB_END is the all-zero entry.
   NOTE(review): presumably the terminator solvtab_exec stops at -- confirm
   in solvtab.c. */
Chris@82 863 #define SOLVTAB(s) { s, STRINGIZE(s) }
Chris@82 864 #define SOLVTAB_END { 0, 0 }
Chris@82 865
Chris@82 866 /*-----------------------------------------------------------------------*/
Chris@82 867 /* pickdim.c */
/* Prototype only; per the section header the definition lives in pickdim.c.
   `buddies` is an array of nbuddies ints.
   NOTE(review): *dp looks like the output slot for the chosen dimension and
   the int return like a success flag -- confirm in pickdim.c. */
Chris@82 868 int X(pickdim)(int which_dim, const int *buddies, size_t nbuddies,
Chris@82 869 const tensor *sz, int oop, int *dp);
Chris@82 870
Chris@82 871 /*-----------------------------------------------------------------------*/
Chris@82 872 /* twiddle.c */
Chris@82 873 /* little language to express twiddle factors computation */
/* NOTE(review): presumably these are the opcode values stored in
   tw_instr.op -- confirm in twiddle.c. */
Chris@82 874 enum { TW_COS = 0, TW_SIN = 1, TW_CEXP = 2, TW_NEXT = 3,
Chris@82 875 TW_FULL = 4, TW_HALF = 5 };
Chris@82 876
/* One instruction of the little language: an unsigned opcode byte and two
   small signed operands.  The meaning of v and i is not visible in this
   header. */
Chris@82 877 typedef struct {
Chris@82 878 unsigned char op;
Chris@82 879 signed char v;
Chris@82 880 short i;
Chris@82 881 } tw_instr;
Chris@82 882
/* A twiddle-factor table together with the parameters and instruction
   program it is associated with. */
Chris@82 883 typedef struct twid_s {
Chris@82 884 R *W; /* array of twiddle factors */
Chris@82 885 INT n, r, m; /* transform order, radix, # twiddle rows */
Chris@82 886 int refcnt; /* NOTE(review): presumably a sharing count -- confirm */
Chris@82 887 const tw_instr *instr;
Chris@82 888 struct twid_s *cdr; /* link to another twid (list-style naming) */
Chris@82 889 enum wakefulness wakefulness;
Chris@82 890 } twid;
Chris@82 891
/* twiddle_awake receives the table through a twid ** (pp), so it can
   replace the caller's pointer. */
Chris@82 892 INT X(twiddle_length)(INT r, const tw_instr *p);
Chris@82 893 void X(twiddle_awake)(enum wakefulness wakefulness,
Chris@82 894 twid **pp, const tw_instr *instr, INT n, INT r, INT m);
Chris@82 895
Chris@82 896 /*-----------------------------------------------------------------------*/
Chris@82 897 /* trig.c */
/* trigreal is the floating-point type used by the trig generator below:
   long double if TRIGREAL_IS_LONG_DOUBLE is defined, __float128 if
   TRIGREAL_IS_QUAD is defined, otherwise double. */
Chris@82 898 #if defined(TRIGREAL_IS_LONG_DOUBLE)
Chris@82 899 typedef long double trigreal;
Chris@82 900 #elif defined(TRIGREAL_IS_QUAD)
Chris@82 901 typedef __float128 trigreal;
Chris@82 902 #else
Chris@82 903 typedef double trigreal;
Chris@82 904 #endif
Chris@82 905
Chris@82 906 typedef struct triggen_s triggen;
Chris@82 907
/* Trig generator object: three function pointers that take the generator
   itself as first argument, followed by its state. */
Chris@82 908 struct triggen_s {
Chris@82 909 void (*cexp)(triggen *t, INT m, R *result); /* result written as R */
Chris@82 910 void (*cexpl)(triggen *t, INT m, trigreal *result); /* result written as trigreal */
Chris@82 911 void (*rotate)(triggen *p, INT m, R xr, R xi, R *res);
Chris@82 912
/* NOTE(review): field meanings are not visible here; the names suggest a
   shift, radix and mask used to index the W0/W1 tables -- confirm in trig.c. */
Chris@82 913 INT twshft;
Chris@82 914 INT twradix;
Chris@82 915 INT twmsk;
Chris@82 916 trigreal *W0, *W1;
Chris@82 917 INT n;
Chris@82 918 };
Chris@82 919
Chris@82 920 triggen *X(mktriggen)(enum wakefulness wakefulness, INT n);
Chris@82 921 void X(triggen_destroy)(triggen *p);
Chris@82 922
Chris@82 923 /*-----------------------------------------------------------------------*/
Chris@82 924 /* primes.c: */
Chris@82 925
/* MULMOD(x, y, p): when x + y <= 92681 (tested as x <= 92681 - y) the
   result is (x * y) % p computed directly; otherwise the operands are
   passed to X(safe_mulmod).  For non-negative x and y the bound gives
   x * y <= 46340 * 46341 = 2147441940, which is below 2^31 - 1, so the
   direct product fits in a signed 32-bit integer.
   x and y are each evaluated twice: pass side-effect-free expressions. */
Chris@82 926 #define MULMOD(x, y, p) \
Chris@82 927 (((x) <= 92681 - (y)) ? ((x) * (y)) % (p) : X(safe_mulmod)(x, y, p))
Chris@82 928
/* Integer helpers declared for primes.c (per the section header); only the
   prototypes are visible here.  safe_mulmod is the fallback used by the
   MULMOD macro.  factors_into takes a pointer to INT.
   NOTE(review): presumably a list of primes with a terminator element --
   confirm in primes.c. */
Chris@82 929 INT X(safe_mulmod)(INT x, INT y, INT p);
Chris@82 930 INT X(power_mod)(INT n, INT m, INT p);
Chris@82 931 INT X(find_generator)(INT p);
Chris@82 932 INT X(first_divisor)(INT n);
Chris@82 933 int X(is_prime)(INT n);
Chris@82 934 INT X(next_prime)(INT n);
Chris@82 935 int X(factors_into)(INT n, const INT *primes);
Chris@82 936 int X(factors_into_small_primes)(INT n);
Chris@82 937 INT X(choose_radix)(INT r, INT n);
Chris@82 938 INT X(isqrt)(INT n);
Chris@82 939 INT X(modulo)(INT a, INT n);
Chris@82 940
Chris@82 941 #define GENERIC_MIN_BAD 173 /* min prime for which generic becomes bad */
Chris@82 942
Chris@82 943 /* thresholds below which certain solvers are considered SLOW. These are guesses
Chris@82 944 believed to be conservative */
Chris@82 945 #define GENERIC_MAX_SLOW 16
Chris@82 946 #define RADER_MAX_SLOW 32
Chris@82 947 #define BLUESTEIN_MAX_SLOW 24
Chris@82 948
Chris@82 949 /*-----------------------------------------------------------------------*/
Chris@82 950 /* rader.c: */
/* rader_tl is only forward-declared here (struct rader_tls has no visible
   definition), so callers handle it through pointers.  insert and delete
   take a rader_tl ** and can therefore update the caller's pointer; find
   takes three INT values and returns an R *.
   NOTE(review): presumably k1, k2, k3 form the lookup key for W -- confirm
   in rader.c. */
Chris@82 951 typedef struct rader_tls rader_tl;
Chris@82 952
Chris@82 953 void X(rader_tl_insert)(INT k1, INT k2, INT k3, R *W, rader_tl **tl);
Chris@82 954 R *X(rader_tl_find)(INT k1, INT k2, INT k3, rader_tl *t);
Chris@82 955 void X(rader_tl_delete)(R *W, rader_tl **tl);
Chris@82 956
Chris@82 957 /*-----------------------------------------------------------------------*/
Chris@82 958 /* copy/transposition routines */
Chris@82 959
Chris@82 960 /* lower bound to the cache size, for tiled routines */
Chris@82 961 #define CACHESIZE 8192
Chris@82 962
Chris@82 963 INT X(compute_tilesz)(INT vl, int how_many_tiles_in_cache);
Chris@82 964
/* tile2d takes two pairs of bounds (n0l, n0u) and (n1l, n1u), a tile size,
   and a callback f that receives bounds in the same layout plus the opaque
   args pointer. */
Chris@82 965 void X(tile2d)(INT n0l, INT n0u, INT n1l, INT n1u, INT tilesz,
Chris@82 966 void (*f)(INT n0l, INT n0u, INT n1l, INT n1u, void *args),
Chris@82 967 void *args);
Chris@82 968 void X(cpy1d)(R *I, R *O, INT n0, INT is0, INT os0, INT vl);
Chris@82 969 void X(zero1d_pair)(R *O0, R *O1, INT n0, INT os0);
/* The five cpy2d variants below share one signature (see cpy2d_func).
   NOTE(review): presumably n0/n1 are extents and is / os the input / output
   strides per dimension -- confirm in the definitions. */
Chris@82 970 void X(cpy2d)(R *I, R *O,
Chris@82 971 INT n0, INT is0, INT os0,
Chris@82 972 INT n1, INT is1, INT os1,
Chris@82 973 INT vl);
Chris@82 974 void X(cpy2d_ci)(R *I, R *O,
Chris@82 975 INT n0, INT is0, INT os0,
Chris@82 976 INT n1, INT is1, INT os1,
Chris@82 977 INT vl);
Chris@82 978 void X(cpy2d_co)(R *I, R *O,
Chris@82 979 INT n0, INT is0, INT os0,
Chris@82 980 INT n1, INT is1, INT os1,
Chris@82 981 INT vl);
Chris@82 982 void X(cpy2d_tiled)(R *I, R *O,
Chris@82 983 INT n0, INT is0, INT os0,
Chris@82 984 INT n1, INT is1, INT os1,
Chris@82 985 INT vl);
Chris@82 986 void X(cpy2d_tiledbuf)(R *I, R *O,
Chris@82 987 INT n0, INT is0, INT os0,
Chris@82 988 INT n1, INT is1, INT os1,
Chris@82 989 INT vl);
/* The _pair variants take two input and two output pointers and have no
   vl parameter. */
Chris@82 990 void X(cpy2d_pair)(R *I0, R *I1, R *O0, R *O1,
Chris@82 991 INT n0, INT is0, INT os0,
Chris@82 992 INT n1, INT is1, INT os1);
Chris@82 993 void X(cpy2d_pair_ci)(R *I0, R *I1, R *O0, R *O1,
Chris@82 994 INT n0, INT is0, INT os0,
Chris@82 995 INT n1, INT is1, INT os1);
Chris@82 996 void X(cpy2d_pair_co)(R *I0, R *I1, R *O0, R *O1,
Chris@82 997 INT n0, INT is0, INT os0,
Chris@82 998 INT n1, INT is1, INT os1);
Chris@82 999
/* The transpose routines take a single array pointer (no separate output)
   and share one signature (see transpose_func). */
Chris@82 1000 void X(transpose)(R *I, INT n, INT s0, INT s1, INT vl);
Chris@82 1001 void X(transpose_tiled)(R *I, INT n, INT s0, INT s1, INT vl);
Chris@82 1002 void X(transpose_tiledbuf)(R *I, INT n, INT s0, INT s1, INT vl);
Chris@82 1003
/* Function-pointer types matching the transpose and cpy2d signatures above. */
Chris@82 1004 typedef void (*transpose_func)(R *I, INT n, INT s0, INT s1, INT vl);
Chris@82 1005 typedef void (*cpy2d_func)(R *I, R *O,
Chris@82 1006 INT n0, INT is0, INT os0,
Chris@82 1007 INT n1, INT is1, INT os1,
Chris@82 1008 INT vl);
Chris@82 1009
Chris@82 1010 /*-----------------------------------------------------------------------*/
Chris@82 1011 /* misc stuff */
/* Miscellaneous helpers; only prototypes are visible in this header. */
Chris@82 1012 void X(null_awake)(plan *ego, enum wakefulness wakefulness);
Chris@82 1013 double X(iestimate_cost)(const planner *, const plan *, const problem *);
Chris@82 1014
/* The seed variable is declared only when FFTW_RANDOM_ESTIMATOR is defined. */
Chris@82 1015 #ifdef FFTW_RANDOM_ESTIMATOR
Chris@82 1016 extern unsigned X(random_estimate_seed);
Chris@82 1017 #endif
Chris@82 1018
Chris@82 1019 double X(measure_execution_time)(const planner *plnr,
Chris@82 1020 plan *pln, const problem *p);
/* ialignment_of is the only declaration in this group marked IFFTW_EXTERN. */
Chris@82 1021 IFFTW_EXTERN int X(ialignment_of)(R *p);
Chris@82 1022 unsigned X(hash)(const char *s);
Chris@82 1023 INT X(nbuf)(INT n, INT vl, INT maxnbuf);
/* nbuf_redundant: maxnbuf points to nmaxnbuf INT values and `which` is a
   size_t.  NOTE(review): presumably an index into that array -- confirm. */
Chris@82 1024 int X(nbuf_redundant)(INT n, INT vl, size_t which,
Chris@82 1025 const INT *maxnbuf, size_t nmaxnbuf);
Chris@82 1026 INT X(bufdist)(INT n, INT vl);
Chris@82 1027 int X(toobig)(INT n);
Chris@82 1028 int X(ct_uglyp)(INT min_n, INT v, INT n, INT r);
Chris@82 1029
/*
 * Pointer "taint" helpers.
 *
 * With HAVE_SIMD: TAINT and JOIN_TAINT forward to X(taint) and
 * X(join_taint); UNTAINT clears the two low-order bits of the pointer
 * value and TAINTOF extracts them.
 *
 * Without HAVE_SIMD the macros do nothing: TAINT and UNTAINT yield p,
 * TAINTOF is 0, and JOIN_TAINT yields p1 (s and p2 are not evaluated).
 * Every replacement list is parenthesized, so a pointer expression
 * argument (e.g. JOIN_TAINT(p + 2, q)) stays a single operand in the
 * surrounding expression.
 */
#if HAVE_SIMD
R *X(taint)(R *p, INT s);
R *X(join_taint)(R *p1, R *p2);
#define TAINT(p, s) X(taint)(p, s)
#define UNTAINT(p) ((R *) (((uintptr_t) (p)) & ~(uintptr_t)3))
#define TAINTOF(p) (((uintptr_t)(p)) & 3)
#define JOIN_TAINT(p1, p2) X(join_taint)(p1, p2)
#else
#define TAINT(p, s) (p)
#define UNTAINT(p) (p)
#define TAINTOF(p) 0
#define JOIN_TAINT(p1, p2) (p1)
#endif
Chris@82 1043
Chris@82 1044 #define ASSERT_ALIGNED_DOUBLE /*unused, legacy*/
Chris@82 1045
Chris@82 1046 /*-----------------------------------------------------------------------*/
Chris@82 1047 /* macros used in codelets to reduce source code size */
Chris@82 1048
Chris@82 1049 typedef R E; /* internal precision of codelets. */
Chris@82 1050
/* K(x) casts the constant x to E.  When FFTW_LDOUBLE or FFTW_QUAD is
   defined, the suffix L or Q is first pasted onto x with ##, so x must be
   a numeric literal the suffix can attach to, not an arbitrary expression
   (which is also why x is not parenthesized). */
Chris@82 1051 #if defined(FFTW_LDOUBLE)
Chris@82 1052 # define K(x) ((E) x##L)
Chris@82 1053 #elif defined(FFTW_QUAD)
Chris@82 1054 # define K(x) ((E) x##Q)
Chris@82 1055 #else
Chris@82 1056 # define K(x) ((E) x)
Chris@82 1057 #endif
/* DK(name, value) declares `const E name` initialized to K(value); the
   terminating semicolon is supplied at the point of use. */
Chris@82 1058 #define DK(name, value) const E name = K(value)
Chris@82 1059
Chris@82 1060 /* FMA macros */
Chris@82 1061
Chris@82 1062 #if defined(__GNUC__) && (defined(__powerpc__) || defined(__ppc__) || defined(_POWER))
Chris@82 1063 /* The obvious expression a * b + c does not work. If both x = a * b
Chris@82 1064 + c and y = a * b - c appear in the source, gcc computes t = a * b,
Chris@82 1065 x = t + c, y = t - c, thus destroying the fma.
Chris@82 1066
Chris@82 1067 This peculiar coding seems to do the right thing on all of
Chris@82 1068 gcc-2.95, gcc-3.1, gcc-3.2, and gcc-3.3. It does the right thing
Chris@82 1069 on gcc-3.4 -fno-web (because the ``web'' pass splits the variable
Chris@82 1070 `x' for the single-assignment form).
Chris@82 1071
Chris@82 1072 However, gcc-4.0 is a formidable adversary which succeeds in
Chris@82 1073 pessimizing two fma's into one multiplication and two additions.
Chris@82 1074 It does it very early in the game---before the optimization passes
Chris@82 1075 even start. The only real workaround seems to be to use fake inline asm
Chris@82 1076 such as
Chris@82 1077
Chris@82 1078 asm ("# confuse gcc %0" : "=f"(a) : "0"(a));
Chris@82 1079 return a * b + c;
Chris@82 1080
Chris@82 1081 in each of the FMA, FMS, FNMA, and FNMS functions. However, this
Chris@82 1082 does not solve the problem either, because two equal asm statements
Chris@82 1083 count as a common subexpression! One must use *different* fake asm
Chris@82 1084 statements:
Chris@82 1085
Chris@82 1086 in FMA:
Chris@82 1087 asm ("# confuse gcc for fma %0" : "=f"(a) : "0"(a));
Chris@82 1088
Chris@82 1089 in FMS:
Chris@82 1090 asm ("# confuse gcc for fms %0" : "=f"(a) : "0"(a));
Chris@82 1091
Chris@82 1092 etc.
Chris@82 1093
Chris@82 1094 After these changes, gcc recalcitrantly generates the fma that was
Chris@82 1095 in the source to begin with. However, the extra asm() cruft
Chris@82 1096 confuses other passes of gcc, notably the instruction scheduler.
Chris@82 1097 (Of course, one could also generate the fma directly via inline
Chris@82 1098 asm, but this confuses the scheduler even more.)
Chris@82 1099
Chris@82 1100 Steven and I have submitted more than one bug report to the gcc
Chris@82 1101 mailing list over the past few years, to no effect. Thus, I give
Chris@82 1102 up. gcc-4.0 can go to hell. I'll wait at least until gcc-4.3 is
Chris@82 1103 out before touching this crap again.
Chris@82 1104 */
/* Inline versions used when the enclosing #if selects gcc on PowerPC.
   Each one forms the product in x and then updates x in a separate
   statement; this two-step coding is deliberate (see the comment above)
   and must not be collapsed into a single expression. */

/* FMA: a * b + c */
Chris@82 1105 static __inline__ E FMA(E a, E b, E c)
Chris@82 1106 {
Chris@82 1107 E x = a * b;
Chris@82 1108 x = x + c;
Chris@82 1109 return x;
Chris@82 1110 }
Chris@82 1111
/* FMS: a * b - c */
Chris@82 1112 static __inline__ E FMS(E a, E b, E c)
Chris@82 1113 {
Chris@82 1114 E x = a * b;
Chris@82 1115 x = x - c;
Chris@82 1116 return x;
Chris@82 1117 }
Chris@82 1118
/* FNMA: -(a * b + c) */
Chris@82 1119 static __inline__ E FNMA(E a, E b, E c)
Chris@82 1120 {
Chris@82 1121 E x = a * b;
Chris@82 1122 x = - (x + c);
Chris@82 1123 return x;
Chris@82 1124 }
Chris@82 1125
/* FNMS: -(a * b - c) */
Chris@82 1126 static __inline__ E FNMS(E a, E b, E c)
Chris@82 1127 {
Chris@82 1128 E x = a * b;
Chris@82 1129 x = - (x - c);
Chris@82 1130 return x;
Chris@82 1131 }
Chris@82 1132 #else
/* Generic macro versions, used when the enclosing #if condition is false:
   FMA  = a*b + c,  FMS = a*b - c,  FNMA = -(a*b + c),  FNMS = c - a*b.
   Every argument is parenthesized and used exactly once.  Note that FNMS
   is written as a subtraction from c, not as a negation like FNMA. */
Chris@82 1133 #define FMA(a, b, c) (((a) * (b)) + (c))
Chris@82 1134 #define FMS(a, b, c) (((a) * (b)) - (c))
Chris@82 1135 #define FNMA(a, b, c) (- (((a) * (b)) + (c)))
Chris@82 1136 #define FNMS(a, b, c) ((c) - ((a) * (b)))
Chris@82 1137 #endif
Chris@82 1138
Chris@82 1139 #ifdef __cplusplus
Chris@82 1140 } /* extern "C" */
Chris@82 1141 #endif /* __cplusplus */
Chris@82 1142
Chris@82 1143 #endif /* __IFFTW_H__ */