annotate fft/fftw/fftw-3.3.4/kernel/ifftw.h @ 40:223f770b5341 kissfft-double tip

Try a double-precision kissfft
author Chris Cannam
date Wed, 07 Sep 2016 10:40:32 +0100
parents 26056e866c29
children
rev   line source
Chris@19 1 /*
Chris@19 2 * Copyright (c) 2003, 2007-14 Matteo Frigo
Chris@19 3 * Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
Chris@19 4 *
Chris@19 5 * This program is free software; you can redistribute it and/or modify
Chris@19 6 * it under the terms of the GNU General Public License as published by
Chris@19 7 * the Free Software Foundation; either version 2 of the License, or
Chris@19 8 * (at your option) any later version.
Chris@19 9 *
Chris@19 10 * This program is distributed in the hope that it will be useful,
Chris@19 11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
Chris@19 12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
Chris@19 13 * GNU General Public License for more details.
Chris@19 14 *
Chris@19 15 * You should have received a copy of the GNU General Public License
Chris@19 16 * along with this program; if not, write to the Free Software
Chris@19 17 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
Chris@19 18 *
Chris@19 19 */
Chris@19 20
Chris@19 21
Chris@19 22 /* FFTW internal header file */
Chris@19 23 #ifndef __IFFTW_H__
Chris@19 24 #define __IFFTW_H__
Chris@19 25
Chris@19 26 #include "config.h"
Chris@19 27
Chris@19 28 #include <stdlib.h> /* size_t */
Chris@19 29 #include <stdarg.h> /* va_list */
Chris@19 30 #include <stddef.h> /* ptrdiff_t */
Chris@19 31
Chris@19 32 #if HAVE_SYS_TYPES_H
Chris@19 33 # include <sys/types.h>
Chris@19 34 #endif
Chris@19 35
Chris@19 36 #if HAVE_STDINT_H
Chris@19 37 # include <stdint.h> /* uintptr_t, maybe */
Chris@19 38 #endif
Chris@19 39
Chris@19 40 #if HAVE_INTTYPES_H
Chris@19 41 # include <inttypes.h> /* uintptr_t, maybe */
Chris@19 42 #endif
Chris@19 43
Chris@19 44 #ifdef __cplusplus
Chris@19 45 extern "C"
Chris@19 46 {
Chris@19 47 #endif /* __cplusplus */
Chris@19 48
Chris@19 49 /* Windows annoyances -- since tests/hook.c uses some internal
Chris@19 50 FFTW functions, we need to give them the dllexport attribute
Chris@19 51 under Windows when compiling as a DLL (see api/fftw3.h). */
Chris@19 52 #if defined(FFTW_EXTERN)
Chris@19 53 # define IFFTW_EXTERN FFTW_EXTERN
Chris@19 54 #elif (defined(FFTW_DLL) || defined(DLL_EXPORT)) \
Chris@19 55 && (defined(_WIN32) || defined(__WIN32__))
Chris@19 56 # define IFFTW_EXTERN extern __declspec(dllexport)
Chris@19 57 #else
Chris@19 58 # define IFFTW_EXTERN extern
Chris@19 59 #endif
Chris@19 60
Chris@19 61 /* determine precision and name-mangling scheme */
Chris@19 62 #define CONCAT(prefix, name) prefix ## name
Chris@19 63 #if defined(FFTW_SINGLE)
Chris@19 64 typedef float R;
Chris@19 65 # define X(name) CONCAT(fftwf_, name)
Chris@19 66 #elif defined(FFTW_LDOUBLE)
Chris@19 67 typedef long double R;
Chris@19 68 # define X(name) CONCAT(fftwl_, name)
Chris@19 69 # define TRIGREAL_IS_LONG_DOUBLE
Chris@19 70 #elif defined(FFTW_QUAD)
Chris@19 71 typedef __float128 R;
Chris@19 72 # define X(name) CONCAT(fftwq_, name)
Chris@19 73 # define TRIGREAL_IS_QUAD
Chris@19 74 #else
Chris@19 75 typedef double R;
Chris@19 76 # define X(name) CONCAT(fftw_, name)
Chris@19 77 #endif
Chris@19 78
Chris@19 79 /*
Chris@19 80 integral type large enough to contain a stride (what ``int'' should
Chris@19 81 have been in the first place).
Chris@19 82 */
Chris@19 83 typedef ptrdiff_t INT;
Chris@19 84
/* Dummy use of an unused parameter to silence compiler warnings.  The
   argument is parenthesized so that the cast applies to the whole
   expression, e.g. UNUSED(a + b), rather than to its first operand. */
#define UNUSED(x) (void)(x)
Chris@19 87
Chris@19 88 #define NELEM(array) ((int) (sizeof(array) / sizeof((array)[0])))
Chris@19 89
Chris@19 90 #define FFT_SIGN (-1) /* sign convention for forward transforms */
Chris@19 91 extern void X(extract_reim)(int sign, R *c, R **r, R **i);
Chris@19 92
Chris@19 93 #define REGISTER_SOLVER(p, s) X(solver_register)(p, s)
Chris@19 94
Chris@19 95 #define STRINGIZEx(x) #x
Chris@19 96 #define STRINGIZE(x) STRINGIZEx(x)
Chris@19 97 #define CIMPLIES(ante, post) (!(ante) || (post))
Chris@19 98
Chris@19 99 /* define HAVE_SIMD if any simd extensions are supported */
Chris@19 100 #if defined(HAVE_SSE) || defined(HAVE_SSE2) || defined(HAVE_ALTIVEC) || \
Chris@19 101 defined(HAVE_MIPS_PS) || defined(HAVE_AVX)
Chris@19 102 #define HAVE_SIMD 1
Chris@19 103 #else
Chris@19 104 #define HAVE_SIMD 0
Chris@19 105 #endif
Chris@19 106
Chris@19 107 extern int X(have_simd_sse2)(void);
Chris@19 108 extern int X(have_simd_avx)(void);
Chris@19 109 extern int X(have_simd_altivec)(void);
Chris@19 110 extern int X(have_simd_neon)(void);
Chris@19 111
Chris@19 112 /* forward declarations */
Chris@19 113 typedef struct problem_s problem;
Chris@19 114 typedef struct plan_s plan;
Chris@19 115 typedef struct solver_s solver;
Chris@19 116 typedef struct planner_s planner;
Chris@19 117 typedef struct printer_s printer;
Chris@19 118 typedef struct scanner_s scanner;
Chris@19 119
Chris@19 120 /*-----------------------------------------------------------------------*/
Chris@19 121 /* alloca: */
Chris@19 122 #if HAVE_SIMD
Chris@19 123 # ifdef HAVE_AVX
Chris@19 124 # define MIN_ALIGNMENT 32 /* best alignment for AVX, conservative for
Chris@19 125 * everything else */
Chris@19 126 # else
Chris@19 127 /* Note that we cannot use 32-byte alignment for all SIMD. For
Chris@19 128 example, MacOS X malloc is 16-byte aligned, but there was no
Chris@19 129 posix_memalign in MacOS X until version 10.6. */
Chris@19 130 # define MIN_ALIGNMENT 16
Chris@19 131 # endif
Chris@19 132 #endif
Chris@19 133
Chris@19 134 #if defined(HAVE_ALLOCA) && defined(FFTW_ENABLE_ALLOCA)
Chris@19 135 /* use alloca if available */
Chris@19 136
Chris@19 137 #ifndef alloca
Chris@19 138 #ifdef __GNUC__
Chris@19 139 # define alloca __builtin_alloca
Chris@19 140 #else
Chris@19 141 # ifdef _MSC_VER
Chris@19 142 # include <malloc.h>
Chris@19 143 # define alloca _alloca
Chris@19 144 # else
Chris@19 145 # if HAVE_ALLOCA_H
Chris@19 146 # include <alloca.h>
Chris@19 147 # else
Chris@19 148 # ifdef _AIX
Chris@19 149 #pragma alloca
Chris@19 150 # else
Chris@19 151 # ifndef alloca /* predefined by HP cc +Olibcalls */
Chris@19 152 void *alloca(size_t);
Chris@19 153 # endif
Chris@19 154 # endif
Chris@19 155 # endif
Chris@19 156 # endif
Chris@19 157 #endif
Chris@19 158 #endif
Chris@19 159
Chris@19 160 # ifdef MIN_ALIGNMENT
Chris@19 161 # define STACK_MALLOC(T, p, n) \
Chris@19 162 { \
Chris@19 163 p = (T)alloca((n) + MIN_ALIGNMENT); \
Chris@19 164 p = (T)(((uintptr_t)p + (MIN_ALIGNMENT - 1)) & \
Chris@19 165 (~(uintptr_t)(MIN_ALIGNMENT - 1))); \
Chris@19 166 }
Chris@19 167 # define STACK_FREE(n)
Chris@19 168 # else /* HAVE_ALLOCA && !defined(MIN_ALIGNMENT) */
Chris@19 169 # define STACK_MALLOC(T, p, n) p = (T)alloca(n)
Chris@19 170 # define STACK_FREE(n)
Chris@19 171 # endif
Chris@19 172
Chris@19 173 #else /* ! HAVE_ALLOCA */
Chris@19 174 /* use malloc instead of alloca */
Chris@19 175 # define STACK_MALLOC(T, p, n) p = (T)MALLOC(n, OTHER)
Chris@19 176 # define STACK_FREE(n) X(ifree)(n)
Chris@19 177 #endif /* ! HAVE_ALLOCA */
Chris@19 178
/* Allocation of temporary buffers.  Requests smaller than
   MAX_STACK_ALLOC bytes go through STACK_MALLOC (hopefully reducing to
   alloca()); larger requests go through MALLOC. */

/* 64KiB ought to be enough for anybody */
#define MAX_STACK_ALLOC ((size_t)64 * 1024)

/* BUF_ALLOC(T, p, n): assign to p, cast to type T, a buffer of n bytes.
   n is parenthesized wherever it is used so that an expression argument
   (for instance a conditional expression) is treated as a single value
   in the size test. */
#define BUF_ALLOC(T, p, n)                      \
{                                               \
     if ((n) < MAX_STACK_ALLOC) {               \
          STACK_MALLOC(T, p, (n));              \
     } else {                                   \
          p = (T)MALLOC((n), BUFFERS);          \
     }                                          \
}

/* BUF_FREE(p, n): release a buffer obtained from BUF_ALLOC.  n must fall
   on the same side of MAX_STACK_ALLOC as the size given to BUF_ALLOC,
   because the same test selects STACK_FREE or X(ifree). */
#define BUF_FREE(p, n)                          \
{                                               \
     if ((n) < MAX_STACK_ALLOC) {               \
          STACK_FREE(p);                        \
     } else {                                   \
          X(ifree)(p);                          \
     }                                          \
}
Chris@19 202
Chris@19 203 /*-----------------------------------------------------------------------*/
Chris@19 204 /* define uintptr_t if it is not already defined */
Chris@19 205
Chris@19 206 #ifndef HAVE_UINTPTR_T
Chris@19 207 # if SIZEOF_VOID_P == 0
Chris@19 208 # error sizeof void* is unknown!
Chris@19 209 # elif SIZEOF_UNSIGNED_INT == SIZEOF_VOID_P
Chris@19 210 typedef unsigned int uintptr_t;
Chris@19 211 # elif SIZEOF_UNSIGNED_LONG == SIZEOF_VOID_P
Chris@19 212 typedef unsigned long uintptr_t;
Chris@19 213 # elif SIZEOF_UNSIGNED_LONG_LONG == SIZEOF_VOID_P
Chris@19 214 typedef unsigned long long uintptr_t;
Chris@19 215 # else
Chris@19 216 # error no unsigned integer type matches void* sizeof!
Chris@19 217 # endif
Chris@19 218 #endif
Chris@19 219
Chris@19 220 /*-----------------------------------------------------------------------*/
/* We can do an optimization for copying pairs of (aligned) floats
   when in single precision if 2*float = double.

   FFTW_2R_IS_DOUBLE is evaluated here, once, to a plain 0 or 1.  A
   macro whose expansion contains `defined' has undefined behavior when
   it is later used in #if (C99/C11 6.10.1), and cannot be used in an
   ordinary C expression at all. */
#if defined(FFTW_SINGLE) \
    && SIZEOF_FLOAT != 0 \
    && SIZEOF_DOUBLE == 2*SIZEOF_FLOAT
# define FFTW_2R_IS_DOUBLE 1
#else
# define FFTW_2R_IS_DOUBLE 0
#endif
Chris@19 227
Chris@19 228 #define DOUBLE_ALIGNED(p) ((((uintptr_t)(p)) % sizeof(double)) == 0)
Chris@19 229
Chris@19 230 /*-----------------------------------------------------------------------*/
Chris@19 231 /* assert.c: */
Chris@19 232 IFFTW_EXTERN void X(assertion_failed)(const char *s,
Chris@19 233 int line, const char *file);
Chris@19 234
Chris@19 235 /* always check */
Chris@19 236 #define CK(ex) \
Chris@19 237 (void)((ex) || (X(assertion_failed)(#ex, __LINE__, __FILE__), 0))
Chris@19 238
Chris@19 239 #ifdef FFTW_DEBUG
Chris@19 240 /* check only if debug enabled */
Chris@19 241 #define A(ex) \
Chris@19 242 (void)((ex) || (X(assertion_failed)(#ex, __LINE__, __FILE__), 0))
Chris@19 243 #else
Chris@19 244 #define A(ex) /* nothing */
Chris@19 245 #endif
Chris@19 246
Chris@19 247 extern void X(debug)(const char *format, ...);
Chris@19 248 #define D X(debug)
Chris@19 249
Chris@19 250 /*-----------------------------------------------------------------------*/
Chris@19 251 /* kalloc.c: */
Chris@19 252 extern void *X(kernel_malloc)(size_t n);
Chris@19 253 extern void X(kernel_free)(void *p);
Chris@19 254
Chris@19 255 /*-----------------------------------------------------------------------*/
Chris@19 256 /* alloc.c: */
Chris@19 257
Chris@19 258 /* objects allocated by malloc, for statistical purposes */
Chris@19 259 enum malloc_tag {
Chris@19 260 EVERYTHING,
Chris@19 261 PLANS,
Chris@19 262 SOLVERS,
Chris@19 263 PROBLEMS,
Chris@19 264 BUFFERS,
Chris@19 265 HASHT,
Chris@19 266 TENSORS,
Chris@19 267 PLANNERS,
Chris@19 268 SLVDESCS,
Chris@19 269 TWIDDLES,
Chris@19 270 STRIDES,
Chris@19 271 OTHER,
Chris@19 272 MALLOC_WHAT_LAST /* must be last */
Chris@19 273 };
Chris@19 274
Chris@19 275 IFFTW_EXTERN void X(ifree)(void *ptr);
Chris@19 276 extern void X(ifree0)(void *ptr);
Chris@19 277
Chris@19 278 #ifdef FFTW_DEBUG_MALLOC
Chris@19 279
Chris@19 280 IFFTW_EXTERN void *X(malloc_debug)(size_t n, enum malloc_tag what,
Chris@19 281 const char *file, int line);
Chris@19 282 #define MALLOC(n, what) X(malloc_debug)(n, what, __FILE__, __LINE__)
Chris@19 283 IFFTW_EXTERN void X(malloc_print_minfo)(int vrbose);
Chris@19 284
Chris@19 285 #else /* ! FFTW_DEBUG_MALLOC */
Chris@19 286
Chris@19 287 IFFTW_EXTERN void *X(malloc_plain)(size_t sz);
Chris@19 288 #define MALLOC(n, what) X(malloc_plain)(n)
Chris@19 289
Chris@19 290 #endif
Chris@19 291
Chris@19 292 #if defined(FFTW_DEBUG) && defined(FFTW_DEBUG_MALLOC) && (defined(HAVE_THREADS) || defined(HAVE_OPENMP))
Chris@19 293 extern int X(in_thread);
Chris@19 294 # define IN_THREAD X(in_thread)
Chris@19 295 # define THREAD_ON { int in_thread_save = X(in_thread); X(in_thread) = 1
Chris@19 296 # define THREAD_OFF X(in_thread) = in_thread_save; }
Chris@19 297 #else
Chris@19 298 # define IN_THREAD 0
Chris@19 299 # define THREAD_ON
Chris@19 300 # define THREAD_OFF
Chris@19 301 #endif
Chris@19 302
Chris@19 303 /*-----------------------------------------------------------------------*/
Chris@19 304 /* low-resolution clock */
Chris@19 305
Chris@19 306 #ifdef FAKE_CRUDE_TIME
Chris@19 307 typedef int crude_time;
Chris@19 308 #else
Chris@19 309 # if TIME_WITH_SYS_TIME
Chris@19 310 # include <sys/time.h>
Chris@19 311 # include <time.h>
Chris@19 312 # else
Chris@19 313 # if HAVE_SYS_TIME_H
Chris@19 314 # include <sys/time.h>
Chris@19 315 # else
Chris@19 316 # include <time.h>
Chris@19 317 # endif
Chris@19 318 # endif
Chris@19 319
Chris@19 320 # ifdef HAVE_BSDGETTIMEOFDAY
Chris@19 321 # ifndef HAVE_GETTIMEOFDAY
Chris@19 322 # define gettimeofday BSDgettimeofday
Chris@19 323 # define HAVE_GETTIMEOFDAY 1
Chris@19 324 # endif
Chris@19 325 # endif
Chris@19 326
Chris@19 327 # if defined(HAVE_GETTIMEOFDAY)
Chris@19 328 typedef struct timeval crude_time;
Chris@19 329 # else
Chris@19 330 typedef clock_t crude_time;
Chris@19 331 # endif
Chris@19 332 #endif /* else FAKE_CRUDE_TIME */
Chris@19 333
Chris@19 334 crude_time X(get_crude_time)(void);
Chris@19 335 double X(elapsed_since)(const planner *plnr, const problem *p,
Chris@19 336 crude_time t0); /* time in seconds since t0 */
Chris@19 337
Chris@19 338 /*-----------------------------------------------------------------------*/
Chris@19 339 /* ops.c: */
Chris@19 340 /*
Chris@19 341 * ops counter. The total number of additions is add + fma
Chris@19 342 * and the total number of multiplications is mul + fma.
Chris@19 343 * Total flops = add + mul + 2 * fma
Chris@19 344 */
Chris@19 345 typedef struct {
Chris@19 346 double add;
Chris@19 347 double mul;
Chris@19 348 double fma;
Chris@19 349 double other;
Chris@19 350 } opcnt;
Chris@19 351
Chris@19 352 void X(ops_zero)(opcnt *dst);
Chris@19 353 void X(ops_other)(INT o, opcnt *dst);
Chris@19 354 void X(ops_cpy)(const opcnt *src, opcnt *dst);
Chris@19 355
Chris@19 356 void X(ops_add)(const opcnt *a, const opcnt *b, opcnt *dst);
Chris@19 357 void X(ops_add2)(const opcnt *a, opcnt *dst);
Chris@19 358
Chris@19 359 /* dst = m * a + b */
Chris@19 360 void X(ops_madd)(INT m, const opcnt *a, const opcnt *b, opcnt *dst);
Chris@19 361
Chris@19 362 /* dst += m * a */
Chris@19 363 void X(ops_madd2)(INT m, const opcnt *a, opcnt *dst);
Chris@19 364
Chris@19 365
Chris@19 366 /*-----------------------------------------------------------------------*/
Chris@19 367 /* minmax.c: */
Chris@19 368 INT X(imax)(INT a, INT b);
Chris@19 369 INT X(imin)(INT a, INT b);
Chris@19 370
Chris@19 371 /*-----------------------------------------------------------------------*/
Chris@19 372 /* iabs.c: */
Chris@19 373 INT X(iabs)(INT a);
Chris@19 374
Chris@19 375 /* inline version */
Chris@19 376 #define IABS(x) (((x) < 0) ? (0 - (x)) : (x))
Chris@19 377
Chris@19 378 /*-----------------------------------------------------------------------*/
Chris@19 379 /* md5.c */
Chris@19 380
Chris@19 381 #if SIZEOF_UNSIGNED_INT >= 4
Chris@19 382 typedef unsigned int md5uint;
Chris@19 383 #else
Chris@19 384 typedef unsigned long md5uint; /* at least 32 bits as per C standard */
Chris@19 385 #endif
Chris@19 386
Chris@19 387 typedef md5uint md5sig[4];
Chris@19 388
Chris@19 389 typedef struct {
Chris@19 390 md5sig s; /* state and signature */
Chris@19 391
Chris@19 392 /* fields not meant to be used outside md5.c: */
Chris@19 393 unsigned char c[64]; /* stuff not yet processed */
Chris@19 394 unsigned l; /* total length. Should be 64 bits long, but this is
Chris@19 395 good enough for us */
Chris@19 396 } md5;
Chris@19 397
Chris@19 398 void X(md5begin)(md5 *p);
Chris@19 399 void X(md5putb)(md5 *p, const void *d_, size_t len);
Chris@19 400 void X(md5puts)(md5 *p, const char *s);
Chris@19 401 void X(md5putc)(md5 *p, unsigned char c);
Chris@19 402 void X(md5int)(md5 *p, int i);
Chris@19 403 void X(md5INT)(md5 *p, INT i);
Chris@19 404 void X(md5unsigned)(md5 *p, unsigned i);
Chris@19 405 void X(md5end)(md5 *p);
Chris@19 406
Chris@19 407 /*-----------------------------------------------------------------------*/
Chris@19 408 /* tensor.c: */
Chris@19 409 #define STRUCT_HACK_KR
Chris@19 410 #undef STRUCT_HACK_C99
Chris@19 411
Chris@19 412 typedef struct {
Chris@19 413 INT n;
Chris@19 414 INT is; /* input stride */
Chris@19 415 INT os; /* output stride */
Chris@19 416 } iodim;
Chris@19 417
Chris@19 418 typedef struct {
Chris@19 419 int rnk;
Chris@19 420 #if defined(STRUCT_HACK_KR)
Chris@19 421 iodim dims[1];
Chris@19 422 #elif defined(STRUCT_HACK_C99)
Chris@19 423 iodim dims[];
Chris@19 424 #else
Chris@19 425 iodim *dims;
Chris@19 426 #endif
Chris@19 427 } tensor;
Chris@19 428
Chris@19 429 /*
Chris@19 430 Definition of rank -infinity.
Chris@19 431 This definition has the property that if you want rank 0 or 1,
Chris@19 432 you can simply test for rank <= 1. This is a common case.
Chris@19 433
Chris@19 434 A tensor of rank -infinity has size 0.
Chris@19 435 */
Chris@19 436 #define RNK_MINFTY ((int)(((unsigned) -1) >> 1))
Chris@19 437 #define FINITE_RNK(rnk) ((rnk) != RNK_MINFTY)
Chris@19 438
Chris@19 439 typedef enum { INPLACE_IS, INPLACE_OS } inplace_kind;
Chris@19 440
Chris@19 441 tensor *X(mktensor)(int rnk);
Chris@19 442 tensor *X(mktensor_0d)(void);
Chris@19 443 tensor *X(mktensor_1d)(INT n, INT is, INT os);
Chris@19 444 tensor *X(mktensor_2d)(INT n0, INT is0, INT os0,
Chris@19 445 INT n1, INT is1, INT os1);
Chris@19 446 tensor *X(mktensor_3d)(INT n0, INT is0, INT os0,
Chris@19 447 INT n1, INT is1, INT os1,
Chris@19 448 INT n2, INT is2, INT os2);
Chris@19 449 tensor *X(mktensor_4d)(INT n0, INT is0, INT os0,
Chris@19 450 INT n1, INT is1, INT os1,
Chris@19 451 INT n2, INT is2, INT os2,
Chris@19 452 INT n3, INT is3, INT os3);
Chris@19 453 tensor *X(mktensor_5d)(INT n0, INT is0, INT os0,
Chris@19 454 INT n1, INT is1, INT os1,
Chris@19 455 INT n2, INT is2, INT os2,
Chris@19 456 INT n3, INT is3, INT os3,
Chris@19 457 INT n4, INT is4, INT os4);
Chris@19 458 INT X(tensor_sz)(const tensor *sz);
Chris@19 459 void X(tensor_md5)(md5 *p, const tensor *t);
Chris@19 460 INT X(tensor_max_index)(const tensor *sz);
Chris@19 461 INT X(tensor_min_istride)(const tensor *sz);
Chris@19 462 INT X(tensor_min_ostride)(const tensor *sz);
Chris@19 463 INT X(tensor_min_stride)(const tensor *sz);
Chris@19 464 int X(tensor_inplace_strides)(const tensor *sz);
Chris@19 465 int X(tensor_inplace_strides2)(const tensor *a, const tensor *b);
Chris@19 466 int X(tensor_strides_decrease)(const tensor *sz, const tensor *vecsz,
Chris@19 467 inplace_kind k);
Chris@19 468 tensor *X(tensor_copy)(const tensor *sz);
Chris@19 469 int X(tensor_kosherp)(const tensor *x);
Chris@19 470
Chris@19 471 tensor *X(tensor_copy_inplace)(const tensor *sz, inplace_kind k);
Chris@19 472 tensor *X(tensor_copy_except)(const tensor *sz, int except_dim);
Chris@19 473 tensor *X(tensor_copy_sub)(const tensor *sz, int start_dim, int rnk);
Chris@19 474 tensor *X(tensor_compress)(const tensor *sz);
Chris@19 475 tensor *X(tensor_compress_contiguous)(const tensor *sz);
Chris@19 476 tensor *X(tensor_append)(const tensor *a, const tensor *b);
Chris@19 477 void X(tensor_split)(const tensor *sz, tensor **a, int a_rnk, tensor **b);
Chris@19 478 int X(tensor_tornk1)(const tensor *t, INT *n, INT *is, INT *os);
Chris@19 479 void X(tensor_destroy)(tensor *sz);
Chris@19 480 void X(tensor_destroy2)(tensor *a, tensor *b);
Chris@19 481 void X(tensor_destroy4)(tensor *a, tensor *b, tensor *c, tensor *d);
Chris@19 482 void X(tensor_print)(const tensor *sz, printer *p);
Chris@19 483 int X(dimcmp)(const iodim *a, const iodim *b);
Chris@19 484 int X(tensor_equal)(const tensor *a, const tensor *b);
Chris@19 485 int X(tensor_inplace_locations)(const tensor *sz, const tensor *vecsz);
Chris@19 486
Chris@19 487 /*-----------------------------------------------------------------------*/
Chris@19 488 /* problem.c: */
Chris@19 489 enum {
Chris@19 490 /* a problem that cannot be solved */
Chris@19 491 PROBLEM_UNSOLVABLE,
Chris@19 492
Chris@19 493 PROBLEM_DFT,
Chris@19 494 PROBLEM_RDFT,
Chris@19 495 PROBLEM_RDFT2,
Chris@19 496
Chris@19 497 /* for mpi/ subdirectory */
Chris@19 498 PROBLEM_MPI_DFT,
Chris@19 499 PROBLEM_MPI_RDFT,
Chris@19 500 PROBLEM_MPI_RDFT2,
Chris@19 501 PROBLEM_MPI_TRANSPOSE,
Chris@19 502
Chris@19 503 PROBLEM_LAST
Chris@19 504 };
Chris@19 505
Chris@19 506 typedef struct {
Chris@19 507 int problem_kind;
Chris@19 508 void (*hash) (const problem *ego, md5 *p);
Chris@19 509 void (*zero) (const problem *ego);
Chris@19 510 void (*print) (const problem *ego, printer *p);
Chris@19 511 void (*destroy) (problem *ego);
Chris@19 512 } problem_adt;
Chris@19 513
Chris@19 514 struct problem_s {
Chris@19 515 const problem_adt *adt;
Chris@19 516 };
Chris@19 517
Chris@19 518 problem *X(mkproblem)(size_t sz, const problem_adt *adt);
Chris@19 519 void X(problem_destroy)(problem *ego);
Chris@19 520 problem *X(mkproblem_unsolvable)(void);
Chris@19 521
Chris@19 522 /*-----------------------------------------------------------------------*/
Chris@19 523 /* print.c */
Chris@19 524 struct printer_s {
Chris@19 525 void (*print)(printer *p, const char *format, ...);
Chris@19 526 void (*vprint)(printer *p, const char *format, va_list ap);
Chris@19 527 void (*putchr)(printer *p, char c);
Chris@19 528 void (*cleanup)(printer *p);
Chris@19 529 int indent;
Chris@19 530 int indent_incr;
Chris@19 531 };
Chris@19 532
Chris@19 533 printer *X(mkprinter)(size_t size,
Chris@19 534 void (*putchr)(printer *p, char c),
Chris@19 535 void (*cleanup)(printer *p));
Chris@19 536 IFFTW_EXTERN void X(printer_destroy)(printer *p);
Chris@19 537
Chris@19 538 /*-----------------------------------------------------------------------*/
Chris@19 539 /* scan.c */
Chris@19 540 struct scanner_s {
Chris@19 541 int (*scan)(scanner *sc, const char *format, ...);
Chris@19 542 int (*vscan)(scanner *sc, const char *format, va_list ap);
Chris@19 543 int (*getchr)(scanner *sc);
Chris@19 544 int ungotc;
Chris@19 545 };
Chris@19 546
Chris@19 547 scanner *X(mkscanner)(size_t size, int (*getchr)(scanner *sc));
Chris@19 548 void X(scanner_destroy)(scanner *sc);
Chris@19 549
Chris@19 550 /*-----------------------------------------------------------------------*/
Chris@19 551 /* plan.c: */
Chris@19 552
Chris@19 553 enum wakefulness {
Chris@19 554 SLEEPY,
Chris@19 555 AWAKE_ZERO,
Chris@19 556 AWAKE_SQRTN_TABLE,
Chris@19 557 AWAKE_SINCOS
Chris@19 558 };
Chris@19 559
Chris@19 560 typedef struct {
Chris@19 561 void (*solve)(const plan *ego, const problem *p);
Chris@19 562 void (*awake)(plan *ego, enum wakefulness wakefulness);
Chris@19 563 void (*print)(const plan *ego, printer *p);
Chris@19 564 void (*destroy)(plan *ego);
Chris@19 565 } plan_adt;
Chris@19 566
Chris@19 567 struct plan_s {
Chris@19 568 const plan_adt *adt;
Chris@19 569 opcnt ops;
Chris@19 570 double pcost;
Chris@19 571 enum wakefulness wakefulness; /* used for debugging only */
Chris@19 572 int could_prune_now_p;
Chris@19 573 };
Chris@19 574
Chris@19 575 plan *X(mkplan)(size_t size, const plan_adt *adt);
Chris@19 576 void X(plan_destroy_internal)(plan *ego);
Chris@19 577 IFFTW_EXTERN void X(plan_awake)(plan *ego, enum wakefulness wakefulness);
Chris@19 578 void X(plan_null_destroy)(plan *ego);
Chris@19 579
Chris@19 580 /*-----------------------------------------------------------------------*/
Chris@19 581 /* solver.c: */
Chris@19 582 typedef struct {
Chris@19 583 int problem_kind;
Chris@19 584 plan *(*mkplan)(const solver *ego, const problem *p, planner *plnr);
Chris@19 585 void (*destroy)(solver *ego);
Chris@19 586 } solver_adt;
Chris@19 587
Chris@19 588 struct solver_s {
Chris@19 589 const solver_adt *adt;
Chris@19 590 int refcnt;
Chris@19 591 };
Chris@19 592
Chris@19 593 solver *X(mksolver)(size_t size, const solver_adt *adt);
Chris@19 594 void X(solver_use)(solver *ego);
Chris@19 595 void X(solver_destroy)(solver *ego);
Chris@19 596 void X(solver_register)(planner *plnr, solver *s);
Chris@19 597
Chris@19 598 /* shorthand */
Chris@19 599 #define MKSOLVER(type, adt) (type *)X(mksolver)(sizeof(type), adt)
Chris@19 600
Chris@19 601 /*-----------------------------------------------------------------------*/
Chris@19 602 /* planner.c */
Chris@19 603
Chris@19 604 typedef struct slvdesc_s {
Chris@19 605 solver *slv;
Chris@19 606 const char *reg_nam;
Chris@19 607 unsigned nam_hash;
Chris@19 608 int reg_id;
Chris@19 609 int next_for_same_problem_kind;
Chris@19 610 } slvdesc;
Chris@19 611
Chris@19 612 typedef struct solution_s solution; /* opaque */
Chris@19 613
Chris@19 614 /* interpretation of L and U:
Chris@19 615
Chris@19 616 - if it returns a plan, the planner guarantees that all applicable
Chris@19 617 plans at least as impatient as U have been tried, and that each
Chris@19 618 plan in the solution is at least as impatient as L.
Chris@19 619
Chris@19 620 - if it returns 0, the planner guarantees to have tried all solvers
Chris@19 621 at least as impatient as L, and that none of them was applicable.
Chris@19 622
Chris@19 623 The structure is packed to fit into 64 bits.
Chris@19 624 */
Chris@19 625
Chris@19 626 typedef struct {
Chris@19 627 unsigned l:20;
Chris@19 628 unsigned hash_info:3;
Chris@19 629 # define BITS_FOR_TIMELIMIT 9
Chris@19 630 unsigned timelimit_impatience:BITS_FOR_TIMELIMIT;
Chris@19 631 unsigned u:20;
Chris@19 632
Chris@19 633 /* abstraction break: we store the solver here to pad the
Chris@19 634 structure to 64 bits. Otherwise, the struct is padded to 64
Chris@19 635 bits anyway, and another word is allocated for slvndx. */
Chris@19 636 # define BITS_FOR_SLVNDX 12
Chris@19 637 unsigned slvndx:BITS_FOR_SLVNDX;
Chris@19 638 } flags_t;
Chris@19 639
Chris@19 640 /* impatience flags */
Chris@19 641 enum {
Chris@19 642 BELIEVE_PCOST = 0x0001,
Chris@19 643 ESTIMATE = 0x0002,
Chris@19 644 NO_DFT_R2HC = 0x0004,
Chris@19 645 NO_SLOW = 0x0008,
Chris@19 646 NO_VRECURSE = 0x0010,
Chris@19 647 NO_INDIRECT_OP = 0x0020,
Chris@19 648 NO_LARGE_GENERIC = 0x0040,
Chris@19 649 NO_RANK_SPLITS = 0x0080,
Chris@19 650 NO_VRANK_SPLITS = 0x0100,
Chris@19 651 NO_NONTHREADED = 0x0200,
Chris@19 652 NO_BUFFERING = 0x0400,
Chris@19 653 NO_FIXED_RADIX_LARGE_N = 0x0800,
Chris@19 654 NO_DESTROY_INPUT = 0x1000,
Chris@19 655 NO_SIMD = 0x2000,
Chris@19 656 CONSERVE_MEMORY = 0x4000,
Chris@19 657 NO_DHT_R2HC = 0x8000,
Chris@19 658 NO_UGLY = 0x10000,
Chris@19 659 ALLOW_PRUNING = 0x20000
Chris@19 660 };
Chris@19 661
Chris@19 662 /* hashtable information */
Chris@19 663 enum {
Chris@19 664 BLESSING = 0x1, /* save this entry */
Chris@19 665 H_VALID = 0x2, /* valid hashtable entry */
Chris@19 666 H_LIVE = 0x4 /* entry is nonempty, implies H_VALID */
Chris@19 667 };
Chris@19 668
Chris@19 669 #define PLNR_L(plnr) ((plnr)->flags.l)
Chris@19 670 #define PLNR_U(plnr) ((plnr)->flags.u)
Chris@19 671 #define PLNR_TIMELIMIT_IMPATIENCE(plnr) ((plnr)->flags.timelimit_impatience)
Chris@19 672
Chris@19 673 #define ESTIMATEP(plnr) (PLNR_U(plnr) & ESTIMATE)
Chris@19 674 #define BELIEVE_PCOSTP(plnr) (PLNR_U(plnr) & BELIEVE_PCOST)
Chris@19 675 #define ALLOW_PRUNINGP(plnr) (PLNR_U(plnr) & ALLOW_PRUNING)
Chris@19 676
Chris@19 677 #define NO_INDIRECT_OP_P(plnr) (PLNR_L(plnr) & NO_INDIRECT_OP)
Chris@19 678 #define NO_LARGE_GENERICP(plnr) (PLNR_L(plnr) & NO_LARGE_GENERIC)
Chris@19 679 #define NO_RANK_SPLITSP(plnr) (PLNR_L(plnr) & NO_RANK_SPLITS)
Chris@19 680 #define NO_VRANK_SPLITSP(plnr) (PLNR_L(plnr) & NO_VRANK_SPLITS)
Chris@19 681 #define NO_VRECURSEP(plnr) (PLNR_L(plnr) & NO_VRECURSE)
Chris@19 682 #define NO_DFT_R2HCP(plnr) (PLNR_L(plnr) & NO_DFT_R2HC)
Chris@19 683 #define NO_SLOWP(plnr) (PLNR_L(plnr) & NO_SLOW)
Chris@19 684 #define NO_UGLYP(plnr) (PLNR_L(plnr) & NO_UGLY)
Chris@19 685 #define NO_FIXED_RADIX_LARGE_NP(plnr) \
Chris@19 686 (PLNR_L(plnr) & NO_FIXED_RADIX_LARGE_N)
Chris@19 687 #define NO_NONTHREADEDP(plnr) \
Chris@19 688 ((PLNR_L(plnr) & NO_NONTHREADED) && (plnr)->nthr > 1)
Chris@19 689
Chris@19 690 #define NO_DESTROY_INPUTP(plnr) (PLNR_L(plnr) & NO_DESTROY_INPUT)
Chris@19 691 #define NO_SIMDP(plnr) (PLNR_L(plnr) & NO_SIMD)
Chris@19 692 #define CONSERVE_MEMORYP(plnr) (PLNR_L(plnr) & CONSERVE_MEMORY)
Chris@19 693 #define NO_DHT_R2HCP(plnr) (PLNR_L(plnr) & NO_DHT_R2HC)
Chris@19 694 #define NO_BUFFERINGP(plnr) (PLNR_L(plnr) & NO_BUFFERING)
Chris@19 695
Chris@19 696 typedef enum { FORGET_ACCURSED, FORGET_EVERYTHING } amnesia;
Chris@19 697
Chris@19 698 typedef enum {
Chris@19 699 /* WISDOM_NORMAL: planner may or may not use wisdom */
Chris@19 700 WISDOM_NORMAL,
Chris@19 701
Chris@19 702 /* WISDOM_ONLY: planner must use wisdom and must avoid searching */
Chris@19 703 WISDOM_ONLY,
Chris@19 704
Chris@19 705 /* WISDOM_IS_BOGUS: planner must return 0 as quickly as possible */
Chris@19 706 WISDOM_IS_BOGUS,
Chris@19 707
Chris@19 708 /* WISDOM_IGNORE_INFEASIBLE: planner ignores infeasible wisdom */
Chris@19 709 WISDOM_IGNORE_INFEASIBLE,
Chris@19 710
Chris@19 711 /* WISDOM_IGNORE_ALL: planner ignores all */
Chris@19 712 WISDOM_IGNORE_ALL
Chris@19 713 } wisdom_state_t;
Chris@19 714
Chris@19 715 typedef struct {
Chris@19 716 void (*register_solver)(planner *ego, solver *s);
Chris@19 717 plan *(*mkplan)(planner *ego, const problem *p);
Chris@19 718 void (*forget)(planner *ego, amnesia a);
Chris@19 719 void (*exprt)(planner *ego, printer *p); /* ``export'' is a reserved
Chris@19 720 word in C++. */
Chris@19 721 int (*imprt)(planner *ego, scanner *sc);
Chris@19 722 } planner_adt;
Chris@19 723
Chris@19 724 /* hash table of solutions */
Chris@19 725 typedef struct {
Chris@19 726 solution *solutions;
Chris@19 727 unsigned hashsiz, nelem;
Chris@19 728
Chris@19 729 /* statistics */
Chris@19 730 int lookup, succ_lookup, lookup_iter;
Chris@19 731 int insert, insert_iter, insert_unknown;
Chris@19 732 int nrehash;
Chris@19 733 } hashtab;
Chris@19 734
Chris@19 735 typedef enum { COST_SUM, COST_MAX } cost_kind;
Chris@19 736
Chris@19 737 struct planner_s {
Chris@19 738 const planner_adt *adt;
Chris@19 739 void (*hook)(struct planner_s *plnr, plan *pln,
Chris@19 740 const problem *p, int optimalp);
Chris@19 741 double (*cost_hook)(const problem *p, double t, cost_kind k);
Chris@19 742 int (*wisdom_ok_hook)(const problem *p, flags_t flags);
Chris@19 743 void (*nowisdom_hook)(const problem *p);
Chris@19 744 wisdom_state_t (*bogosity_hook)(wisdom_state_t state, const problem *p);
Chris@19 745
Chris@19 746 /* solver descriptors */
Chris@19 747 slvdesc *slvdescs;
Chris@19 748 unsigned nslvdesc, slvdescsiz;
Chris@19 749 const char *cur_reg_nam;
Chris@19 750 int cur_reg_id;
Chris@19 751 int slvdescs_for_problem_kind[PROBLEM_LAST];
Chris@19 752
Chris@19 753 wisdom_state_t wisdom_state;
Chris@19 754
Chris@19 755 hashtab htab_blessed;
Chris@19 756 hashtab htab_unblessed;
Chris@19 757
Chris@19 758 int nthr;
Chris@19 759 flags_t flags;
Chris@19 760
Chris@19 761 crude_time start_time;
Chris@19 762 double timelimit; /* elapsed_since(start_time) at which to bail out */
Chris@19 763 int timed_out; /* whether most recent search timed out */
Chris@19 764 int need_timeout_check;
Chris@19 765
Chris@19 766 /* various statistics */
Chris@19 767 int nplan; /* number of plans evaluated */
Chris@19 768 double pcost, epcost; /* total pcost of measured/estimated plans */
Chris@19 769 int nprob; /* number of problems evaluated */
Chris@19 770 };
Chris@19 771
Chris@19 772 planner *X(mkplanner)(void);
Chris@19 773 void X(planner_destroy)(planner *ego);
Chris@19 774
Chris@19 775 /*
Chris@19 776 Iterate over all solvers. Read:
Chris@19 777
Chris@19 778 @article{ baker93iterators,
Chris@19 779 author = "Henry G. Baker, Jr.",
Chris@19 780 title = "Iterators: Signs of Weakness in Object-Oriented Languages",
Chris@19 781 journal = "{ACM} {OOPS} Messenger",
Chris@19 782 volume = "4",
Chris@19 783 number = "3",
Chris@19 784 pages = "18--25"
Chris@19 785 }
Chris@19 786 */
/* FORALL_SOLVERS(ego, s, p, what): execute `what' once for each of the
   ego->nslvdesc solver descriptors in ego->slvdescs, in index order.
   p and s are identifiers that the macro declares: p points to the
   current descriptor and s is p->slv.  ego is parenthesized so that any
   pointer expression (e.g. &some_planner) may be passed. */
#define FORALL_SOLVERS(ego, s, p, what)                 \
{                                                       \
     unsigned _cnt;                                     \
     for (_cnt = 0; _cnt < (ego)->nslvdesc; ++_cnt) {   \
          slvdesc *p = (ego)->slvdescs + _cnt;          \
          solver *s = p->slv;                           \
          what;                                         \
     }                                                  \
}

/* FORALL_SOLVERS_OF_KIND(kind, ego, s, p, what): like FORALL_SOLVERS,
   but visits only the chain of descriptors that starts at index
   ego->slvdescs_for_problem_kind[kind] and is linked through
   next_for_same_problem_kind.  A negative index terminates the chain. */
#define FORALL_SOLVERS_OF_KIND(kind, ego, s, p, what)           \
{                                                               \
     int _cnt = (ego)->slvdescs_for_problem_kind[kind];         \
     while (_cnt >= 0) {                                        \
          slvdesc *p = (ego)->slvdescs + _cnt;                  \
          solver *s = p->slv;                                   \
          what;                                                 \
          _cnt = p->next_for_same_problem_kind;                 \
     }                                                          \
}
Chris@19 807
Chris@19 808
Chris@19 809 /* make plan, destroy problem */
/* Both entry points take the planner and a problem and return a plan
   pointer.  mkplan_f_d additionally takes three unsigned words
   (l_set, u_set, u_reset).
   NOTE(review): these look like planner-flag bits to set/reset around the
   call -- confirm against the definition. */
Chris@19 810 plan *X(mkplan_d)(planner *ego, problem *p);
Chris@19 811 plan *X(mkplan_f_d)(planner *ego, problem *p,
Chris@19 812 unsigned l_set, unsigned u_set, unsigned u_reset);
Chris@19 813
Chris@19 814 /*-----------------------------------------------------------------------*/
Chris@19 815 /* stride.c: */
Chris@19 816
Chris@19 817 /* If PRECOMPUTE_ARRAY_INDICES is defined, precompute all strides. */
/* PRECOMPUTE_ARRAY_INDICES is enabled on x86 / x86-64 targets unless
   FFTW_LDOUBLE is defined. */
Chris@19 818 #if (defined(__i386__) || defined(__x86_64__) || _M_IX86 >= 500) && !defined(FFTW_LDOUBLE)
Chris@19 819 #define PRECOMPUTE_ARRAY_INDICES
Chris@19 820 #endif
Chris@19 821
Chris@19 822 extern const INT X(an_INT_guaranteed_to_be_zero);
Chris@19 823
Chris@19 824 #ifdef PRECOMPUTE_ARRAY_INDICES
/* Here a stride is a pointer to INT and WS(stride, i) reads element i.
   All macro arguments are parenthesized so that compound expressions
   expand correctly. */
Chris@19 825 typedef INT *stride;
Chris@19 826 #define WS(stride, i) ((stride)[i])
Chris@19 827 extern stride X(mkstride)(INT n, INT s);
Chris@19 828 void X(stride_destroy)(stride p);
Chris@19 829 /* hackery to prevent the compiler from copying the strides array
Chris@19 830 onto the stack */
/* The whole assignment is parenthesized so the macro behaves as a single
   expression wherever it is used. */
Chris@19 831 #define MAKE_VOLATILE_STRIDE(nptr, x) ((x) = (x) + X(an_INT_guaranteed_to_be_zero))
Chris@19 832 #else
Chris@19 833
/* Here a stride is a plain INT and WS(stride, i) is the product
   stride * i.  Both operands are parenthesized: without that,
   WS(s, a + b) would expand to s * a + b.  mkstride simply yields its
   second argument and stride_destroy discards its argument, for each of
   the three precision prefixes. */
Chris@19 834 typedef INT stride;
Chris@19 835 #define WS(stride, i) ((stride) * (i))
Chris@19 836 #define fftwf_mkstride(n, stride) (stride)
Chris@19 837 #define fftw_mkstride(n, stride) (stride)
Chris@19 838 #define fftwl_mkstride(n, stride) (stride)
Chris@19 839 #define fftwf_stride_destroy(p) ((void) (p))
Chris@19 840 #define fftw_stride_destroy(p) ((void) (p))
Chris@19 841 #define fftwl_stride_destroy(p) ((void) (p))
Chris@19 842
Chris@19 843 /* hackery to prevent the compiler from ``optimizing'' induction
Chris@19 844 variables in codelet loops. The problem is that for each K and for
Chris@19 845 each expression of the form P[I + STRIDE * K] in a loop, most
Chris@19 846 compilers will try to lift an induction variable PK := &P[I + STRIDE * K].
Chris@19 847 For large values of K this behavior overflows the
Chris@19 848 register set, which is likely worse than doing the index computation
Chris@19 849 in the first place.
Chris@19 850
Chris@19 851 If we guess that there are more than
Chris@19 852 ESTIMATED_AVAILABLE_INDEX_REGISTERS such pointers, we deliberately confuse
Chris@19 853 the compiler by setting STRIDE ^= ZERO, where ZERO is a value guaranteed to
Chris@19 854 be 0, but the compiler does not know this.
Chris@19 855
Chris@19 856 16 registers ought to be enough for anybody, or so the amd64 and ARM ISA's
Chris@19 857 seem to imply.
Chris@19 858 */
Chris@19 859 #define ESTIMATED_AVAILABLE_INDEX_REGISTERS 16
Chris@19 860 #define MAKE_VOLATILE_STRIDE(nptr, x) \
Chris@19 861 ((nptr) <= ESTIMATED_AVAILABLE_INDEX_REGISTERS ? \
Chris@19 862 0 : \
Chris@19 863 ((x) = (x) ^ X(an_INT_guaranteed_to_be_zero)))
Chris@19 864 #endif /* PRECOMPUTE_ARRAY_INDICES */
Chris@19 865
Chris@19 866 /*-----------------------------------------------------------------------*/
Chris@19 867 /* solvtab.c */
Chris@19 868
/* One solver-table entry: REG is a callback taking the planner, REG_NAM is
   a string naming it. */
Chris@19 869 struct solvtab_s { void (*reg)(planner *); const char *reg_nam; };
/* A solvtab is an array of such entries; the length is not part of the
   type. */
Chris@19 870 typedef struct solvtab_s solvtab[];
Chris@19 871 void X(solvtab_exec)(const solvtab tbl, planner *p);
/* SOLVTAB(s) builds an entry from callback S and its stringized name.
   SOLVTAB_END is the all-zero entry.
   NOTE(review): presumably the terminator that solvtab_exec stops at --
   confirm in solvtab.c. */
Chris@19 872 #define SOLVTAB(s) { s, STRINGIZE(s) }
Chris@19 873 #define SOLVTAB_END { 0, 0 }
Chris@19 874
Chris@19 875 /*-----------------------------------------------------------------------*/
Chris@19 876 /* pickdim.c */
/* NOTE(review): judging only by the parameter names, this chooses a
   dimension of tensor SZ based on WHICH_DIM and the NBUDDIES-element
   BUDDIES array, writes the choice through DP and returns an int status;
   OOP looks like an out-of-place flag.  Confirm against pickdim.c. */
Chris@19 877 int X(pickdim)(int which_dim, const int *buddies, int nbuddies,
Chris@19 878 const tensor *sz, int oop, int *dp);
Chris@19 879
Chris@19 880 /*-----------------------------------------------------------------------*/
Chris@19 881 /* twiddle.c */
Chris@19 882 /* little language to express twiddle factors computation */
/* Opcode constants of the twiddle language.
   NOTE(review): presumably the values stored in tw_instr.op, which is
   declared as unsigned char rather than as this enum -- confirm. */
Chris@19 883 enum { TW_COS = 0, TW_SIN = 1, TW_CEXP = 2, TW_NEXT = 3,
Chris@19 884 TW_FULL = 4, TW_HALF = 5 };
Chris@19 885
/* One instruction: an opcode byte plus two small signed integer fields. */
Chris@19 886 typedef struct {
Chris@19 887 unsigned char op;
Chris@19 888 signed char v;
Chris@19 889 short i;
Chris@19 890 } tw_instr;
Chris@19 891
/* A twiddle-factor table together with the parameters and instruction
   list it was requested with.  The record is reference counted (refcnt)
   and carries a pointer to another twid (cdr).
   NOTE(review): cdr presumably links records into a list -- confirm in
   twiddle.c. */
Chris@19 892 typedef struct twid_s {
Chris@19 893 R *W; /* array of twiddle factors */
Chris@19 894 INT n, r, m; /* transform order, radix, # twiddle rows */
Chris@19 895 int refcnt;
Chris@19 896 const tw_instr *instr;
Chris@19 897 struct twid_s *cdr;
Chris@19 898 enum wakefulness wakefulness;
Chris@19 899 } twid;
Chris@19 900
/* twiddle_length takes a radix and an instruction list and returns an INT.
   twiddle_awake takes a wakefulness state, the address of a twid pointer
   (so it can replace the pointer), and the instr/n/r/m parameters. */
Chris@19 901 INT X(twiddle_length)(INT r, const tw_instr *p);
Chris@19 902 void X(twiddle_awake)(enum wakefulness wakefulness,
Chris@19 903 twid **pp, const tw_instr *instr, INT n, INT r, INT m);
Chris@19 904
Chris@19 905 /*-----------------------------------------------------------------------*/
Chris@19 906 /* trig.c */
/* trigreal is the floating-point type of the triggen tables (W0, W1) and
   of cexpl results: long double or __float128 when the matching
   TRIGREAL_IS_* macro is defined, double otherwise. */
Chris@19 907 #if defined(TRIGREAL_IS_LONG_DOUBLE)
Chris@19 908 typedef long double trigreal;
Chris@19 909 #elif defined(TRIGREAL_IS_QUAD)
Chris@19 910 typedef __float128 trigreal;
Chris@19 911 #else
Chris@19 912 typedef double trigreal;
Chris@19 913 #endif
Chris@19 914
Chris@19 915 typedef struct triggen_s triggen;
Chris@19 916
/* A trigonometric generator object: three function pointers that each
   receive the generator itself and an INT index M, followed by its state.
   cexp writes through an R pointer, cexpl through a trigreal pointer, and
   rotate takes an input pair (xr, xi) and writes through RES.
   NOTE(review): the tw* fields and W0/W1 look like a two-level lookup
   table decomposition -- confirm in trig.c. */
Chris@19 917 struct triggen_s {
Chris@19 918 void (*cexp)(triggen *t, INT m, R *result);
Chris@19 919 void (*cexpl)(triggen *t, INT m, trigreal *result);
Chris@19 920 void (*rotate)(triggen *p, INT m, R xr, R xi, R *res);
Chris@19 921
Chris@19 922 INT twshft;
Chris@19 923 INT twradix;
Chris@19 924 INT twmsk;
Chris@19 925 trigreal *W0, *W1;
Chris@19 926 INT n;
Chris@19 927 };
Chris@19 928
/* Constructor (takes a wakefulness state and N) and destructor. */
Chris@19 929 triggen *X(mktriggen)(enum wakefulness wakefulness, INT n);
Chris@19 930 void X(triggen_destroy)(triggen *p);
Chris@19 931
Chris@19 932 /*-----------------------------------------------------------------------*/
Chris@19 933 /* primes.c: */
Chris@19 934
/* MULMOD(x, y, p): (x * y) mod p.  When x + y <= 92681 the product is
   computed directly; for non-negative operands x * y is then at most
   (92681 / 2)^2, about 2147441940, which is below 2^31 - 1, so it cannot
   overflow a signed 32-bit integer.  Otherwise the work is handed to
   safe_mulmod.  x and y are evaluated more than once, so arguments must
   be free of side effects. */
Chris@19 935 #define MULMOD(x, y, p) \
Chris@19 936 (((x) <= 92681 - (y)) ? ((x) * (y)) % (p) : X(safe_mulmod)(x, y, p))
Chris@19 937
/* Number-theoretic helpers declared for primes.c; only their signatures
   are visible here. */
Chris@19 938 INT X(safe_mulmod)(INT x, INT y, INT p);
Chris@19 939 INT X(power_mod)(INT n, INT m, INT p);
Chris@19 940 INT X(find_generator)(INT p);
Chris@19 941 INT X(first_divisor)(INT n);
Chris@19 942 int X(is_prime)(INT n);
Chris@19 943 INT X(next_prime)(INT n);
Chris@19 944 int X(factors_into)(INT n, const INT *primes);
Chris@19 945 int X(factors_into_small_primes)(INT n);
Chris@19 946 INT X(choose_radix)(INT r, INT n);
Chris@19 947 INT X(isqrt)(INT n);
Chris@19 948 INT X(modulo)(INT a, INT n);
Chris@19 949
Chris@19 950 #define GENERIC_MIN_BAD 173 /* min prime for which generic becomes bad */
Chris@19 951
Chris@19 952 /* thresholds below which certain solvers are considered SLOW. These are guesses
Chris@19 953 believed to be conservative */
Chris@19 954 #define GENERIC_MAX_SLOW 16
Chris@19 955 #define RADER_MAX_SLOW 32
Chris@19 956 #define BLUESTEIN_MAX_SLOW 24
Chris@19 957
Chris@19 958 /*-----------------------------------------------------------------------*/
Chris@19 959 /* rader.c: */
/* rader_tl is opaque in this header: struct rader_tls is only named, not
   defined.
   NOTE(review): from the signatures, insert/find/delete appear to maintain
   a list of R arrays W looked up by the triple (k1, k2, k3); insert and
   delete take the list head by address so they can update it.  Confirm
   in rader.c. */
Chris@19 960 typedef struct rader_tls rader_tl;
Chris@19 961
Chris@19 962 void X(rader_tl_insert)(INT k1, INT k2, INT k3, R *W, rader_tl **tl);
Chris@19 963 R *X(rader_tl_find)(INT k1, INT k2, INT k3, rader_tl *t);
Chris@19 964 void X(rader_tl_delete)(R *W, rader_tl **tl);
Chris@19 965
Chris@19 966 /*-----------------------------------------------------------------------*/
Chris@19 967 /* copy/transposition routines */
Chris@19 968
Chris@19 969 /* lower bound to the cache size, for tiled routines */
/* NOTE(review): the unit of CACHESIZE (bytes vs. elements) is not stated
   in this header -- confirm where it is used. */
Chris@19 970 #define CACHESIZE 8192
Chris@19 971
Chris@19 972 INT X(compute_tilesz)(INT vl, int how_many_tiles_in_cache);
Chris@19 973
/* tile2d receives bounds for two index ranges (n0l..n0u, n1l..n1u), a tile
   size, and a callback F that is itself given a pair of ranges plus the
   opaque ARGS pointer.
   NOTE(review): presumably F is invoked once per tile -- confirm. */
Chris@19 974 void X(tile2d)(INT n0l, INT n0u, INT n1l, INT n1u, INT tilesz,
Chris@19 975 void (*f)(INT n0l, INT n0u, INT n1l, INT n1u, void *args),
Chris@19 976 void *args);
/* Copy routines.  Parameter naming is uniform: I / O are the input and
   output arrays, nK is the extent of dimension K with input stride isK and
   output stride osK, and vl is a trailing INT.
   NOTE(review): vl is presumably a vector length; the _ci / _co / _tiled /
   _tiledbuf suffixes presumably pick a loop order or tiling strategy --
   confirm against the definitions. */
Chris@19 977 void X(cpy1d)(R *I, R *O, INT n0, INT is0, INT os0, INT vl);
Chris@19 978 void X(cpy2d)(R *I, R *O,
Chris@19 979 INT n0, INT is0, INT os0,
Chris@19 980 INT n1, INT is1, INT os1,
Chris@19 981 INT vl);
Chris@19 982 void X(cpy2d_ci)(R *I, R *O,
Chris@19 983 INT n0, INT is0, INT os0,
Chris@19 984 INT n1, INT is1, INT os1,
Chris@19 985 INT vl);
Chris@19 986 void X(cpy2d_co)(R *I, R *O,
Chris@19 987 INT n0, INT is0, INT os0,
Chris@19 988 INT n1, INT is1, INT os1,
Chris@19 989 INT vl);
Chris@19 990 void X(cpy2d_tiled)(R *I, R *O,
Chris@19 991 INT n0, INT is0, INT os0,
Chris@19 992 INT n1, INT is1, INT os1,
Chris@19 993 INT vl);
Chris@19 994 void X(cpy2d_tiledbuf)(R *I, R *O,
Chris@19 995 INT n0, INT is0, INT os0,
Chris@19 996 INT n1, INT is1, INT os1,
Chris@19 997 INT vl);
/* The _pair variants take two input and two output arrays and have no vl
   parameter. */
Chris@19 998 void X(cpy2d_pair)(R *I0, R *I1, R *O0, R *O1,
Chris@19 999 INT n0, INT is0, INT os0,
Chris@19 1000 INT n1, INT is1, INT os1);
Chris@19 1001 void X(cpy2d_pair_ci)(R *I0, R *I1, R *O0, R *O1,
Chris@19 1002 INT n0, INT is0, INT os0,
Chris@19 1003 INT n1, INT is1, INT os1);
Chris@19 1004 void X(cpy2d_pair_co)(R *I0, R *I1, R *O0, R *O1,
Chris@19 1005 INT n0, INT is0, INT os0,
Chris@19 1006 INT n1, INT is1, INT os1);
Chris@19 1007
/* Transposition routines take a single array I (no separate output), one
   extent n, two strides s0 / s1, and vl. */
Chris@19 1008 void X(transpose)(R *I, INT n, INT s0, INT s1, INT vl);
Chris@19 1009 void X(transpose_tiled)(R *I, INT n, INT s0, INT s1, INT vl);
Chris@19 1010 void X(transpose_tiledbuf)(R *I, INT n, INT s0, INT s1, INT vl);
Chris@19 1011
/* Function-pointer types whose signatures match the transpose* and
   cpy2d* (non-pair) routines declared above. */
Chris@19 1012 typedef void (*transpose_func)(R *I, INT n, INT s0, INT s1, INT vl);
Chris@19 1013 typedef void (*cpy2d_func)(R *I, R *O,
Chris@19 1014 INT n0, INT is0, INT os0,
Chris@19 1015 INT n1, INT is1, INT os1,
Chris@19 1016 INT vl);
Chris@19 1017
Chris@19 1018 /*-----------------------------------------------------------------------*/
Chris@19 1019 /* misc stuff */
/* Miscellaneous helpers; only their prototypes are visible here, so the
   notes below stay at signature level. */
Chris@19 1020 void X(null_awake)(plan *ego, enum wakefulness wakefulness);
/* Takes a planner, plan and problem (all const) and returns a double. */
Chris@19 1021 double X(iestimate_cost)(const planner *, const plan *, const problem *);
Chris@19 1022
/* The seed variable is declared only when FFTW_RANDOM_ESTIMATOR is
   defined. */
Chris@19 1023 #ifdef FFTW_RANDOM_ESTIMATOR
Chris@19 1024 extern unsigned X(random_estimate_seed);
Chris@19 1025 #endif
Chris@19 1026
/* Unlike iestimate_cost, the plan argument here is not const. */
Chris@19 1027 double X(measure_execution_time)(const planner *plnr,
Chris@19 1028 plan *pln, const problem *p);
Chris@19 1029 IFFTW_EXTERN int X(alignment_of)(R *p);
/* Maps a C string to an unsigned value. */
Chris@19 1030 unsigned X(hash)(const char *s);
/* NOTE(review): nbuf / nbuf_redundant / bufdist look like buffer-count and
   buffer-spacing policies for size n and vector length vl; toobig and
   ct_uglyp return int, presumably as boolean predicates -- confirm
   against the definitions. */
Chris@19 1031 INT X(nbuf)(INT n, INT vl, INT maxnbuf);
Chris@19 1032 int X(nbuf_redundant)(INT n, INT vl, int which,
Chris@19 1033 const INT *maxnbuf, int nmaxnbuf);
Chris@19 1034 INT X(bufdist)(INT n, INT vl);
Chris@19 1035 int X(toobig)(INT n);
Chris@19 1036 int X(ct_uglyp)(INT min_n, INT v, INT n, INT r);
Chris@19 1037
/* Pointer "taint" helpers.
   With HAVE_SIMD: TAINT and JOIN_TAINT forward to the taint / join_taint
   functions, UNTAINT clears the two least-significant bits of the pointer
   value and TAINTOF extracts those two bits.
   Without HAVE_SIMD: TAINT, UNTAINT and JOIN_TAINT yield their (first)
   pointer argument unchanged and TAINTOF is 0.  Every replacement list is
   fully parenthesized so the macros can be used inside larger
   expressions. */
Chris@19 1038 #if HAVE_SIMD
Chris@19 1039 R *X(taint)(R *p, INT s);
Chris@19 1040 R *X(join_taint)(R *p1, R *p2);
Chris@19 1041 #define TAINT(p, s) X(taint)(p, s)
Chris@19 1042 #define UNTAINT(p) ((R *) (((uintptr_t) (p)) & ~(uintptr_t)3))
Chris@19 1043 #define TAINTOF(p) (((uintptr_t)(p)) & 3)
Chris@19 1044 #define JOIN_TAINT(p1, p2) X(join_taint)(p1, p2)
Chris@19 1045 #else
Chris@19 1046 #define TAINT(p, s) (p)
Chris@19 1047 #define UNTAINT(p) (p)
Chris@19 1048 #define TAINTOF(p) 0
Chris@19 1049 #define JOIN_TAINT(p1, p2) (p1)
Chris@19 1050 #endif
Chris@19 1051
/* ASSERT_ALIGNED_DOUBLE: with FFTW_DEBUG_ALIGNMENT defined, expands to a
   braced block that declares a local double and uses CK to check that the
   low three bits of its address are zero (8-byte alignment).  Otherwise
   it expands to nothing. */
Chris@19 1052 #ifdef FFTW_DEBUG_ALIGNMENT
Chris@19 1053 # define ASSERT_ALIGNED_DOUBLE { \
Chris@19 1054 double __foo; \
Chris@19 1055 CK(!(((uintptr_t) &__foo) & 0x7)); \
Chris@19 1056 }
Chris@19 1057 #else
Chris@19 1058 # define ASSERT_ALIGNED_DOUBLE
Chris@19 1059 #endif /* FFTW_DEBUG_ALIGNMENT */
Chris@19 1060
Chris@19 1061
Chris@19 1062
Chris@19 1063 /*-----------------------------------------------------------------------*/
Chris@19 1064 /* macros used in codelets to reduce source code size */
Chris@19 1065
Chris@19 1066 typedef R E; /* internal precision of codelets. */
Chris@19 1067
/* K(x): the numeric literal x converted to E.  For FFTW_LDOUBLE an L
   suffix, and for FFTW_QUAD a Q suffix, is pasted onto the literal before
   the cast, so x must be a bare floating-point literal token. */
Chris@19 1068 #if defined(FFTW_LDOUBLE)
Chris@19 1069 # define K(x) ((E) x##L)
Chris@19 1070 #elif defined(FFTW_QUAD)
Chris@19 1071 # define K(x) ((E) x##Q)
Chris@19 1072 #else
Chris@19 1073 # define K(x) ((E) x)
Chris@19 1074 #endif
/* DK(name, value): declare a const E called NAME initialized to K(value).
   The expansion carries no trailing semicolon. */
Chris@19 1075 #define DK(name, value) const E name = K(value)
Chris@19 1076
Chris@19 1077 /* FMA macros */
Chris@19 1078
Chris@19 1079 #if defined(__GNUC__) && (defined(__powerpc__) || defined(__ppc__) || defined(_POWER))
Chris@19 1080 /* The obvious expression a * b + c does not work. If both x = a * b
Chris@19 1081 + c and y = a * b - c appear in the source, gcc computes t = a * b,
Chris@19 1082 x = t + c, y = t - c, thus destroying the fma.
Chris@19 1083
Chris@19 1084 This peculiar coding seems to do the right thing on all of
Chris@19 1085 gcc-2.95, gcc-3.1, gcc-3.2, and gcc-3.3. It does the right thing
Chris@19 1086 on gcc-3.4 -fno-web (because the ``web'' pass splits the variable
Chris@19 1087 `x' for the single-assignment form).
Chris@19 1088
Chris@19 1089 However, gcc-4.0 is a formidable adversary which succeeds in
Chris@19 1090 pessimizing two fma's into one multiplication and two additions.
Chris@19 1091 It does it very early in the game---before the optimization passes
Chris@19 1092 even start. The only real workaround seems to be to use fake inline asm
Chris@19 1093 such as
Chris@19 1094
Chris@19 1095 asm ("# confuse gcc %0" : "=f"(a) : "0"(a));
Chris@19 1096 return a * b + c;
Chris@19 1097
Chris@19 1098 in each of the FMA, FMS, FNMA, and FNMS functions. However, this
Chris@19 1099 does not solve the problem either, because two equal asm statements
Chris@19 1100 count as a common subexpression! One must use *different* fake asm
Chris@19 1101 statements:
Chris@19 1102
Chris@19 1103 in FMA:
Chris@19 1104 asm ("# confuse gcc for fma %0" : "=f"(a) : "0"(a));
Chris@19 1105
Chris@19 1106 in FMS:
Chris@19 1107 asm ("# confuse gcc for fms %0" : "=f"(a) : "0"(a));
Chris@19 1108
Chris@19 1109 etc.
Chris@19 1110
Chris@19 1111 After these changes, gcc recalcitrantly generates the fma that was
Chris@19 1112 in the source to begin with. However, the extra asm() cruft
Chris@19 1113 confuses other passes of gcc, notably the instruction scheduler.
Chris@19 1114 (Of course, one could also generate the fma directly via inline
Chris@19 1115 asm, but this confuses the scheduler even more.)
Chris@19 1116
Chris@19 1117 Steven and I have submitted more than one bug report to the gcc
Chris@19 1118 mailing list over the past few years, to no effect. Thus, I give
Chris@19 1119 up. gcc-4.0 can go to hell. I'll wait at least until gcc-4.3 is
Chris@19 1120 out before touching this crap again.
Chris@19 1121 */
/* Returns a * b + c.  The product and the addition are written as two
   separate statements on x on purpose: per the comment above, this
   particular shape is what keeps gcc from breaking up the fused
   multiply-add.  Do not simplify. */
Chris@19 1122 static __inline__ E FMA(E a, E b, E c)
Chris@19 1123 {
Chris@19 1124 E x = a * b;
Chris@19 1125 x = x + c;
Chris@19 1126 return x;
Chris@19 1127 }
Chris@19 1128
/* Returns a * b - c, using the same deliberate two-statement shape as
   FMA. */
Chris@19 1129 static __inline__ E FMS(E a, E b, E c)
Chris@19 1130 {
Chris@19 1131 E x = a * b;
Chris@19 1132 x = x - c;
Chris@19 1133 return x;
Chris@19 1134 }
Chris@19 1135
/* Returns -(a * b + c), using the same deliberate two-statement shape as
   FMA. */
Chris@19 1136 static __inline__ E FNMA(E a, E b, E c)
Chris@19 1137 {
Chris@19 1138 E x = a * b;
Chris@19 1139 x = - (x + c);
Chris@19 1140 return x;
Chris@19 1141 }
Chris@19 1142
/* Returns -(a * b - c), using the same deliberate two-statement shape as
   FMA. */
Chris@19 1143 static __inline__ E FNMS(E a, E b, E c)
Chris@19 1144 {
Chris@19 1145 E x = a * b;
Chris@19 1146 x = - (x - c);
Chris@19 1147 return x;
Chris@19 1148 }
Chris@19 1149 #else
/* Plain-expression fallbacks for every other compiler/target:
   FMA = a*b + c, FMS = a*b - c, FNMA = -(a*b + c), FNMS = c - a*b.
   FNMS is spelled c - a*b here, which is algebraically the same value as
   the -(a*b - c) form used by the inline function above. */
Chris@19 1150 #define FMA(a, b, c) (((a) * (b)) + (c))
Chris@19 1151 #define FMS(a, b, c) (((a) * (b)) - (c))
Chris@19 1152 #define FNMA(a, b, c) (- (((a) * (b)) + (c)))
Chris@19 1153 #define FNMS(a, b, c) ((c) - ((a) * (b)))
Chris@19 1154 #endif
Chris@19 1155
Chris@19 1156 #ifdef __cplusplus
Chris@19 1157 } /* extern "C" */
Chris@19 1158 #endif /* __cplusplus */
Chris@19 1159
Chris@19 1160 #endif /* __IFFTW_H__ */