annotate src/fftw-3.3.8/kernel/ifftw.h @ 169:223a55898ab9 tip default

Add null config files
author Chris Cannam <cannam@all-day-breakfast.com>
date Mon, 02 Mar 2020 14:03:47 +0000
parents bd3cc4d1df30
children
rev   line source
cannam@167 1 /*
cannam@167 2 * Copyright (c) 2003, 2007-14 Matteo Frigo
cannam@167 3 * Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
cannam@167 4 *
cannam@167 5 * This program is free software; you can redistribute it and/or modify
cannam@167 6 * it under the terms of the GNU General Public License as published by
cannam@167 7 * the Free Software Foundation; either version 2 of the License, or
cannam@167 8 * (at your option) any later version.
cannam@167 9 *
cannam@167 10 * This program is distributed in the hope that it will be useful,
cannam@167 11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
cannam@167 12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
cannam@167 13 * GNU General Public License for more details.
cannam@167 14 *
cannam@167 15 * You should have received a copy of the GNU General Public License
cannam@167 16 * along with this program; if not, write to the Free Software
cannam@167 17 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
cannam@167 18 *
cannam@167 19 */
cannam@167 20
cannam@167 21
cannam@167 22 /* FFTW internal header file */
cannam@167 23 #ifndef __IFFTW_H__
cannam@167 24 #define __IFFTW_H__
cannam@167 25
cannam@167 26 #include "config.h"
cannam@167 27
cannam@167 28 #include <stdlib.h> /* size_t */
cannam@167 29 #include <stdarg.h> /* va_list */
cannam@167 30 #include <stddef.h> /* ptrdiff_t */
cannam@167 31 #include <limits.h> /* INT_MAX */
cannam@167 32
cannam@167 33 #if HAVE_SYS_TYPES_H
cannam@167 34 # include <sys/types.h>
cannam@167 35 #endif
cannam@167 36
cannam@167 37 #if HAVE_STDINT_H
cannam@167 38 # include <stdint.h> /* uintptr_t, maybe */
cannam@167 39 #endif
cannam@167 40
cannam@167 41 #if HAVE_INTTYPES_H
cannam@167 42 # include <inttypes.h> /* uintptr_t, maybe */
cannam@167 43 #endif
cannam@167 44
cannam@167 45 #ifdef __cplusplus
cannam@167 46 extern "C"
cannam@167 47 {
cannam@167 48 #endif /* __cplusplus */
cannam@167 49
cannam@167 50 /* Windows annoyances -- since tests/hook.c uses some internal
cannam@167 51 FFTW functions, we need to give them the dllexport attribute
cannam@167 52 under Windows when compiling as a DLL (see api/fftw3.h). */
cannam@167 53 #if defined(FFTW_EXTERN)
cannam@167 54 # define IFFTW_EXTERN FFTW_EXTERN
cannam@167 55 #elif (defined(FFTW_DLL) || defined(DLL_EXPORT)) \
cannam@167 56 && (defined(_WIN32) || defined(__WIN32__))
cannam@167 57 # define IFFTW_EXTERN extern __declspec(dllexport)
cannam@167 58 #else
cannam@167 59 # define IFFTW_EXTERN extern
cannam@167 60 #endif
cannam@167 61
cannam@167 62 /* determine precision and name-mangling scheme */
cannam@167 63 #define CONCAT(prefix, name) prefix ## name
cannam@167 64 #if defined(FFTW_SINGLE)
cannam@167 65 typedef float R;
cannam@167 66 # define X(name) CONCAT(fftwf_, name)
cannam@167 67 #elif defined(FFTW_LDOUBLE)
cannam@167 68 typedef long double R;
cannam@167 69 # define X(name) CONCAT(fftwl_, name)
cannam@167 70 # define TRIGREAL_IS_LONG_DOUBLE
cannam@167 71 #elif defined(FFTW_QUAD)
cannam@167 72 typedef __float128 R;
cannam@167 73 # define X(name) CONCAT(fftwq_, name)
cannam@167 74 # define TRIGREAL_IS_QUAD
cannam@167 75 #else
cannam@167 76 typedef double R;
cannam@167 77 # define X(name) CONCAT(fftw_, name)
cannam@167 78 #endif
cannam@167 79
cannam@167 80 /*
cannam@167 81 integral type large enough to contain a stride (what ``int'' should
cannam@167 82 have been in the first place).
cannam@167 83 */
cannam@167 84 typedef ptrdiff_t INT;
cannam@167 85
/* Dummy use of an otherwise unused parameter, to silence compiler
   warnings.  Both the argument and the whole expansion are
   parenthesized so that an expression argument such as UNUSED(a + b)
   is discarded as a unit instead of parsing as ((void)a) + b. */
#define UNUSED(x) ((void)(x))
cannam@167 88
cannam@167 89 #define NELEM(array) ((sizeof(array) / sizeof((array)[0])))
cannam@167 90
cannam@167 91 #define FFT_SIGN (-1) /* sign convention for forward transforms */
cannam@167 92 extern void X(extract_reim)(int sign, R *c, R **r, R **i);
cannam@167 93
cannam@167 94 #define REGISTER_SOLVER(p, s) X(solver_register)(p, s)
cannam@167 95
cannam@167 96 #define STRINGIZEx(x) #x
cannam@167 97 #define STRINGIZE(x) STRINGIZEx(x)
cannam@167 98 #define CIMPLIES(ante, post) (!(ante) || (post))
cannam@167 99
cannam@167 100 /* define HAVE_SIMD if any simd extensions are supported */
cannam@167 101 #if defined(HAVE_SSE) || defined(HAVE_SSE2) || \
cannam@167 102 defined(HAVE_AVX) || defined(HAVE_AVX_128_FMA) || \
cannam@167 103 defined(HAVE_AVX2) || defined(HAVE_AVX512) || \
cannam@167 104 defined(HAVE_KCVI) || \
cannam@167 105 defined(HAVE_ALTIVEC) || defined(HAVE_VSX) || \
cannam@167 106 defined(HAVE_MIPS_PS) || \
cannam@167 107 defined(HAVE_GENERIC_SIMD128) || defined(HAVE_GENERIC_SIMD256)
cannam@167 108 #define HAVE_SIMD 1
cannam@167 109 #else
cannam@167 110 #define HAVE_SIMD 0
cannam@167 111 #endif
cannam@167 112
cannam@167 113 extern int X(have_simd_sse2)(void);
cannam@167 114 extern int X(have_simd_avx)(void);
cannam@167 115 extern int X(have_simd_avx_128_fma)(void);
cannam@167 116 extern int X(have_simd_avx2)(void);
cannam@167 117 extern int X(have_simd_avx2_128)(void);
cannam@167 118 extern int X(have_simd_avx512)(void);
cannam@167 119 extern int X(have_simd_altivec)(void);
cannam@167 120 extern int X(have_simd_vsx)(void);
cannam@167 121 extern int X(have_simd_neon)(void);
cannam@167 122
cannam@167 123 /* forward declarations */
cannam@167 124 typedef struct problem_s problem;
cannam@167 125 typedef struct plan_s plan;
cannam@167 126 typedef struct solver_s solver;
cannam@167 127 typedef struct planner_s planner;
cannam@167 128 typedef struct printer_s printer;
cannam@167 129 typedef struct scanner_s scanner;
cannam@167 130
cannam@167 131 /*-----------------------------------------------------------------------*/
cannam@167 132 /* alloca: */
cannam@167 133 #if HAVE_SIMD
cannam@167 134 # if defined(HAVE_KCVI) || defined(HAVE_AVX512)
cannam@167 135 # define MIN_ALIGNMENT 64
cannam@167 136 # elif defined(HAVE_AVX) || defined(HAVE_AVX2) || defined(HAVE_GENERIC_SIMD256)
cannam@167 137 # define MIN_ALIGNMENT 32 /* best alignment for AVX, conservative for
cannam@167 138 * everything else */
cannam@167 139 # else
cannam@167 140 /* Note that we cannot use 32-byte alignment for all SIMD. For
cannam@167 141 example, MacOS X malloc is 16-byte aligned, but there was no
cannam@167 142 posix_memalign in MacOS X until version 10.6. */
cannam@167 143 # define MIN_ALIGNMENT 16
cannam@167 144 # endif
cannam@167 145 #endif
cannam@167 146
cannam@167 147 #if defined(HAVE_ALLOCA) && defined(FFTW_ENABLE_ALLOCA)
cannam@167 148 /* use alloca if available */
cannam@167 149
cannam@167 150 #ifndef alloca
cannam@167 151 #ifdef __GNUC__
cannam@167 152 # define alloca __builtin_alloca
cannam@167 153 #else
cannam@167 154 # ifdef _MSC_VER
cannam@167 155 # include <malloc.h>
cannam@167 156 # define alloca _alloca
cannam@167 157 # else
cannam@167 158 # if HAVE_ALLOCA_H
cannam@167 159 # include <alloca.h>
cannam@167 160 # else
cannam@167 161 # ifdef _AIX
cannam@167 162 #pragma alloca
cannam@167 163 # else
cannam@167 164 # ifndef alloca /* predefined by HP cc +Olibcalls */
cannam@167 165 void *alloca(size_t);
cannam@167 166 # endif
cannam@167 167 # endif
cannam@167 168 # endif
cannam@167 169 # endif
cannam@167 170 #endif
cannam@167 171 #endif
cannam@167 172
cannam@167 173 # ifdef MIN_ALIGNMENT
cannam@167 174 # define STACK_MALLOC(T, p, n) \
cannam@167 175 { \
cannam@167 176 p = (T)alloca((n) + MIN_ALIGNMENT); \
cannam@167 177 p = (T)(((uintptr_t)p + (MIN_ALIGNMENT - 1)) & \
cannam@167 178 (~(uintptr_t)(MIN_ALIGNMENT - 1))); \
cannam@167 179 }
cannam@167 180 # define STACK_FREE(n)
cannam@167 181 # else /* HAVE_ALLOCA && !defined(MIN_ALIGNMENT) */
cannam@167 182 # define STACK_MALLOC(T, p, n) p = (T)alloca(n)
cannam@167 183 # define STACK_FREE(n)
cannam@167 184 # endif
cannam@167 185
cannam@167 186 #else /* ! HAVE_ALLOCA */
cannam@167 187 /* use malloc instead of alloca */
cannam@167 188 # define STACK_MALLOC(T, p, n) p = (T)MALLOC(n, OTHER)
cannam@167 189 # define STACK_FREE(n) X(ifree)(n)
cannam@167 190 #endif /* ! HAVE_ALLOCA */
cannam@167 191
/* Allocation of temporary buffers.  Requests smaller than
   MAX_STACK_ALLOC go through STACK_MALLOC (hopefully reducing to
   alloca()); anything else goes through MALLOC(). */

/* 64KiB ought to be enough for anybody */
#define MAX_STACK_ALLOC ((size_t)64 * 1024)

/* BUF_ALLOC(T, p, n): assign to P a buffer of N bytes, cast to type T.
   N is parenthesized in the size test so that an expression argument
   (for example `c ? a : b') is compared as a whole against
   MAX_STACK_ALLOC.  N appears both in the test and in the selected
   branch, so it must be free of side effects.  The expansion is a
   brace block, not an expression. */
#define BUF_ALLOC(T, p, n)                      \
{                                               \
     if ((n) < MAX_STACK_ALLOC) {               \
          STACK_MALLOC(T, p, n);                \
     } else {                                   \
          p = (T)MALLOC(n, BUFFERS);            \
     }                                          \
}

/* BUF_FREE(p, n): release P.  The release path (STACK_FREE versus
   X(ifree)) is chosen by the same size test that BUF_ALLOC uses, so N
   should be the size that was given to BUF_ALLOC for this buffer. */
#define BUF_FREE(p, n)                          \
{                                               \
     if ((n) < MAX_STACK_ALLOC) {               \
          STACK_FREE(p);                        \
     } else {                                   \
          X(ifree)(p);                          \
     }                                          \
}
cannam@167 215
cannam@167 216 /*-----------------------------------------------------------------------*/
cannam@167 217 /* define uintptr_t if it is not already defined */
cannam@167 218
cannam@167 219 #ifndef HAVE_UINTPTR_T
cannam@167 220 # if SIZEOF_VOID_P == 0
cannam@167 221 # error sizeof void* is unknown!
cannam@167 222 # elif SIZEOF_UNSIGNED_INT == SIZEOF_VOID_P
cannam@167 223 typedef unsigned int uintptr_t;
cannam@167 224 # elif SIZEOF_UNSIGNED_LONG == SIZEOF_VOID_P
cannam@167 225 typedef unsigned long uintptr_t;
cannam@167 226 # elif SIZEOF_UNSIGNED_LONG_LONG == SIZEOF_VOID_P
cannam@167 227 typedef unsigned long long uintptr_t;
cannam@167 228 # else
cannam@167 229 # error no unsigned integer type matches void* sizeof!
cannam@167 230 # endif
cannam@167 231 #endif
cannam@167 232
cannam@167 233 /*-----------------------------------------------------------------------*/
/* We can do an optimization for copying pairs of (aligned) floats
   when in single precision if 2*float = double.

   FFTW_2R_IS_DOUBLE is always defined, to either 1 or 0.  It is
   computed here with #if instead of expanding to an expression that
   contains `defined': a `defined' operator that appears as the result
   of macro expansion inside #if has undefined behavior in ISO C, and
   such an expansion is not a valid expression outside #if at all. */
#if defined(FFTW_SINGLE) \
     && SIZEOF_FLOAT != 0 \
     && SIZEOF_DOUBLE == 2*SIZEOF_FLOAT
# define FFTW_2R_IS_DOUBLE 1
#else
# define FFTW_2R_IS_DOUBLE 0
#endif
cannam@167 240
cannam@167 241 #define DOUBLE_ALIGNED(p) ((((uintptr_t)(p)) % sizeof(double)) == 0)
cannam@167 242
cannam@167 243 /*-----------------------------------------------------------------------*/
cannam@167 244 /* assert.c: */
cannam@167 245 IFFTW_EXTERN void X(assertion_failed)(const char *s,
cannam@167 246 int line, const char *file);
cannam@167 247
cannam@167 248 /* always check */
cannam@167 249 #define CK(ex) \
cannam@167 250 (void)((ex) || (X(assertion_failed)(#ex, __LINE__, __FILE__), 0))
cannam@167 251
cannam@167 252 #ifdef FFTW_DEBUG
cannam@167 253 /* check only if debug enabled */
cannam@167 254 #define A(ex) \
cannam@167 255 (void)((ex) || (X(assertion_failed)(#ex, __LINE__, __FILE__), 0))
cannam@167 256 #else
cannam@167 257 #define A(ex) /* nothing */
cannam@167 258 #endif
cannam@167 259
cannam@167 260 extern void X(debug)(const char *format, ...);
cannam@167 261 #define D X(debug)
cannam@167 262
cannam@167 263 /*-----------------------------------------------------------------------*/
cannam@167 264 /* kalloc.c: */
cannam@167 265 extern void *X(kernel_malloc)(size_t n);
cannam@167 266 extern void X(kernel_free)(void *p);
cannam@167 267
cannam@167 268 /*-----------------------------------------------------------------------*/
cannam@167 269 /* alloc.c: */
cannam@167 270
cannam@167 271 /* objects allocated by malloc, for statistical purposes */
cannam@167 272 enum malloc_tag {
cannam@167 273 EVERYTHING,
cannam@167 274 PLANS,
cannam@167 275 SOLVERS,
cannam@167 276 PROBLEMS,
cannam@167 277 BUFFERS,
cannam@167 278 HASHT,
cannam@167 279 TENSORS,
cannam@167 280 PLANNERS,
cannam@167 281 SLVDESCS,
cannam@167 282 TWIDDLES,
cannam@167 283 STRIDES,
cannam@167 284 OTHER,
cannam@167 285 MALLOC_WHAT_LAST /* must be last */
cannam@167 286 };
cannam@167 287
cannam@167 288 IFFTW_EXTERN void X(ifree)(void *ptr);
cannam@167 289 extern void X(ifree0)(void *ptr);
cannam@167 290
cannam@167 291 IFFTW_EXTERN void *X(malloc_plain)(size_t sz);
cannam@167 292 #define MALLOC(n, what) X(malloc_plain)(n)
cannam@167 293
cannam@167 294 /*-----------------------------------------------------------------------*/
cannam@167 295 /* low-resolution clock */
cannam@167 296
cannam@167 297 #ifdef FAKE_CRUDE_TIME
cannam@167 298 typedef int crude_time;
cannam@167 299 #else
cannam@167 300 # if TIME_WITH_SYS_TIME
cannam@167 301 # include <sys/time.h>
cannam@167 302 # include <time.h>
cannam@167 303 # else
cannam@167 304 # if HAVE_SYS_TIME_H
cannam@167 305 # include <sys/time.h>
cannam@167 306 # else
cannam@167 307 # include <time.h>
cannam@167 308 # endif
cannam@167 309 # endif
cannam@167 310
cannam@167 311 # ifdef HAVE_BSDGETTIMEOFDAY
cannam@167 312 # ifndef HAVE_GETTIMEOFDAY
cannam@167 313 # define gettimeofday BSDgettimeofday
cannam@167 314 # define HAVE_GETTIMEOFDAY 1
cannam@167 315 # endif
cannam@167 316 # endif
cannam@167 317
cannam@167 318 # if defined(HAVE_GETTIMEOFDAY)
cannam@167 319 typedef struct timeval crude_time;
cannam@167 320 # else
cannam@167 321 typedef clock_t crude_time;
cannam@167 322 # endif
cannam@167 323 #endif /* else FAKE_CRUDE_TIME */
cannam@167 324
cannam@167 325 crude_time X(get_crude_time)(void);
cannam@167 326 double X(elapsed_since)(const planner *plnr, const problem *p,
cannam@167 327 crude_time t0); /* time in seconds since t0 */
cannam@167 328
cannam@167 329 /*-----------------------------------------------------------------------*/
cannam@167 330 /* ops.c: */
cannam@167 331 /*
cannam@167 332 * ops counter. The total number of additions is add + fma
cannam@167 333 * and the total number of multiplications is mul + fma.
cannam@167 334 * Total flops = add + mul + 2 * fma
cannam@167 335 */
cannam@167 336 typedef struct {
cannam@167 337 double add;
cannam@167 338 double mul;
cannam@167 339 double fma;
cannam@167 340 double other;
cannam@167 341 } opcnt;
cannam@167 342
cannam@167 343 void X(ops_zero)(opcnt *dst);
cannam@167 344 void X(ops_other)(INT o, opcnt *dst);
cannam@167 345 void X(ops_cpy)(const opcnt *src, opcnt *dst);
cannam@167 346
cannam@167 347 void X(ops_add)(const opcnt *a, const opcnt *b, opcnt *dst);
cannam@167 348 void X(ops_add2)(const opcnt *a, opcnt *dst);
cannam@167 349
cannam@167 350 /* dst = m * a + b */
cannam@167 351 void X(ops_madd)(INT m, const opcnt *a, const opcnt *b, opcnt *dst);
cannam@167 352
cannam@167 353 /* dst += m * a */
cannam@167 354 void X(ops_madd2)(INT m, const opcnt *a, opcnt *dst);
cannam@167 355
cannam@167 356
cannam@167 357 /*-----------------------------------------------------------------------*/
cannam@167 358 /* minmax.c: */
cannam@167 359 INT X(imax)(INT a, INT b);
cannam@167 360 INT X(imin)(INT a, INT b);
cannam@167 361
cannam@167 362 /*-----------------------------------------------------------------------*/
cannam@167 363 /* iabs.c: */
cannam@167 364 INT X(iabs)(INT a);
cannam@167 365
cannam@167 366 /* inline version */
cannam@167 367 #define IABS(x) (((x) < 0) ? (0 - (x)) : (x))
cannam@167 368
cannam@167 369 /*-----------------------------------------------------------------------*/
cannam@167 370 /* md5.c */
cannam@167 371
cannam@167 372 #if SIZEOF_UNSIGNED_INT >= 4
cannam@167 373 typedef unsigned int md5uint;
cannam@167 374 #else
cannam@167 375 typedef unsigned long md5uint; /* at least 32 bits as per C standard */
cannam@167 376 #endif
cannam@167 377
cannam@167 378 typedef md5uint md5sig[4];
cannam@167 379
cannam@167 380 typedef struct {
cannam@167 381 md5sig s; /* state and signature */
cannam@167 382
cannam@167 383 /* fields not meant to be used outside md5.c: */
cannam@167 384 unsigned char c[64]; /* stuff not yet processed */
cannam@167 385 unsigned l; /* total length. Should be 64 bits long, but this is
cannam@167 386 good enough for us */
cannam@167 387 } md5;
cannam@167 388
cannam@167 389 void X(md5begin)(md5 *p);
cannam@167 390 void X(md5putb)(md5 *p, const void *d_, size_t len);
cannam@167 391 void X(md5puts)(md5 *p, const char *s);
cannam@167 392 void X(md5putc)(md5 *p, unsigned char c);
cannam@167 393 void X(md5int)(md5 *p, int i);
cannam@167 394 void X(md5INT)(md5 *p, INT i);
cannam@167 395 void X(md5unsigned)(md5 *p, unsigned i);
cannam@167 396 void X(md5end)(md5 *p);
cannam@167 397
cannam@167 398 /*-----------------------------------------------------------------------*/
cannam@167 399 /* tensor.c: */
cannam@167 400 #define STRUCT_HACK_KR
cannam@167 401 #undef STRUCT_HACK_C99
cannam@167 402
cannam@167 403 typedef struct {
cannam@167 404 INT n;
cannam@167 405 INT is; /* input stride */
cannam@167 406 INT os; /* output stride */
cannam@167 407 } iodim;
cannam@167 408
cannam@167 409 typedef struct {
cannam@167 410 int rnk;
cannam@167 411 #if defined(STRUCT_HACK_KR)
cannam@167 412 iodim dims[1];
cannam@167 413 #elif defined(STRUCT_HACK_C99)
cannam@167 414 iodim dims[];
cannam@167 415 #else
cannam@167 416 iodim *dims;
cannam@167 417 #endif
cannam@167 418 } tensor;
cannam@167 419
cannam@167 420 /*
cannam@167 421 Definition of rank -infinity.
cannam@167 422 This definition has the property that if you want rank 0 or 1,
cannam@167 423 you can simply test for rank <= 1. This is a common case.
cannam@167 424
cannam@167 425 A tensor of rank -infinity has size 0.
cannam@167 426 */
cannam@167 427 #define RNK_MINFTY INT_MAX
cannam@167 428 #define FINITE_RNK(rnk) ((rnk) != RNK_MINFTY)
cannam@167 429
cannam@167 430 typedef enum { INPLACE_IS, INPLACE_OS } inplace_kind;
cannam@167 431
cannam@167 432 tensor *X(mktensor)(int rnk);
cannam@167 433 tensor *X(mktensor_0d)(void);
cannam@167 434 tensor *X(mktensor_1d)(INT n, INT is, INT os);
cannam@167 435 tensor *X(mktensor_2d)(INT n0, INT is0, INT os0,
cannam@167 436 INT n1, INT is1, INT os1);
cannam@167 437 tensor *X(mktensor_3d)(INT n0, INT is0, INT os0,
cannam@167 438 INT n1, INT is1, INT os1,
cannam@167 439 INT n2, INT is2, INT os2);
cannam@167 440 tensor *X(mktensor_4d)(INT n0, INT is0, INT os0,
cannam@167 441 INT n1, INT is1, INT os1,
cannam@167 442 INT n2, INT is2, INT os2,
cannam@167 443 INT n3, INT is3, INT os3);
cannam@167 444 tensor *X(mktensor_5d)(INT n0, INT is0, INT os0,
cannam@167 445 INT n1, INT is1, INT os1,
cannam@167 446 INT n2, INT is2, INT os2,
cannam@167 447 INT n3, INT is3, INT os3,
cannam@167 448 INT n4, INT is4, INT os4);
cannam@167 449 INT X(tensor_sz)(const tensor *sz);
cannam@167 450 void X(tensor_md5)(md5 *p, const tensor *t);
cannam@167 451 INT X(tensor_max_index)(const tensor *sz);
cannam@167 452 INT X(tensor_min_istride)(const tensor *sz);
cannam@167 453 INT X(tensor_min_ostride)(const tensor *sz);
cannam@167 454 INT X(tensor_min_stride)(const tensor *sz);
cannam@167 455 int X(tensor_inplace_strides)(const tensor *sz);
cannam@167 456 int X(tensor_inplace_strides2)(const tensor *a, const tensor *b);
cannam@167 457 int X(tensor_strides_decrease)(const tensor *sz, const tensor *vecsz,
cannam@167 458 inplace_kind k);
cannam@167 459 tensor *X(tensor_copy)(const tensor *sz);
cannam@167 460 int X(tensor_kosherp)(const tensor *x);
cannam@167 461
cannam@167 462 tensor *X(tensor_copy_inplace)(const tensor *sz, inplace_kind k);
cannam@167 463 tensor *X(tensor_copy_except)(const tensor *sz, int except_dim);
cannam@167 464 tensor *X(tensor_copy_sub)(const tensor *sz, int start_dim, int rnk);
cannam@167 465 tensor *X(tensor_compress)(const tensor *sz);
cannam@167 466 tensor *X(tensor_compress_contiguous)(const tensor *sz);
cannam@167 467 tensor *X(tensor_append)(const tensor *a, const tensor *b);
cannam@167 468 void X(tensor_split)(const tensor *sz, tensor **a, int a_rnk, tensor **b);
cannam@167 469 int X(tensor_tornk1)(const tensor *t, INT *n, INT *is, INT *os);
cannam@167 470 void X(tensor_destroy)(tensor *sz);
cannam@167 471 void X(tensor_destroy2)(tensor *a, tensor *b);
cannam@167 472 void X(tensor_destroy4)(tensor *a, tensor *b, tensor *c, tensor *d);
cannam@167 473 void X(tensor_print)(const tensor *sz, printer *p);
cannam@167 474 int X(dimcmp)(const iodim *a, const iodim *b);
cannam@167 475 int X(tensor_equal)(const tensor *a, const tensor *b);
cannam@167 476 int X(tensor_inplace_locations)(const tensor *sz, const tensor *vecsz);
cannam@167 477
cannam@167 478 /*-----------------------------------------------------------------------*/
cannam@167 479 /* problem.c: */
cannam@167 480 enum {
cannam@167 481 /* a problem that cannot be solved */
cannam@167 482 PROBLEM_UNSOLVABLE,
cannam@167 483
cannam@167 484 PROBLEM_DFT,
cannam@167 485 PROBLEM_RDFT,
cannam@167 486 PROBLEM_RDFT2,
cannam@167 487
cannam@167 488 /* for mpi/ subdirectory */
cannam@167 489 PROBLEM_MPI_DFT,
cannam@167 490 PROBLEM_MPI_RDFT,
cannam@167 491 PROBLEM_MPI_RDFT2,
cannam@167 492 PROBLEM_MPI_TRANSPOSE,
cannam@167 493
cannam@167 494 PROBLEM_LAST
cannam@167 495 };
cannam@167 496
cannam@167 497 typedef struct {
cannam@167 498 int problem_kind;
cannam@167 499 void (*hash) (const problem *ego, md5 *p);
cannam@167 500 void (*zero) (const problem *ego);
cannam@167 501 void (*print) (const problem *ego, printer *p);
cannam@167 502 void (*destroy) (problem *ego);
cannam@167 503 } problem_adt;
cannam@167 504
cannam@167 505 struct problem_s {
cannam@167 506 const problem_adt *adt;
cannam@167 507 };
cannam@167 508
cannam@167 509 problem *X(mkproblem)(size_t sz, const problem_adt *adt);
cannam@167 510 void X(problem_destroy)(problem *ego);
cannam@167 511 problem *X(mkproblem_unsolvable)(void);
cannam@167 512
cannam@167 513 /*-----------------------------------------------------------------------*/
cannam@167 514 /* print.c */
cannam@167 515 struct printer_s {
cannam@167 516 void (*print)(printer *p, const char *format, ...);
cannam@167 517 void (*vprint)(printer *p, const char *format, va_list ap);
cannam@167 518 void (*putchr)(printer *p, char c);
cannam@167 519 void (*cleanup)(printer *p);
cannam@167 520 int indent;
cannam@167 521 int indent_incr;
cannam@167 522 };
cannam@167 523
cannam@167 524 printer *X(mkprinter)(size_t size,
cannam@167 525 void (*putchr)(printer *p, char c),
cannam@167 526 void (*cleanup)(printer *p));
cannam@167 527 IFFTW_EXTERN void X(printer_destroy)(printer *p);
cannam@167 528
cannam@167 529 /*-----------------------------------------------------------------------*/
cannam@167 530 /* scan.c */
cannam@167 531 struct scanner_s {
cannam@167 532 int (*scan)(scanner *sc, const char *format, ...);
cannam@167 533 int (*vscan)(scanner *sc, const char *format, va_list ap);
cannam@167 534 int (*getchr)(scanner *sc);
cannam@167 535 int ungotc;
cannam@167 536 };
cannam@167 537
cannam@167 538 scanner *X(mkscanner)(size_t size, int (*getchr)(scanner *sc));
cannam@167 539 void X(scanner_destroy)(scanner *sc);
cannam@167 540
cannam@167 541 /*-----------------------------------------------------------------------*/
cannam@167 542 /* plan.c: */
cannam@167 543
cannam@167 544 enum wakefulness {
cannam@167 545 SLEEPY,
cannam@167 546 AWAKE_ZERO,
cannam@167 547 AWAKE_SQRTN_TABLE,
cannam@167 548 AWAKE_SINCOS
cannam@167 549 };
cannam@167 550
cannam@167 551 typedef struct {
cannam@167 552 void (*solve)(const plan *ego, const problem *p);
cannam@167 553 void (*awake)(plan *ego, enum wakefulness wakefulness);
cannam@167 554 void (*print)(const plan *ego, printer *p);
cannam@167 555 void (*destroy)(plan *ego);
cannam@167 556 } plan_adt;
cannam@167 557
cannam@167 558 struct plan_s {
cannam@167 559 const plan_adt *adt;
cannam@167 560 opcnt ops;
cannam@167 561 double pcost;
cannam@167 562 enum wakefulness wakefulness; /* used for debugging only */
cannam@167 563 int could_prune_now_p;
cannam@167 564 };
cannam@167 565
cannam@167 566 plan *X(mkplan)(size_t size, const plan_adt *adt);
cannam@167 567 void X(plan_destroy_internal)(plan *ego);
cannam@167 568 IFFTW_EXTERN void X(plan_awake)(plan *ego, enum wakefulness wakefulness);
cannam@167 569 void X(plan_null_destroy)(plan *ego);
cannam@167 570
cannam@167 571 /*-----------------------------------------------------------------------*/
cannam@167 572 /* solver.c: */
cannam@167 573 typedef struct {
cannam@167 574 int problem_kind;
cannam@167 575 plan *(*mkplan)(const solver *ego, const problem *p, planner *plnr);
cannam@167 576 void (*destroy)(solver *ego);
cannam@167 577 } solver_adt;
cannam@167 578
cannam@167 579 struct solver_s {
cannam@167 580 const solver_adt *adt;
cannam@167 581 int refcnt;
cannam@167 582 };
cannam@167 583
cannam@167 584 solver *X(mksolver)(size_t size, const solver_adt *adt);
cannam@167 585 void X(solver_use)(solver *ego);
cannam@167 586 void X(solver_destroy)(solver *ego);
cannam@167 587 void X(solver_register)(planner *plnr, solver *s);
cannam@167 588
cannam@167 589 /* shorthand */
cannam@167 590 #define MKSOLVER(type, adt) (type *)X(mksolver)(sizeof(type), adt)
cannam@167 591
cannam@167 592 /*-----------------------------------------------------------------------*/
cannam@167 593 /* planner.c */
cannam@167 594
cannam@167 595 typedef struct slvdesc_s {
cannam@167 596 solver *slv;
cannam@167 597 const char *reg_nam;
cannam@167 598 unsigned nam_hash;
cannam@167 599 int reg_id;
cannam@167 600 int next_for_same_problem_kind;
cannam@167 601 } slvdesc;
cannam@167 602
cannam@167 603 typedef struct solution_s solution; /* opaque */
cannam@167 604
cannam@167 605 /* interpretation of L and U:
cannam@167 606
cannam@167 607 - if it returns a plan, the planner guarantees that all applicable
cannam@167 608 plans at least as impatient as U have been tried, and that each
cannam@167 609 plan in the solution is at least as impatient as L.
cannam@167 610
cannam@167 611 - if it returns 0, the planner guarantees to have tried all solvers
cannam@167 612 at least as impatient as L, and that none of them was applicable.
cannam@167 613
cannam@167 614 The structure is packed to fit into 64 bits.
cannam@167 615 */
cannam@167 616
cannam@167 617 typedef struct {
cannam@167 618 unsigned l:20;
cannam@167 619 unsigned hash_info:3;
cannam@167 620 # define BITS_FOR_TIMELIMIT 9
cannam@167 621 unsigned timelimit_impatience:BITS_FOR_TIMELIMIT;
cannam@167 622 unsigned u:20;
cannam@167 623
cannam@167 624 /* abstraction break: we store the solver here to pad the
cannam@167 625 structure to 64 bits. Otherwise, the struct is padded to 64
cannam@167 626 bits anyway, and another word is allocated for slvndx. */
cannam@167 627 # define BITS_FOR_SLVNDX 12
cannam@167 628 unsigned slvndx:BITS_FOR_SLVNDX;
cannam@167 629 } flags_t;
cannam@167 630
cannam@167 631 /* impatience flags */
cannam@167 632 enum {
cannam@167 633 BELIEVE_PCOST = 0x0001,
cannam@167 634 ESTIMATE = 0x0002,
cannam@167 635 NO_DFT_R2HC = 0x0004,
cannam@167 636 NO_SLOW = 0x0008,
cannam@167 637 NO_VRECURSE = 0x0010,
cannam@167 638 NO_INDIRECT_OP = 0x0020,
cannam@167 639 NO_LARGE_GENERIC = 0x0040,
cannam@167 640 NO_RANK_SPLITS = 0x0080,
cannam@167 641 NO_VRANK_SPLITS = 0x0100,
cannam@167 642 NO_NONTHREADED = 0x0200,
cannam@167 643 NO_BUFFERING = 0x0400,
cannam@167 644 NO_FIXED_RADIX_LARGE_N = 0x0800,
cannam@167 645 NO_DESTROY_INPUT = 0x1000,
cannam@167 646 NO_SIMD = 0x2000,
cannam@167 647 CONSERVE_MEMORY = 0x4000,
cannam@167 648 NO_DHT_R2HC = 0x8000,
cannam@167 649 NO_UGLY = 0x10000,
cannam@167 650 ALLOW_PRUNING = 0x20000
cannam@167 651 };
cannam@167 652
cannam@167 653 /* hashtable information */
cannam@167 654 enum {
cannam@167 655 BLESSING = 0x1u, /* save this entry */
cannam@167 656 H_VALID = 0x2u, /* valid hashtable entry */
cannam@167 657 H_LIVE = 0x4u /* entry is nonempty, implies H_VALID */
cannam@167 658 };
cannam@167 659
cannam@167 660 #define PLNR_L(plnr) ((plnr)->flags.l)
cannam@167 661 #define PLNR_U(plnr) ((plnr)->flags.u)
cannam@167 662 #define PLNR_TIMELIMIT_IMPATIENCE(plnr) ((plnr)->flags.timelimit_impatience)
cannam@167 663
cannam@167 664 #define ESTIMATEP(plnr) (PLNR_U(plnr) & ESTIMATE)
cannam@167 665 #define BELIEVE_PCOSTP(plnr) (PLNR_U(plnr) & BELIEVE_PCOST)
cannam@167 666 #define ALLOW_PRUNINGP(plnr) (PLNR_U(plnr) & ALLOW_PRUNING)
cannam@167 667
cannam@167 668 #define NO_INDIRECT_OP_P(plnr) (PLNR_L(plnr) & NO_INDIRECT_OP)
cannam@167 669 #define NO_LARGE_GENERICP(plnr) (PLNR_L(plnr) & NO_LARGE_GENERIC)
cannam@167 670 #define NO_RANK_SPLITSP(plnr) (PLNR_L(plnr) & NO_RANK_SPLITS)
cannam@167 671 #define NO_VRANK_SPLITSP(plnr) (PLNR_L(plnr) & NO_VRANK_SPLITS)
cannam@167 672 #define NO_VRECURSEP(plnr) (PLNR_L(plnr) & NO_VRECURSE)
cannam@167 673 #define NO_DFT_R2HCP(plnr) (PLNR_L(plnr) & NO_DFT_R2HC)
cannam@167 674 #define NO_SLOWP(plnr) (PLNR_L(plnr) & NO_SLOW)
cannam@167 675 #define NO_UGLYP(plnr) (PLNR_L(plnr) & NO_UGLY)
cannam@167 676 #define NO_FIXED_RADIX_LARGE_NP(plnr) \
cannam@167 677 (PLNR_L(plnr) & NO_FIXED_RADIX_LARGE_N)
cannam@167 678 #define NO_NONTHREADEDP(plnr) \
cannam@167 679 ((PLNR_L(plnr) & NO_NONTHREADED) && (plnr)->nthr > 1)
cannam@167 680
cannam@167 681 #define NO_DESTROY_INPUTP(plnr) (PLNR_L(plnr) & NO_DESTROY_INPUT)
cannam@167 682 #define NO_SIMDP(plnr) (PLNR_L(plnr) & NO_SIMD)
cannam@167 683 #define CONSERVE_MEMORYP(plnr) (PLNR_L(plnr) & CONSERVE_MEMORY)
cannam@167 684 #define NO_DHT_R2HCP(plnr) (PLNR_L(plnr) & NO_DHT_R2HC)
cannam@167 685 #define NO_BUFFERINGP(plnr) (PLNR_L(plnr) & NO_BUFFERING)
cannam@167 686
cannam@167 687 typedef enum { FORGET_ACCURSED, FORGET_EVERYTHING } amnesia;
cannam@167 688
cannam@167 689 typedef enum {
cannam@167 690 /* WISDOM_NORMAL: planner may or may not use wisdom */
cannam@167 691 WISDOM_NORMAL,
cannam@167 692
cannam@167 693 /* WISDOM_ONLY: planner must use wisdom and must avoid searching */
cannam@167 694 WISDOM_ONLY,
cannam@167 695
cannam@167 696 /* WISDOM_IS_BOGUS: planner must return 0 as quickly as possible */
cannam@167 697 WISDOM_IS_BOGUS,
cannam@167 698
cannam@167 699 /* WISDOM_IGNORE_INFEASIBLE: planner ignores infeasible wisdom */
cannam@167 700 WISDOM_IGNORE_INFEASIBLE,
cannam@167 701
cannam@167 702 /* WISDOM_IGNORE_ALL: planner ignores all */
cannam@167 703 WISDOM_IGNORE_ALL
cannam@167 704 } wisdom_state_t;
cannam@167 705
cannam@167 706 typedef struct {
cannam@167 707 void (*register_solver)(planner *ego, solver *s);
cannam@167 708 plan *(*mkplan)(planner *ego, const problem *p);
cannam@167 709 void (*forget)(planner *ego, amnesia a);
cannam@167 710 void (*exprt)(planner *ego, printer *p); /* ``export'' is a reserved
cannam@167 711 word in C++. */
cannam@167 712 int (*imprt)(planner *ego, scanner *sc);
cannam@167 713 } planner_adt;
cannam@167 714
cannam@167 715 /* hash table of solutions */
cannam@167 716 typedef struct {
cannam@167 717 solution *solutions;
cannam@167 718 unsigned hashsiz, nelem;
cannam@167 719
cannam@167 720 /* statistics */
cannam@167 721 int lookup, succ_lookup, lookup_iter;
cannam@167 722 int insert, insert_iter, insert_unknown;
cannam@167 723 int nrehash;
cannam@167 724 } hashtab;
cannam@167 725
cannam@167 726 typedef enum { COST_SUM, COST_MAX } cost_kind;
cannam@167 727
cannam@167 728 struct planner_s {
cannam@167 729 const planner_adt *adt;
cannam@167 730 void (*hook)(struct planner_s *plnr, plan *pln,
cannam@167 731 const problem *p, int optimalp);
cannam@167 732 double (*cost_hook)(const problem *p, double t, cost_kind k);
cannam@167 733 int (*wisdom_ok_hook)(const problem *p, flags_t flags);
cannam@167 734 void (*nowisdom_hook)(const problem *p);
cannam@167 735 wisdom_state_t (*bogosity_hook)(wisdom_state_t state, const problem *p);
cannam@167 736
cannam@167 737 /* solver descriptors */
cannam@167 738 slvdesc *slvdescs;
cannam@167 739 unsigned nslvdesc, slvdescsiz;
cannam@167 740 const char *cur_reg_nam;
cannam@167 741 int cur_reg_id;
cannam@167 742 int slvdescs_for_problem_kind[PROBLEM_LAST];
cannam@167 743
cannam@167 744 wisdom_state_t wisdom_state;
cannam@167 745
cannam@167 746 hashtab htab_blessed;
cannam@167 747 hashtab htab_unblessed;
cannam@167 748
cannam@167 749 int nthr;
cannam@167 750 flags_t flags;
cannam@167 751
cannam@167 752 crude_time start_time;
cannam@167 753 double timelimit; /* elapsed_since(start_time) at which to bail out */
cannam@167 754 int timed_out; /* whether most recent search timed out */
cannam@167 755 int need_timeout_check;
cannam@167 756
cannam@167 757 /* various statistics */
cannam@167 758 int nplan; /* number of plans evaluated */
cannam@167 759 double pcost, epcost; /* total pcost of measured/estimated plans */
cannam@167 760 int nprob; /* number of problems evaluated */
cannam@167 761 };
cannam@167 762
cannam@167 763 planner *X(mkplanner)(void);
cannam@167 764 void X(planner_destroy)(planner *ego);
cannam@167 765
/*
  Iterate over all solvers.  Read:

  @article{ baker93iterators,
  author = "Henry G. Baker, Jr.",
  title = "Iterators: Signs of Weakness in Object-Oriented Languages",
  journal = "{ACM} {OOPS} Messenger",
  volume = "4",
  number = "3",
  pages = "18--25",
  year = "1993"
  }

  FORALL_SOLVERS(ego, s, p, what) runs the statement WHAT once for each
  of the first ego->nslvdesc entries of ego->slvdescs.  Inside WHAT, P
  names the current slvdesc * and S names its solver (P->slv).

  EGO is expanded more than once, so it must be free of side effects;
  it is parenthesized so that an expression such as `&planner' works.
  WHAT must not use the identifier _cnt, which holds the loop counter.
*/
#define FORALL_SOLVERS(ego, s, p, what)                         \
{                                                               \
     unsigned _cnt;                                             \
     for (_cnt = 0; _cnt < (ego)->nslvdesc; ++_cnt) {           \
          slvdesc *p = (ego)->slvdescs + _cnt;                  \
          solver *s = p->slv;                                   \
          what;                                                 \
     }                                                          \
}
cannam@167 787
/*
  Iterate over the solvers registered for one problem kind.

  The walk starts at index ego->slvdescs_for_problem_kind[kind] and
  follows each descriptor's next_for_same_problem_kind link until the
  index becomes negative.  Inside WHAT, P names the current slvdesc *
  and S names its solver (P->slv).

  EGO is expanded more than once, so it must be free of side effects;
  it is parenthesized so that an expression such as `&planner' works.
  WHAT must not use the identifier _cnt.  The link is followed only
  after WHAT has run, so a `continue' inside WHAT would skip the
  advance and repeat the same descriptor.
*/
#define FORALL_SOLVERS_OF_KIND(kind, ego, s, p, what)           \
{                                                               \
     int _cnt = (ego)->slvdescs_for_problem_kind[kind];         \
     while (_cnt >= 0) {                                        \
          slvdesc *p = (ego)->slvdescs + _cnt;                  \
          solver *s = p->slv;                                   \
          what;                                                 \
          _cnt = p->next_for_same_problem_kind;                 \
     }                                                          \
}
cannam@167 798
cannam@167 799
cannam@167 800 /* make plan, destroy problem */
cannam@167 801 plan *X(mkplan_d)(planner *ego, problem *p);
cannam@167 802 plan *X(mkplan_f_d)(planner *ego, problem *p,
cannam@167 803 unsigned l_set, unsigned u_set, unsigned u_reset);
cannam@167 804
cannam@167 805 /*-----------------------------------------------------------------------*/
cannam@167 806 /* stride.c: */
cannam@167 807
/* When PRECOMPUTE_ARRAY_INDICES is defined, all strides are
   precomputed.  It is enabled on 32- and 64-bit x86 (GNU-style or MSVC
   predefined macros), except in long-double builds. */
#if !defined(FFTW_LDOUBLE) && \
    (defined(__i386__) || defined(__x86_64__) || \
     (defined(_M_IX86) && _M_IX86 >= 500))
#  define PRECOMPUTE_ARRAY_INDICES
#endif
cannam@167 812
cannam@167 813 extern const INT X(an_INT_guaranteed_to_be_zero);
cannam@167 814
cannam@167 815 #ifdef PRECOMPUTE_ARRAY_INDICES
cannam@167 816 typedef INT *stride;
cannam@167 817 #define WS(stride, i) (stride[i])
cannam@167 818 extern stride X(mkstride)(INT n, INT s);
cannam@167 819 void X(stride_destroy)(stride p);
cannam@167 820 /* hackery to prevent the compiler from copying the strides array
cannam@167 821 onto the stack */
cannam@167 822 #define MAKE_VOLATILE_STRIDE(nptr, x) (x) = (x) + X(an_INT_guaranteed_to_be_zero)
cannam@167 823 #else
cannam@167 824
cannam@167 825 typedef INT stride;
/* WS(stride, i) is the offset of element i: stride times i.  Both
   arguments are parenthesized so that expressions such as
   WS(rs, k + 1) expand correctly. */
#define WS(stride, i)  ((stride) * (i))

/* mkstride()/stride_destroy() are no-ops in this configuration.  The
   mangled names are spelled out because the X() name-mangling macro
   cannot be used as the name of a macro being defined; one pair is
   needed per precision prefix (the fftwq_ pair covers quad
   precision). */
#define fftwf_mkstride(n, stride) (stride)
#define fftw_mkstride(n, stride) (stride)
#define fftwl_mkstride(n, stride) (stride)
#define fftwq_mkstride(n, stride) (stride)
#define fftwf_stride_destroy(p) ((void) (p))
#define fftw_stride_destroy(p) ((void) (p))
#define fftwl_stride_destroy(p) ((void) (p))
#define fftwq_stride_destroy(p) ((void) (p))
cannam@167 833
/* hackery to prevent the compiler from ``optimizing'' induction
   variables in codelet loops.  The problem is that for each K and for
   each expression of the form P[I + STRIDE * K] in a loop, most
   compilers will try to lift an induction variable PK := &P[I + STRIDE * K].
   For large values of K this behavior overflows the
   register set, which is likely worse than doing the index computation
   in the first place.

   If we guess that there are more than
   ESTIMATED_AVAILABLE_INDEX_REGISTERS such pointers, we deliberately confuse
   the compiler by setting STRIDE ^= ZERO, where ZERO is a value guaranteed to
   be 0, but the compiler does not know this.

   16 registers ought to be enough for anybody, or so the amd64 and ARM ISAs
   seem to imply.

   MAKE_VOLATILE_STRIDE(nptr, x): NPTR is the estimated number of such
   pointers and X is the stride lvalue.  The expansion is an
   expression: 0 when NPTR is within the register estimate, otherwise
   the result of the assignment to X.  NPTR is parenthesized so that
   any expression may be passed.
*/
#define ESTIMATED_AVAILABLE_INDEX_REGISTERS 16
#define MAKE_VOLATILE_STRIDE(nptr, x)                   \
     ((nptr) <= ESTIMATED_AVAILABLE_INDEX_REGISTERS ?   \
        0 :                                             \
      ((x) = (x) ^ X(an_INT_guaranteed_to_be_zero)))
cannam@167 855 #endif /* PRECOMPUTE_ARRAY_INDICES */
cannam@167 856
cannam@167 857 /*-----------------------------------------------------------------------*/
cannam@167 858 /* solvtab.c */
cannam@167 859
cannam@167 860 struct solvtab_s { void (*reg)(planner *); const char *reg_nam; };
cannam@167 861 typedef struct solvtab_s solvtab[];
cannam@167 862 void X(solvtab_exec)(const solvtab tbl, planner *p);
cannam@167 863 #define SOLVTAB(s) { s, STRINGIZE(s) }
cannam@167 864 #define SOLVTAB_END { 0, 0 }
cannam@167 865
cannam@167 866 /*-----------------------------------------------------------------------*/
cannam@167 867 /* pickdim.c */
cannam@167 868 int X(pickdim)(int which_dim, const int *buddies, size_t nbuddies,
cannam@167 869 const tensor *sz, int oop, int *dp);
cannam@167 870
cannam@167 871 /*-----------------------------------------------------------------------*/
cannam@167 872 /* twiddle.c */
/* little language to express twiddle factors computation */
enum {
     TW_COS = 0,
     TW_SIN = 1,
     TW_CEXP = 2,
     TW_NEXT = 3,
     TW_FULL = 4,
     TW_HALF = 5
};

/* One instruction of the twiddle language.  `op' presumably holds one
   of the TW_* codes above; the meaning of `v' and `i' is not visible
   here -- see twiddle.c. */
typedef struct {
     unsigned char op;
     signed char v;
     short i;
} tw_instr;
cannam@167 882
cannam@167 883 typedef struct twid_s {
cannam@167 884 R *W; /* array of twiddle factors */
cannam@167 885 INT n, r, m; /* transform order, radix, # twiddle rows */
cannam@167 886 int refcnt;
cannam@167 887 const tw_instr *instr;
cannam@167 888 struct twid_s *cdr;
cannam@167 889 enum wakefulness wakefulness;
cannam@167 890 } twid;
cannam@167 891
cannam@167 892 INT X(twiddle_length)(INT r, const tw_instr *p);
cannam@167 893 void X(twiddle_awake)(enum wakefulness wakefulness,
cannam@167 894 twid **pp, const tw_instr *instr, INT n, INT r, INT m);
cannam@167 895
cannam@167 896 /*-----------------------------------------------------------------------*/
cannam@167 897 /* trig.c */
/* trigreal: the floating-point type used by the trig routines.  It is
   long double or __float128 when the matching TRIGREAL_IS_* macro is
   defined, and double otherwise. */
#if defined(TRIGREAL_IS_LONG_DOUBLE)
   typedef long double trigreal;
#elif defined(TRIGREAL_IS_QUAD)
   typedef __float128 trigreal;
#else
   typedef double trigreal;
#endif
cannam@167 905
cannam@167 906 typedef struct triggen_s triggen;
cannam@167 907
cannam@167 908 struct triggen_s {
cannam@167 909 void (*cexp)(triggen *t, INT m, R *result);
cannam@167 910 void (*cexpl)(triggen *t, INT m, trigreal *result);
cannam@167 911 void (*rotate)(triggen *p, INT m, R xr, R xi, R *res);
cannam@167 912
cannam@167 913 INT twshft;
cannam@167 914 INT twradix;
cannam@167 915 INT twmsk;
cannam@167 916 trigreal *W0, *W1;
cannam@167 917 INT n;
cannam@167 918 };
cannam@167 919
cannam@167 920 triggen *X(mktriggen)(enum wakefulness wakefulness, INT n);
cannam@167 921 void X(triggen_destroy)(triggen *p);
cannam@167 922
cannam@167 923 /*-----------------------------------------------------------------------*/
cannam@167 924 /* primes.c: */
cannam@167 925
/* (x * y) mod p.  The product is formed directly only when
   x + y <= 92681; for non-negative operands that bounds x * y by
   46340.5^2, which is below 2^31 (presumably chosen so that the
   product fits even a 32-bit INT).  All other cases go through
   safe_mulmod().  X and Y are each expanded more than once. */
#define MULMOD(x, y, p)                         \
     (((x) <= 92681 - (y))                      \
      ? ((x) * (y)) % (p)                       \
      : X(safe_mulmod)(x, y, p))
cannam@167 928
cannam@167 929 INT X(safe_mulmod)(INT x, INT y, INT p);
cannam@167 930 INT X(power_mod)(INT n, INT m, INT p);
cannam@167 931 INT X(find_generator)(INT p);
cannam@167 932 INT X(first_divisor)(INT n);
cannam@167 933 int X(is_prime)(INT n);
cannam@167 934 INT X(next_prime)(INT n);
cannam@167 935 int X(factors_into)(INT n, const INT *primes);
cannam@167 936 int X(factors_into_small_primes)(INT n);
cannam@167 937 INT X(choose_radix)(INT r, INT n);
cannam@167 938 INT X(isqrt)(INT n);
cannam@167 939 INT X(modulo)(INT a, INT n);
cannam@167 940
/* min prime for which generic becomes bad */
#define GENERIC_MIN_BAD     173

/* thresholds below which certain solvers are considered SLOW.  These
   are guesses believed to be conservative */
#define GENERIC_MAX_SLOW    16
#define RADER_MAX_SLOW      32
#define BLUESTEIN_MAX_SLOW  24
cannam@167 948
cannam@167 949 /*-----------------------------------------------------------------------*/
cannam@167 950 /* rader.c: */
cannam@167 951 typedef struct rader_tls rader_tl;
cannam@167 952
cannam@167 953 void X(rader_tl_insert)(INT k1, INT k2, INT k3, R *W, rader_tl **tl);
cannam@167 954 R *X(rader_tl_find)(INT k1, INT k2, INT k3, rader_tl *t);
cannam@167 955 void X(rader_tl_delete)(R *W, rader_tl **tl);
cannam@167 956
cannam@167 957 /*-----------------------------------------------------------------------*/
cannam@167 958 /* copy/transposition routines */
cannam@167 959
cannam@167 960 /* lower bound to the cache size, for tiled routines */
cannam@167 961 #define CACHESIZE 8192
cannam@167 962
cannam@167 963 INT X(compute_tilesz)(INT vl, int how_many_tiles_in_cache);
cannam@167 964
cannam@167 965 void X(tile2d)(INT n0l, INT n0u, INT n1l, INT n1u, INT tilesz,
cannam@167 966 void (*f)(INT n0l, INT n0u, INT n1l, INT n1u, void *args),
cannam@167 967 void *args);
cannam@167 968 void X(cpy1d)(R *I, R *O, INT n0, INT is0, INT os0, INT vl);
cannam@167 969 void X(zero1d_pair)(R *O0, R *O1, INT n0, INT os0);
cannam@167 970 void X(cpy2d)(R *I, R *O,
cannam@167 971 INT n0, INT is0, INT os0,
cannam@167 972 INT n1, INT is1, INT os1,
cannam@167 973 INT vl);
cannam@167 974 void X(cpy2d_ci)(R *I, R *O,
cannam@167 975 INT n0, INT is0, INT os0,
cannam@167 976 INT n1, INT is1, INT os1,
cannam@167 977 INT vl);
cannam@167 978 void X(cpy2d_co)(R *I, R *O,
cannam@167 979 INT n0, INT is0, INT os0,
cannam@167 980 INT n1, INT is1, INT os1,
cannam@167 981 INT vl);
cannam@167 982 void X(cpy2d_tiled)(R *I, R *O,
cannam@167 983 INT n0, INT is0, INT os0,
cannam@167 984 INT n1, INT is1, INT os1,
cannam@167 985 INT vl);
cannam@167 986 void X(cpy2d_tiledbuf)(R *I, R *O,
cannam@167 987 INT n0, INT is0, INT os0,
cannam@167 988 INT n1, INT is1, INT os1,
cannam@167 989 INT vl);
cannam@167 990 void X(cpy2d_pair)(R *I0, R *I1, R *O0, R *O1,
cannam@167 991 INT n0, INT is0, INT os0,
cannam@167 992 INT n1, INT is1, INT os1);
cannam@167 993 void X(cpy2d_pair_ci)(R *I0, R *I1, R *O0, R *O1,
cannam@167 994 INT n0, INT is0, INT os0,
cannam@167 995 INT n1, INT is1, INT os1);
cannam@167 996 void X(cpy2d_pair_co)(R *I0, R *I1, R *O0, R *O1,
cannam@167 997 INT n0, INT is0, INT os0,
cannam@167 998 INT n1, INT is1, INT os1);
cannam@167 999
cannam@167 1000 void X(transpose)(R *I, INT n, INT s0, INT s1, INT vl);
cannam@167 1001 void X(transpose_tiled)(R *I, INT n, INT s0, INT s1, INT vl);
cannam@167 1002 void X(transpose_tiledbuf)(R *I, INT n, INT s0, INT s1, INT vl);
cannam@167 1003
cannam@167 1004 typedef void (*transpose_func)(R *I, INT n, INT s0, INT s1, INT vl);
cannam@167 1005 typedef void (*cpy2d_func)(R *I, R *O,
cannam@167 1006 INT n0, INT is0, INT os0,
cannam@167 1007 INT n1, INT is1, INT os1,
cannam@167 1008 INT vl);
cannam@167 1009
cannam@167 1010 /*-----------------------------------------------------------------------*/
cannam@167 1011 /* misc stuff */
cannam@167 1012 void X(null_awake)(plan *ego, enum wakefulness wakefulness);
cannam@167 1013 double X(iestimate_cost)(const planner *, const plan *, const problem *);
cannam@167 1014
cannam@167 1015 #ifdef FFTW_RANDOM_ESTIMATOR
cannam@167 1016 extern unsigned X(random_estimate_seed);
cannam@167 1017 #endif
cannam@167 1018
cannam@167 1019 double X(measure_execution_time)(const planner *plnr,
cannam@167 1020 plan *pln, const problem *p);
cannam@167 1021 IFFTW_EXTERN int X(ialignment_of)(R *p);
cannam@167 1022 unsigned X(hash)(const char *s);
cannam@167 1023 INT X(nbuf)(INT n, INT vl, INT maxnbuf);
cannam@167 1024 int X(nbuf_redundant)(INT n, INT vl, size_t which,
cannam@167 1025 const INT *maxnbuf, size_t nmaxnbuf);
cannam@167 1026 INT X(bufdist)(INT n, INT vl);
cannam@167 1027 int X(toobig)(INT n);
cannam@167 1028 int X(ct_uglyp)(INT min_n, INT v, INT n, INT r);
cannam@167 1029
/* Pointer tainting.  With SIMD support, UNTAINT clears the two low bits
   of a pointer and TAINTOF extracts them, while TAINT and JOIN_TAINT
   call into the library.  Without SIMD, every macro is a no-op: TAINT,
   UNTAINT and JOIN_TAINT yield their first argument unchanged (the
   other arguments are not evaluated) and TAINTOF is 0.  Each no-op
   expansion is parenthesized so that expression arguments such as
   `p + 1' stay intact. */
#if HAVE_SIMD
     R *X(taint)(R *p, INT s);
     R *X(join_taint)(R *p1, R *p2);
#    define TAINT(p, s) X(taint)(p, s)
#    define UNTAINT(p) ((R *) (((uintptr_t) (p)) & ~(uintptr_t)3))
#    define TAINTOF(p) (((uintptr_t)(p)) & 3)
#    define JOIN_TAINT(p1, p2) X(join_taint)(p1, p2)
#else
#    define TAINT(p, s) (p)
#    define UNTAINT(p) (p)
#    define TAINTOF(p) 0
#    define JOIN_TAINT(p1, p2) (p1)
#endif

#define ASSERT_ALIGNED_DOUBLE  /*unused, legacy*/
cannam@167 1045
cannam@167 1046 /*-----------------------------------------------------------------------*/
cannam@167 1047 /* macros used in codelets to reduce source code size */
cannam@167 1048
cannam@167 1049 typedef R E; /* internal precision of codelets. */
cannam@167 1050
/* K(x) turns the literal X into a constant of type E.  In long-double
   and quad builds the matching suffix (L or Q) is pasted onto the
   literal first, so X must be a bare floating-point literal. */
#ifdef FFTW_LDOUBLE
#  define K(x) ((E) x##L)
#elif defined(FFTW_QUAD)
#  define K(x) ((E) x##Q)
#else
#  define K(x) ((E) x)
#endif

/* DK(name, value) declares the constant NAME of type E with value
   K(VALUE); the caller supplies the terminating semicolon. */
#define DK(name, value) const E name = K(value)
cannam@167 1059
cannam@167 1060 /* FMA macros */
cannam@167 1061
cannam@167 1062 #if defined(__GNUC__) && (defined(__powerpc__) || defined(__ppc__) || defined(_POWER))
cannam@167 1063 /* The obvious expression a * b + c does not work. If both x = a * b
cannam@167 1064 + c and y = a * b - c appear in the source, gcc computes t = a * b,
cannam@167 1065 x = t + c, y = t - c, thus destroying the fma.
cannam@167 1066
cannam@167 1067 This peculiar coding seems to do the right thing on all of
cannam@167 1068 gcc-2.95, gcc-3.1, gcc-3.2, and gcc-3.3. It does the right thing
cannam@167 1069 on gcc-3.4 -fno-web (because the ``web'' pass splits the variable
cannam@167 1070 `x' for the single-assignment form).
cannam@167 1071
cannam@167 1072 However, gcc-4.0 is a formidable adversary which succeeds in
cannam@167 1073 pessimizing two fma's into one multiplication and two additions.
cannam@167 1074 It does it very early in the game---before the optimization passes
cannam@167 1075 even start. The only real workaround seems to use fake inline asm
cannam@167 1076 such as
cannam@167 1077
cannam@167 1078 asm ("# confuse gcc %0" : "=f"(a) : "0"(a));
cannam@167 1079 return a * b + c;
cannam@167 1080
cannam@167 1081 in each of the FMA, FMS, FNMA, and FNMS functions. However, this
cannam@167 1082 does not solve the problem either, because two equal asm statements
cannam@167 1083 count as a common subexpression! One must use *different* fake asm
cannam@167 1084 statements:
cannam@167 1085
cannam@167 1086 in FMA:
cannam@167 1087 asm ("# confuse gcc for fma %0" : "=f"(a) : "0"(a));
cannam@167 1088
cannam@167 1089 in FMS:
cannam@167 1090 asm ("# confuse gcc for fms %0" : "=f"(a) : "0"(a));
cannam@167 1091
cannam@167 1092 etc.
cannam@167 1093
cannam@167 1094 After these changes, gcc recalcitrantly generates the fma that was
cannam@167 1095 in the source to begin with. However, the extra asm() cruft
cannam@167 1096 confuses other passes of gcc, notably the instruction scheduler.
cannam@167 1097 (Of course, one could also generate the fma directly via inline
cannam@167 1098 asm, but this confuses the scheduler even more.)
cannam@167 1099
cannam@167 1100 Steven and I have submitted more than one bug report to the gcc
cannam@167 1101 mailing list over the past few years, to no effect. Thus, I give
cannam@167 1102 up. gcc-4.0 can go to hell. I'll wait at least until gcc-4.3 is
cannam@167 1103 out before touching this crap again.
cannam@167 1104 */
cannam@167 1105 static __inline__ E FMA(E a, E b, E c)
cannam@167 1106 {
cannam@167 1107 E x = a * b;
cannam@167 1108 x = x + c;
cannam@167 1109 return x;
cannam@167 1110 }
cannam@167 1111
cannam@167 1112 static __inline__ E FMS(E a, E b, E c)
cannam@167 1113 {
cannam@167 1114 E x = a * b;
cannam@167 1115 x = x - c;
cannam@167 1116 return x;
cannam@167 1117 }
cannam@167 1118
cannam@167 1119 static __inline__ E FNMA(E a, E b, E c)
cannam@167 1120 {
cannam@167 1121 E x = a * b;
cannam@167 1122 x = - (x + c);
cannam@167 1123 return x;
cannam@167 1124 }
cannam@167 1125
cannam@167 1126 static __inline__ E FNMS(E a, E b, E c)
cannam@167 1127 {
cannam@167 1128 E x = a * b;
cannam@167 1129 x = - (x - c);
cannam@167 1130 return x;
cannam@167 1131 }
cannam@167 1132 #else
/* Plain-expression forms of the four operations:
     FMA  =   a * b + c        FMS  =  a * b - c
     FNMA = -(a * b + c)       FNMS =  c - a * b
   Every argument is parenthesized and expanded exactly once. */
#define FMA(a, b, c)   (((a) * (b)) + (c))
#define FMS(a, b, c)   (((a) * (b)) - (c))
#define FNMA(a, b, c)  (- (((a) * (b)) + (c)))
#define FNMS(a, b, c)  ((c) - ((a) * (b)))
cannam@167 1137 #endif
cannam@167 1138
cannam@167 1139 #ifdef __cplusplus
cannam@167 1140 } /* extern "C" */
cannam@167 1141 #endif /* __cplusplus */
cannam@167 1142
cannam@167 1143 #endif /* __IFFTW_H__ */