comparison src/fftw-3.3.5/kernel/ifftw.h @ 42:2cd0e3b3e1fd

Current fftw source
author Chris Cannam
date Tue, 18 Oct 2016 13:40:26 +0100
parents
children
comparison
equal deleted inserted replaced
41:481f5f8c5634 42:2cd0e3b3e1fd
1 /*
2 * Copyright (c) 2003, 2007-14 Matteo Frigo
3 * Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
4 *
5 * This program is free software; you can redistribute it and/or modify
6 * it under the terms of the GNU General Public License as published by
7 * the Free Software Foundation; either version 2 of the License, or
8 * (at your option) any later version.
9 *
10 * This program is distributed in the hope that it will be useful,
11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 * GNU General Public License for more details.
14 *
15 * You should have received a copy of the GNU General Public License
16 * along with this program; if not, write to the Free Software
17 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
18 *
19 */
20
21
22 /* FFTW internal header file */
23 #ifndef __IFFTW_H__
24 #define __IFFTW_H__
25
26 #include "config.h"
27
28 #include <stdlib.h> /* size_t */
29 #include <stdarg.h> /* va_list */
30 #include <stddef.h> /* ptrdiff_t */
31 #include <limits.h> /* INT_MAX */
32
33 #if HAVE_SYS_TYPES_H
34 # include <sys/types.h>
35 #endif
36
37 #if HAVE_STDINT_H
38 # include <stdint.h> /* uintptr_t, maybe */
39 #endif
40
41 #if HAVE_INTTYPES_H
42 # include <inttypes.h> /* uintptr_t, maybe */
43 #endif
44
45 #ifdef __cplusplus
46 extern "C"
47 {
48 #endif /* __cplusplus */
49
50 /* Windows annoyances -- since tests/hook.c uses some internal
51 FFTW functions, we need to given them the dllexport attribute
52 under Windows when compiling as a DLL (see api/fftw3.h). */
53 #if defined(FFTW_EXTERN)
54 # define IFFTW_EXTERN FFTW_EXTERN
55 #elif (defined(FFTW_DLL) || defined(DLL_EXPORT)) \
56 && (defined(_WIN32) || defined(__WIN32__))
57 # define IFFTW_EXTERN extern __declspec(dllexport)
58 #else
59 # define IFFTW_EXTERN extern
60 #endif
61
62 /* determine precision and name-mangling scheme */
63 #define CONCAT(prefix, name) prefix ## name
64 #if defined(FFTW_SINGLE)
65 typedef float R;
66 # define X(name) CONCAT(fftwf_, name)
67 #elif defined(FFTW_LDOUBLE)
68 typedef long double R;
69 # define X(name) CONCAT(fftwl_, name)
70 # define TRIGREAL_IS_LONG_DOUBLE
71 #elif defined(FFTW_QUAD)
72 typedef __float128 R;
73 # define X(name) CONCAT(fftwq_, name)
74 # define TRIGREAL_IS_QUAD
75 #else
76 typedef double R;
77 # define X(name) CONCAT(fftw_, name)
78 #endif
79
80 /*
81 integral type large enough to contain a stride (what ``int'' should
82 have been in the first place.
83 */
84 typedef ptrdiff_t INT;
85
86 /* dummy use of unused parameters to silence compiler warnings */
87 #define UNUSED(x) (void)x
88
89 #define NELEM(array) ((sizeof(array) / sizeof((array)[0])))
90
91 #define FFT_SIGN (-1) /* sign convention for forward transforms */
92 extern void X(extract_reim)(int sign, R *c, R **r, R **i);
93
94 #define REGISTER_SOLVER(p, s) X(solver_register)(p, s)
95
96 #define STRINGIZEx(x) #x
97 #define STRINGIZE(x) STRINGIZEx(x)
98 #define CIMPLIES(ante, post) (!(ante) || (post))
99
100 /* define HAVE_SIMD if any simd extensions are supported */
101 #if defined(HAVE_SSE) || defined(HAVE_SSE2) || \
102 defined(HAVE_AVX) || defined(HAVE_AVX_128_FMA) || \
103 defined(HAVE_AVX2) || defined(HAVE_AVX512) || \
104 defined(HAVE_KCVI) || \
105 defined(HAVE_ALTIVEC) || defined(HAVE_VSX) || \
106 defined(HAVE_MIPS_PS) || \
107 defined(HAVE_GENERIC_SIMD128) || defined(HAVE_GENERIC_SIMD256)
108 #define HAVE_SIMD 1
109 #else
110 #define HAVE_SIMD 0
111 #endif
112
113 extern int X(have_simd_sse2)(void);
114 extern int X(have_simd_avx)(void);
115 extern int X(have_simd_avx_128_fma)(void);
116 extern int X(have_simd_avx2)(void);
117 extern int X(have_simd_avx2_128)(void);
118 extern int X(have_simd_avx512)(void);
119 extern int X(have_simd_altivec)(void);
120 extern int X(have_simd_vsx)(void);
121 extern int X(have_simd_neon)(void);
122
123 /* forward declarations */
124 typedef struct problem_s problem;
125 typedef struct plan_s plan;
126 typedef struct solver_s solver;
127 typedef struct planner_s planner;
128 typedef struct printer_s printer;
129 typedef struct scanner_s scanner;
130
131 /*-----------------------------------------------------------------------*/
132 /* alloca: */
133 #if HAVE_SIMD
134 # if defined(HAVE_KCVI) || defined(HAVE_AVX512)
135 # define MIN_ALIGNMENT 64
136 # elif defined(HAVE_AVX) || defined(HAVE_AVX2) || defined(HAVE_GENERIC_SIMD256)
137 # define MIN_ALIGNMENT 32 /* best alignment for AVX, conservative for
138 * everything else */
139 # else
140 /* Note that we cannot use 32-byte alignment for all SIMD. For
141 example, MacOS X malloc is 16-byte aligned, but there was no
142 posix_memalign in MacOS X until version 10.6. */
143 # define MIN_ALIGNMENT 16
144 # endif
145 #endif
146
147 #if defined(HAVE_ALLOCA) && defined(FFTW_ENABLE_ALLOCA)
148 /* use alloca if available */
149
150 #ifndef alloca
151 #ifdef __GNUC__
152 # define alloca __builtin_alloca
153 #else
154 # ifdef _MSC_VER
155 # include <malloc.h>
156 # define alloca _alloca
157 # else
158 # if HAVE_ALLOCA_H
159 # include <alloca.h>
160 # else
161 # ifdef _AIX
162 #pragma alloca
163 # else
164 # ifndef alloca /* predefined by HP cc +Olibcalls */
165 void *alloca(size_t);
166 # endif
167 # endif
168 # endif
169 # endif
170 #endif
171 #endif
172
173 # ifdef MIN_ALIGNMENT
174 # define STACK_MALLOC(T, p, n) \
175 { \
176 p = (T)alloca((n) + MIN_ALIGNMENT); \
177 p = (T)(((uintptr_t)p + (MIN_ALIGNMENT - 1)) & \
178 (~(uintptr_t)(MIN_ALIGNMENT - 1))); \
179 }
180 # define STACK_FREE(n)
181 # else /* HAVE_ALLOCA && !defined(MIN_ALIGNMENT) */
182 # define STACK_MALLOC(T, p, n) p = (T)alloca(n)
183 # define STACK_FREE(n)
184 # endif
185
186 #else /* ! HAVE_ALLOCA */
187 /* use malloc instead of alloca */
188 # define STACK_MALLOC(T, p, n) p = (T)MALLOC(n, OTHER)
189 # define STACK_FREE(n) X(ifree)(n)
190 #endif /* ! HAVE_ALLOCA */
191
192 /* allocation of buffers. If these grow too large use malloc(), else
193 use STACK_MALLOC (hopefully reducing to alloca()). */
194
195 /* 64KiB ought to be enough for anybody */
196 #define MAX_STACK_ALLOC ((size_t)64 * 1024)
197
198 #define BUF_ALLOC(T, p, n) \
199 { \
200 if (n < MAX_STACK_ALLOC) { \
201 STACK_MALLOC(T, p, n); \
202 } else { \
203 p = (T)MALLOC(n, BUFFERS); \
204 } \
205 }
206
207 #define BUF_FREE(p, n) \
208 { \
209 if (n < MAX_STACK_ALLOC) { \
210 STACK_FREE(p); \
211 } else { \
212 X(ifree)(p); \
213 } \
214 }
215
216 /*-----------------------------------------------------------------------*/
217 /* define uintptr_t if it is not already defined */
218
219 #ifndef HAVE_UINTPTR_T
220 # if SIZEOF_VOID_P == 0
221 # error sizeof void* is unknown!
222 # elif SIZEOF_UNSIGNED_INT == SIZEOF_VOID_P
223 typedef unsigned int uintptr_t;
224 # elif SIZEOF_UNSIGNED_LONG == SIZEOF_VOID_P
225 typedef unsigned long uintptr_t;
226 # elif SIZEOF_UNSIGNED_LONG_LONG == SIZEOF_VOID_P
227 typedef unsigned long long uintptr_t;
228 # else
229 # error no unsigned integer type matches void* sizeof!
230 # endif
231 #endif
232
233 /*-----------------------------------------------------------------------*/
234 /* We can do an optimization for copying pairs of (aligned) floats
235 when in single precision if 2*float = double. */
236
237 #define FFTW_2R_IS_DOUBLE (defined(FFTW_SINGLE) \
238 && SIZEOF_FLOAT != 0 \
239 && SIZEOF_DOUBLE == 2*SIZEOF_FLOAT)
240
241 #define DOUBLE_ALIGNED(p) ((((uintptr_t)(p)) % sizeof(double)) == 0)
242
243 /*-----------------------------------------------------------------------*/
244 /* assert.c: */
245 IFFTW_EXTERN void X(assertion_failed)(const char *s,
246 int line, const char *file);
247
248 /* always check */
249 #define CK(ex) \
250 (void)((ex) || (X(assertion_failed)(#ex, __LINE__, __FILE__), 0))
251
252 #ifdef FFTW_DEBUG
253 /* check only if debug enabled */
254 #define A(ex) \
255 (void)((ex) || (X(assertion_failed)(#ex, __LINE__, __FILE__), 0))
256 #else
257 #define A(ex) /* nothing */
258 #endif
259
260 extern void X(debug)(const char *format, ...);
261 #define D X(debug)
262
263 /*-----------------------------------------------------------------------*/
264 /* kalloc.c: */
265 extern void *X(kernel_malloc)(size_t n);
266 extern void X(kernel_free)(void *p);
267
268 /*-----------------------------------------------------------------------*/
269 /* alloc.c: */
270
271 /* objects allocated by malloc, for statistical purposes */
272 enum malloc_tag {
273 EVERYTHING,
274 PLANS,
275 SOLVERS,
276 PROBLEMS,
277 BUFFERS,
278 HASHT,
279 TENSORS,
280 PLANNERS,
281 SLVDESCS,
282 TWIDDLES,
283 STRIDES,
284 OTHER,
285 MALLOC_WHAT_LAST /* must be last */
286 };
287
288 IFFTW_EXTERN void X(ifree)(void *ptr);
289 extern void X(ifree0)(void *ptr);
290
291 #ifdef FFTW_DEBUG_MALLOC
292
293 IFFTW_EXTERN void *X(malloc_debug)(size_t n, enum malloc_tag what,
294 const char *file, int line);
295 #define MALLOC(n, what) X(malloc_debug)(n, what, __FILE__, __LINE__)
296 IFFTW_EXTERN void X(malloc_print_minfo)(int vrbose);
297
298 #else /* ! FFTW_DEBUG_MALLOC */
299
300 IFFTW_EXTERN void *X(malloc_plain)(size_t sz);
301 #define MALLOC(n, what) X(malloc_plain)(n)
302
303 #endif
304
305 #if defined(FFTW_DEBUG) && defined(FFTW_DEBUG_MALLOC) && (defined(HAVE_THREADS) || defined(HAVE_OPENMP))
306 extern int X(in_thread);
307 # define IN_THREAD X(in_thread)
308 # define THREAD_ON { int in_thread_save = X(in_thread); X(in_thread) = 1
309 # define THREAD_OFF X(in_thread) = in_thread_save; }
310 #else
311 # define IN_THREAD 0
312 # define THREAD_ON
313 # define THREAD_OFF
314 #endif
315
316 /*-----------------------------------------------------------------------*/
317 /* low-resolution clock */
318
319 #ifdef FAKE_CRUDE_TIME
320 typedef int crude_time;
321 #else
322 # if TIME_WITH_SYS_TIME
323 # include <sys/time.h>
324 # include <time.h>
325 # else
326 # if HAVE_SYS_TIME_H
327 # include <sys/time.h>
328 # else
329 # include <time.h>
330 # endif
331 # endif
332
333 # ifdef HAVE_BSDGETTIMEOFDAY
334 # ifndef HAVE_GETTIMEOFDAY
335 # define gettimeofday BSDgettimeofday
336 # define HAVE_GETTIMEOFDAY 1
337 # endif
338 # endif
339
340 # if defined(HAVE_GETTIMEOFDAY)
341 typedef struct timeval crude_time;
342 # else
343 typedef clock_t crude_time;
344 # endif
345 #endif /* else FAKE_CRUDE_TIME */
346
347 crude_time X(get_crude_time)(void);
348 double X(elapsed_since)(const planner *plnr, const problem *p,
349 crude_time t0); /* time in seconds since t0 */
350
351 /*-----------------------------------------------------------------------*/
352 /* ops.c: */
353 /*
354 * ops counter. The total number of additions is add + fma
355 * and the total number of multiplications is mul + fma.
356 * Total flops = add + mul + 2 * fma
357 */
358 typedef struct {
359 double add;
360 double mul;
361 double fma;
362 double other;
363 } opcnt;
364
365 void X(ops_zero)(opcnt *dst);
366 void X(ops_other)(INT o, opcnt *dst);
367 void X(ops_cpy)(const opcnt *src, opcnt *dst);
368
369 void X(ops_add)(const opcnt *a, const opcnt *b, opcnt *dst);
370 void X(ops_add2)(const opcnt *a, opcnt *dst);
371
372 /* dst = m * a + b */
373 void X(ops_madd)(INT m, const opcnt *a, const opcnt *b, opcnt *dst);
374
375 /* dst += m * a */
376 void X(ops_madd2)(INT m, const opcnt *a, opcnt *dst);
377
378
379 /*-----------------------------------------------------------------------*/
380 /* minmax.c: */
381 INT X(imax)(INT a, INT b);
382 INT X(imin)(INT a, INT b);
383
384 /*-----------------------------------------------------------------------*/
385 /* iabs.c: */
386 INT X(iabs)(INT a);
387
388 /* inline version */
389 #define IABS(x) (((x) < 0) ? (0 - (x)) : (x))
390
391 /*-----------------------------------------------------------------------*/
392 /* md5.c */
393
394 #if SIZEOF_UNSIGNED_INT >= 4
395 typedef unsigned int md5uint;
396 #else
397 typedef unsigned long md5uint; /* at least 32 bits as per C standard */
398 #endif
399
400 typedef md5uint md5sig[4];
401
402 typedef struct {
403 md5sig s; /* state and signature */
404
405 /* fields not meant to be used outside md5.c: */
406 unsigned char c[64]; /* stuff not yet processed */
407 unsigned l; /* total length. Should be 64 bits long, but this is
408 good enough for us */
409 } md5;
410
411 void X(md5begin)(md5 *p);
412 void X(md5putb)(md5 *p, const void *d_, size_t len);
413 void X(md5puts)(md5 *p, const char *s);
414 void X(md5putc)(md5 *p, unsigned char c);
415 void X(md5int)(md5 *p, int i);
416 void X(md5INT)(md5 *p, INT i);
417 void X(md5unsigned)(md5 *p, unsigned i);
418 void X(md5end)(md5 *p);
419
420 /*-----------------------------------------------------------------------*/
421 /* tensor.c: */
422 #define STRUCT_HACK_KR
423 #undef STRUCT_HACK_C99
424
425 typedef struct {
426 INT n;
427 INT is; /* input stride */
428 INT os; /* output stride */
429 } iodim;
430
431 typedef struct {
432 int rnk;
433 #if defined(STRUCT_HACK_KR)
434 iodim dims[1];
435 #elif defined(STRUCT_HACK_C99)
436 iodim dims[];
437 #else
438 iodim *dims;
439 #endif
440 } tensor;
441
442 /*
443 Definition of rank -infinity.
444 This definition has the property that if you want rank 0 or 1,
445 you can simply test for rank <= 1. This is a common case.
446
447 A tensor of rank -infinity has size 0.
448 */
449 #define RNK_MINFTY INT_MAX
450 #define FINITE_RNK(rnk) ((rnk) != RNK_MINFTY)
451
452 typedef enum { INPLACE_IS, INPLACE_OS } inplace_kind;
453
454 tensor *X(mktensor)(int rnk);
455 tensor *X(mktensor_0d)(void);
456 tensor *X(mktensor_1d)(INT n, INT is, INT os);
457 tensor *X(mktensor_2d)(INT n0, INT is0, INT os0,
458 INT n1, INT is1, INT os1);
459 tensor *X(mktensor_3d)(INT n0, INT is0, INT os0,
460 INT n1, INT is1, INT os1,
461 INT n2, INT is2, INT os2);
462 tensor *X(mktensor_4d)(INT n0, INT is0, INT os0,
463 INT n1, INT is1, INT os1,
464 INT n2, INT is2, INT os2,
465 INT n3, INT is3, INT os3);
466 tensor *X(mktensor_5d)(INT n0, INT is0, INT os0,
467 INT n1, INT is1, INT os1,
468 INT n2, INT is2, INT os2,
469 INT n3, INT is3, INT os3,
470 INT n4, INT is4, INT os4);
471 INT X(tensor_sz)(const tensor *sz);
472 void X(tensor_md5)(md5 *p, const tensor *t);
473 INT X(tensor_max_index)(const tensor *sz);
474 INT X(tensor_min_istride)(const tensor *sz);
475 INT X(tensor_min_ostride)(const tensor *sz);
476 INT X(tensor_min_stride)(const tensor *sz);
477 int X(tensor_inplace_strides)(const tensor *sz);
478 int X(tensor_inplace_strides2)(const tensor *a, const tensor *b);
479 int X(tensor_strides_decrease)(const tensor *sz, const tensor *vecsz,
480 inplace_kind k);
481 tensor *X(tensor_copy)(const tensor *sz);
482 int X(tensor_kosherp)(const tensor *x);
483
484 tensor *X(tensor_copy_inplace)(const tensor *sz, inplace_kind k);
485 tensor *X(tensor_copy_except)(const tensor *sz, int except_dim);
486 tensor *X(tensor_copy_sub)(const tensor *sz, int start_dim, int rnk);
487 tensor *X(tensor_compress)(const tensor *sz);
488 tensor *X(tensor_compress_contiguous)(const tensor *sz);
489 tensor *X(tensor_append)(const tensor *a, const tensor *b);
490 void X(tensor_split)(const tensor *sz, tensor **a, int a_rnk, tensor **b);
491 int X(tensor_tornk1)(const tensor *t, INT *n, INT *is, INT *os);
492 void X(tensor_destroy)(tensor *sz);
493 void X(tensor_destroy2)(tensor *a, tensor *b);
494 void X(tensor_destroy4)(tensor *a, tensor *b, tensor *c, tensor *d);
495 void X(tensor_print)(const tensor *sz, printer *p);
496 int X(dimcmp)(const iodim *a, const iodim *b);
497 int X(tensor_equal)(const tensor *a, const tensor *b);
498 int X(tensor_inplace_locations)(const tensor *sz, const tensor *vecsz);
499
500 /*-----------------------------------------------------------------------*/
501 /* problem.c: */
502 enum {
503 /* a problem that cannot be solved */
504 PROBLEM_UNSOLVABLE,
505
506 PROBLEM_DFT,
507 PROBLEM_RDFT,
508 PROBLEM_RDFT2,
509
510 /* for mpi/ subdirectory */
511 PROBLEM_MPI_DFT,
512 PROBLEM_MPI_RDFT,
513 PROBLEM_MPI_RDFT2,
514 PROBLEM_MPI_TRANSPOSE,
515
516 PROBLEM_LAST
517 };
518
519 typedef struct {
520 int problem_kind;
521 void (*hash) (const problem *ego, md5 *p);
522 void (*zero) (const problem *ego);
523 void (*print) (const problem *ego, printer *p);
524 void (*destroy) (problem *ego);
525 } problem_adt;
526
527 struct problem_s {
528 const problem_adt *adt;
529 };
530
531 problem *X(mkproblem)(size_t sz, const problem_adt *adt);
532 void X(problem_destroy)(problem *ego);
533 problem *X(mkproblem_unsolvable)(void);
534
535 /*-----------------------------------------------------------------------*/
536 /* print.c */
537 struct printer_s {
538 void (*print)(printer *p, const char *format, ...);
539 void (*vprint)(printer *p, const char *format, va_list ap);
540 void (*putchr)(printer *p, char c);
541 void (*cleanup)(printer *p);
542 int indent;
543 int indent_incr;
544 };
545
546 printer *X(mkprinter)(size_t size,
547 void (*putchr)(printer *p, char c),
548 void (*cleanup)(printer *p));
549 IFFTW_EXTERN void X(printer_destroy)(printer *p);
550
551 /*-----------------------------------------------------------------------*/
552 /* scan.c */
553 struct scanner_s {
554 int (*scan)(scanner *sc, const char *format, ...);
555 int (*vscan)(scanner *sc, const char *format, va_list ap);
556 int (*getchr)(scanner *sc);
557 int ungotc;
558 };
559
560 scanner *X(mkscanner)(size_t size, int (*getchr)(scanner *sc));
561 void X(scanner_destroy)(scanner *sc);
562
563 /*-----------------------------------------------------------------------*/
564 /* plan.c: */
565
566 enum wakefulness {
567 SLEEPY,
568 AWAKE_ZERO,
569 AWAKE_SQRTN_TABLE,
570 AWAKE_SINCOS
571 };
572
573 typedef struct {
574 void (*solve)(const plan *ego, const problem *p);
575 void (*awake)(plan *ego, enum wakefulness wakefulness);
576 void (*print)(const plan *ego, printer *p);
577 void (*destroy)(plan *ego);
578 } plan_adt;
579
580 struct plan_s {
581 const plan_adt *adt;
582 opcnt ops;
583 double pcost;
584 enum wakefulness wakefulness; /* used for debugging only */
585 int could_prune_now_p;
586 };
587
588 plan *X(mkplan)(size_t size, const plan_adt *adt);
589 void X(plan_destroy_internal)(plan *ego);
590 IFFTW_EXTERN void X(plan_awake)(plan *ego, enum wakefulness wakefulness);
591 void X(plan_null_destroy)(plan *ego);
592
593 /*-----------------------------------------------------------------------*/
594 /* solver.c: */
595 typedef struct {
596 int problem_kind;
597 plan *(*mkplan)(const solver *ego, const problem *p, planner *plnr);
598 void (*destroy)(solver *ego);
599 } solver_adt;
600
601 struct solver_s {
602 const solver_adt *adt;
603 int refcnt;
604 };
605
606 solver *X(mksolver)(size_t size, const solver_adt *adt);
607 void X(solver_use)(solver *ego);
608 void X(solver_destroy)(solver *ego);
609 void X(solver_register)(planner *plnr, solver *s);
610
611 /* shorthand */
612 #define MKSOLVER(type, adt) (type *)X(mksolver)(sizeof(type), adt)
613
614 /*-----------------------------------------------------------------------*/
615 /* planner.c */
616
617 typedef struct slvdesc_s {
618 solver *slv;
619 const char *reg_nam;
620 unsigned nam_hash;
621 int reg_id;
622 int next_for_same_problem_kind;
623 } slvdesc;
624
625 typedef struct solution_s solution; /* opaque */
626
627 /* interpretation of L and U:
628
629 - if it returns a plan, the planner guarantees that all applicable
630 plans at least as impatient as U have been tried, and that each
631 plan in the solution is at least as impatient as L.
632
633 - if it returns 0, the planner guarantees to have tried all solvers
634 at least as impatient as L, and that none of them was applicable.
635
636 The structure is packed to fit into 64 bits.
637 */
638
639 typedef struct {
640 unsigned l:20;
641 unsigned hash_info:3;
642 # define BITS_FOR_TIMELIMIT 9
643 unsigned timelimit_impatience:BITS_FOR_TIMELIMIT;
644 unsigned u:20;
645
646 /* abstraction break: we store the solver here to pad the
647 structure to 64 bits. Otherwise, the struct is padded to 64
648 bits anyway, and another word is allocated for slvndx. */
649 # define BITS_FOR_SLVNDX 12
650 unsigned slvndx:BITS_FOR_SLVNDX;
651 } flags_t;
652
653 /* impatience flags */
654 enum {
655 BELIEVE_PCOST = 0x0001,
656 ESTIMATE = 0x0002,
657 NO_DFT_R2HC = 0x0004,
658 NO_SLOW = 0x0008,
659 NO_VRECURSE = 0x0010,
660 NO_INDIRECT_OP = 0x0020,
661 NO_LARGE_GENERIC = 0x0040,
662 NO_RANK_SPLITS = 0x0080,
663 NO_VRANK_SPLITS = 0x0100,
664 NO_NONTHREADED = 0x0200,
665 NO_BUFFERING = 0x0400,
666 NO_FIXED_RADIX_LARGE_N = 0x0800,
667 NO_DESTROY_INPUT = 0x1000,
668 NO_SIMD = 0x2000,
669 CONSERVE_MEMORY = 0x4000,
670 NO_DHT_R2HC = 0x8000,
671 NO_UGLY = 0x10000,
672 ALLOW_PRUNING = 0x20000
673 };
674
675 /* hashtable information */
676 enum {
677 BLESSING = 0x1u, /* save this entry */
678 H_VALID = 0x2u, /* valid hastable entry */
679 H_LIVE = 0x4u /* entry is nonempty, implies H_VALID */
680 };
681
682 #define PLNR_L(plnr) ((plnr)->flags.l)
683 #define PLNR_U(plnr) ((plnr)->flags.u)
684 #define PLNR_TIMELIMIT_IMPATIENCE(plnr) ((plnr)->flags.timelimit_impatience)
685
686 #define ESTIMATEP(plnr) (PLNR_U(plnr) & ESTIMATE)
687 #define BELIEVE_PCOSTP(plnr) (PLNR_U(plnr) & BELIEVE_PCOST)
688 #define ALLOW_PRUNINGP(plnr) (PLNR_U(plnr) & ALLOW_PRUNING)
689
690 #define NO_INDIRECT_OP_P(plnr) (PLNR_L(plnr) & NO_INDIRECT_OP)
691 #define NO_LARGE_GENERICP(plnr) (PLNR_L(plnr) & NO_LARGE_GENERIC)
692 #define NO_RANK_SPLITSP(plnr) (PLNR_L(plnr) & NO_RANK_SPLITS)
693 #define NO_VRANK_SPLITSP(plnr) (PLNR_L(plnr) & NO_VRANK_SPLITS)
694 #define NO_VRECURSEP(plnr) (PLNR_L(plnr) & NO_VRECURSE)
695 #define NO_DFT_R2HCP(plnr) (PLNR_L(plnr) & NO_DFT_R2HC)
696 #define NO_SLOWP(plnr) (PLNR_L(plnr) & NO_SLOW)
697 #define NO_UGLYP(plnr) (PLNR_L(plnr) & NO_UGLY)
698 #define NO_FIXED_RADIX_LARGE_NP(plnr) \
699 (PLNR_L(plnr) & NO_FIXED_RADIX_LARGE_N)
700 #define NO_NONTHREADEDP(plnr) \
701 ((PLNR_L(plnr) & NO_NONTHREADED) && (plnr)->nthr > 1)
702
703 #define NO_DESTROY_INPUTP(plnr) (PLNR_L(plnr) & NO_DESTROY_INPUT)
704 #define NO_SIMDP(plnr) (PLNR_L(plnr) & NO_SIMD)
705 #define CONSERVE_MEMORYP(plnr) (PLNR_L(plnr) & CONSERVE_MEMORY)
706 #define NO_DHT_R2HCP(plnr) (PLNR_L(plnr) & NO_DHT_R2HC)
707 #define NO_BUFFERINGP(plnr) (PLNR_L(plnr) & NO_BUFFERING)
708
709 typedef enum { FORGET_ACCURSED, FORGET_EVERYTHING } amnesia;
710
711 typedef enum {
712 /* WISDOM_NORMAL: planner may or may not use wisdom */
713 WISDOM_NORMAL,
714
715 /* WISDOM_ONLY: planner must use wisdom and must avoid searching */
716 WISDOM_ONLY,
717
718 /* WISDOM_IS_BOGUS: planner must return 0 as quickly as possible */
719 WISDOM_IS_BOGUS,
720
721 /* WISDOM_IGNORE_INFEASIBLE: planner ignores infeasible wisdom */
722 WISDOM_IGNORE_INFEASIBLE,
723
724 /* WISDOM_IGNORE_ALL: planner ignores all */
725 WISDOM_IGNORE_ALL
726 } wisdom_state_t;
727
728 typedef struct {
729 void (*register_solver)(planner *ego, solver *s);
730 plan *(*mkplan)(planner *ego, const problem *p);
731 void (*forget)(planner *ego, amnesia a);
732 void (*exprt)(planner *ego, printer *p); /* ``export'' is a reserved
733 word in C++. */
734 int (*imprt)(planner *ego, scanner *sc);
735 } planner_adt;
736
737 /* hash table of solutions */
738 typedef struct {
739 solution *solutions;
740 unsigned hashsiz, nelem;
741
742 /* statistics */
743 int lookup, succ_lookup, lookup_iter;
744 int insert, insert_iter, insert_unknown;
745 int nrehash;
746 } hashtab;
747
748 typedef enum { COST_SUM, COST_MAX } cost_kind;
749
750 struct planner_s {
751 const planner_adt *adt;
752 void (*hook)(struct planner_s *plnr, plan *pln,
753 const problem *p, int optimalp);
754 double (*cost_hook)(const problem *p, double t, cost_kind k);
755 int (*wisdom_ok_hook)(const problem *p, flags_t flags);
756 void (*nowisdom_hook)(const problem *p);
757 wisdom_state_t (*bogosity_hook)(wisdom_state_t state, const problem *p);
758
759 /* solver descriptors */
760 slvdesc *slvdescs;
761 unsigned nslvdesc, slvdescsiz;
762 const char *cur_reg_nam;
763 int cur_reg_id;
764 int slvdescs_for_problem_kind[PROBLEM_LAST];
765
766 wisdom_state_t wisdom_state;
767
768 hashtab htab_blessed;
769 hashtab htab_unblessed;
770
771 int nthr;
772 flags_t flags;
773
774 crude_time start_time;
775 double timelimit; /* elapsed_since(start_time) at which to bail out */
776 int timed_out; /* whether most recent search timed out */
777 int need_timeout_check;
778
779 /* various statistics */
780 int nplan; /* number of plans evaluated */
781 double pcost, epcost; /* total pcost of measured/estimated plans */
782 int nprob; /* number of problems evaluated */
783 };
784
785 planner *X(mkplanner)(void);
786 void X(planner_destroy)(planner *ego);
787
788 /*
789 Iterate over all solvers. Read:
790
791 @article{ baker93iterators,
792 author = "Henry G. Baker, Jr.",
793 title = "Iterators: Signs of Weakness in Object-Oriented Languages",
794 journal = "{ACM} {OOPS} Messenger",
795 volume = "4",
796 number = "3",
797 pages = "18--25"
798 }
799 */
800 #define FORALL_SOLVERS(ego, s, p, what) \
801 { \
802 unsigned _cnt; \
803 for (_cnt = 0; _cnt < ego->nslvdesc; ++_cnt) { \
804 slvdesc *p = ego->slvdescs + _cnt; \
805 solver *s = p->slv; \
806 what; \
807 } \
808 }
809
810 #define FORALL_SOLVERS_OF_KIND(kind, ego, s, p, what) \
811 { \
812 int _cnt = ego->slvdescs_for_problem_kind[kind]; \
813 while (_cnt >= 0) { \
814 slvdesc *p = ego->slvdescs + _cnt; \
815 solver *s = p->slv; \
816 what; \
817 _cnt = p->next_for_same_problem_kind; \
818 } \
819 }
820
821
822 /* make plan, destroy problem */
823 plan *X(mkplan_d)(planner *ego, problem *p);
824 plan *X(mkplan_f_d)(planner *ego, problem *p,
825 unsigned l_set, unsigned u_set, unsigned u_reset);
826
827 /*-----------------------------------------------------------------------*/
828 /* stride.c: */
829
830 /* If PRECOMPUTE_ARRAY_INDICES is defined, precompute all strides. */
831 #if (defined(__i386__) || defined(__x86_64__) || _M_IX86 >= 500) && !defined(FFTW_LDOUBLE)
832 #define PRECOMPUTE_ARRAY_INDICES
833 #endif
834
835 extern const INT X(an_INT_guaranteed_to_be_zero);
836
837 #ifdef PRECOMPUTE_ARRAY_INDICES
838 typedef INT *stride;
839 #define WS(stride, i) (stride[i])
840 extern stride X(mkstride)(INT n, INT s);
841 void X(stride_destroy)(stride p);
842 /* hackery to prevent the compiler from copying the strides array
843 onto the stack */
844 #define MAKE_VOLATILE_STRIDE(nptr, x) (x) = (x) + X(an_INT_guaranteed_to_be_zero)
845 #else
846
847 typedef INT stride;
848 #define WS(stride, i) (stride * i)
849 #define fftwf_mkstride(n, stride) stride
850 #define fftw_mkstride(n, stride) stride
851 #define fftwl_mkstride(n, stride) stride
852 #define fftwf_stride_destroy(p) ((void) p)
853 #define fftw_stride_destroy(p) ((void) p)
854 #define fftwl_stride_destroy(p) ((void) p)
855
856 /* hackery to prevent the compiler from ``optimizing'' induction
857 variables in codelet loops. The problem is that for each K and for
858 each expression of the form P[I + STRIDE * K] in a loop, most
859 compilers will try to lift an induction variable PK := &P[I + STRIDE * K].
860 For large values of K this behavior overflows the
861 register set, which is likely worse than doing the index computation
862 in the first place.
863
864 If we guess that there are more than
865 ESTIMATED_AVAILABLE_INDEX_REGISTERS such pointers, we deliberately confuse
866 the compiler by setting STRIDE ^= ZERO, where ZERO is a value guaranteed to
867 be 0, but the compiler does not know this.
868
869 16 registers ought to be enough for anybody, or so the amd64 and ARM ISA's
870 seem to imply.
871 */
872 #define ESTIMATED_AVAILABLE_INDEX_REGISTERS 16
873 #define MAKE_VOLATILE_STRIDE(nptr, x) \
874 (nptr <= ESTIMATED_AVAILABLE_INDEX_REGISTERS ? \
875 0 : \
876 ((x) = (x) ^ X(an_INT_guaranteed_to_be_zero)))
877 #endif /* PRECOMPUTE_ARRAY_INDICES */
878
879 /*-----------------------------------------------------------------------*/
880 /* solvtab.c */
881
882 struct solvtab_s { void (*reg)(planner *); const char *reg_nam; };
883 typedef struct solvtab_s solvtab[];
884 void X(solvtab_exec)(const solvtab tbl, planner *p);
885 #define SOLVTAB(s) { s, STRINGIZE(s) }
886 #define SOLVTAB_END { 0, 0 }
887
888 /*-----------------------------------------------------------------------*/
889 /* pickdim.c */
890 int X(pickdim)(int which_dim, const int *buddies, size_t nbuddies,
891 const tensor *sz, int oop, int *dp);
892
893 /*-----------------------------------------------------------------------*/
894 /* twiddle.c */
895 /* little language to express twiddle factors computation */
896 enum { TW_COS = 0, TW_SIN = 1, TW_CEXP = 2, TW_NEXT = 3,
897 TW_FULL = 4, TW_HALF = 5 };
898
899 typedef struct {
900 unsigned char op;
901 signed char v;
902 short i;
903 } tw_instr;
904
905 typedef struct twid_s {
906 R *W; /* array of twiddle factors */
907 INT n, r, m; /* transform order, radix, # twiddle rows */
908 int refcnt;
909 const tw_instr *instr;
910 struct twid_s *cdr;
911 enum wakefulness wakefulness;
912 } twid;
913
914 INT X(twiddle_length)(INT r, const tw_instr *p);
915 void X(twiddle_awake)(enum wakefulness wakefulness,
916 twid **pp, const tw_instr *instr, INT n, INT r, INT m);
917
918 /*-----------------------------------------------------------------------*/
919 /* trig.c */
920 #if defined(TRIGREAL_IS_LONG_DOUBLE)
921 typedef long double trigreal;
922 #elif defined(TRIGREAL_IS_QUAD)
923 typedef __float128 trigreal;
924 #else
925 typedef double trigreal;
926 #endif
927
928 typedef struct triggen_s triggen;
929
930 struct triggen_s {
931 void (*cexp)(triggen *t, INT m, R *result);
932 void (*cexpl)(triggen *t, INT m, trigreal *result);
933 void (*rotate)(triggen *p, INT m, R xr, R xi, R *res);
934
935 INT twshft;
936 INT twradix;
937 INT twmsk;
938 trigreal *W0, *W1;
939 INT n;
940 };
941
942 triggen *X(mktriggen)(enum wakefulness wakefulness, INT n);
943 void X(triggen_destroy)(triggen *p);
944
945 /*-----------------------------------------------------------------------*/
946 /* primes.c: */
947
948 #define MULMOD(x, y, p) \
949 (((x) <= 92681 - (y)) ? ((x) * (y)) % (p) : X(safe_mulmod)(x, y, p))
950
951 INT X(safe_mulmod)(INT x, INT y, INT p);
952 INT X(power_mod)(INT n, INT m, INT p);
953 INT X(find_generator)(INT p);
954 INT X(first_divisor)(INT n);
955 int X(is_prime)(INT n);
956 INT X(next_prime)(INT n);
957 int X(factors_into)(INT n, const INT *primes);
958 int X(factors_into_small_primes)(INT n);
959 INT X(choose_radix)(INT r, INT n);
960 INT X(isqrt)(INT n);
961 INT X(modulo)(INT a, INT n);
962
963 #define GENERIC_MIN_BAD 173 /* min prime for which generic becomes bad */
964
965 /* thresholds below which certain solvers are considered SLOW. These are guesses
966 believed to be conservative */
967 #define GENERIC_MAX_SLOW 16
968 #define RADER_MAX_SLOW 32
969 #define BLUESTEIN_MAX_SLOW 24
970
971 /*-----------------------------------------------------------------------*/
972 /* rader.c: */
973 typedef struct rader_tls rader_tl;
974
975 void X(rader_tl_insert)(INT k1, INT k2, INT k3, R *W, rader_tl **tl);
976 R *X(rader_tl_find)(INT k1, INT k2, INT k3, rader_tl *t);
977 void X(rader_tl_delete)(R *W, rader_tl **tl);
978
979 /*-----------------------------------------------------------------------*/
980 /* copy/transposition routines */
981
982 /* lower bound to the cache size, for tiled routines */
983 #define CACHESIZE 8192
984
985 INT X(compute_tilesz)(INT vl, int how_many_tiles_in_cache);
986
987 void X(tile2d)(INT n0l, INT n0u, INT n1l, INT n1u, INT tilesz,
988 void (*f)(INT n0l, INT n0u, INT n1l, INT n1u, void *args),
989 void *args);
990 void X(cpy1d)(R *I, R *O, INT n0, INT is0, INT os0, INT vl);
991 void X(zero1d_pair)(R *O0, R *O1, INT n0, INT os0);
992 void X(cpy2d)(R *I, R *O,
993 INT n0, INT is0, INT os0,
994 INT n1, INT is1, INT os1,
995 INT vl);
996 void X(cpy2d_ci)(R *I, R *O,
997 INT n0, INT is0, INT os0,
998 INT n1, INT is1, INT os1,
999 INT vl);
1000 void X(cpy2d_co)(R *I, R *O,
1001 INT n0, INT is0, INT os0,
1002 INT n1, INT is1, INT os1,
1003 INT vl);
1004 void X(cpy2d_tiled)(R *I, R *O,
1005 INT n0, INT is0, INT os0,
1006 INT n1, INT is1, INT os1,
1007 INT vl);
1008 void X(cpy2d_tiledbuf)(R *I, R *O,
1009 INT n0, INT is0, INT os0,
1010 INT n1, INT is1, INT os1,
1011 INT vl);
1012 void X(cpy2d_pair)(R *I0, R *I1, R *O0, R *O1,
1013 INT n0, INT is0, INT os0,
1014 INT n1, INT is1, INT os1);
1015 void X(cpy2d_pair_ci)(R *I0, R *I1, R *O0, R *O1,
1016 INT n0, INT is0, INT os0,
1017 INT n1, INT is1, INT os1);
1018 void X(cpy2d_pair_co)(R *I0, R *I1, R *O0, R *O1,
1019 INT n0, INT is0, INT os0,
1020 INT n1, INT is1, INT os1);
1021
1022 void X(transpose)(R *I, INT n, INT s0, INT s1, INT vl);
1023 void X(transpose_tiled)(R *I, INT n, INT s0, INT s1, INT vl);
1024 void X(transpose_tiledbuf)(R *I, INT n, INT s0, INT s1, INT vl);
1025
1026 typedef void (*transpose_func)(R *I, INT n, INT s0, INT s1, INT vl);
1027 typedef void (*cpy2d_func)(R *I, R *O,
1028 INT n0, INT is0, INT os0,
1029 INT n1, INT is1, INT os1,
1030 INT vl);
1031
1032 /*-----------------------------------------------------------------------*/
1033 /* misc stuff */
1034 void X(null_awake)(plan *ego, enum wakefulness wakefulness);
1035 double X(iestimate_cost)(const planner *, const plan *, const problem *);
1036
1037 #ifdef FFTW_RANDOM_ESTIMATOR
1038 extern unsigned X(random_estimate_seed);
1039 #endif
1040
1041 double X(measure_execution_time)(const planner *plnr,
1042 plan *pln, const problem *p);
1043 IFFTW_EXTERN int X(ialignment_of)(R *p);
1044 unsigned X(hash)(const char *s);
1045 INT X(nbuf)(INT n, INT vl, INT maxnbuf);
1046 int X(nbuf_redundant)(INT n, INT vl, size_t which,
1047 const INT *maxnbuf, size_t nmaxnbuf);
1048 INT X(bufdist)(INT n, INT vl);
1049 int X(toobig)(INT n);
1050 int X(ct_uglyp)(INT min_n, INT v, INT n, INT r);
1051
1052 #if HAVE_SIMD
1053 R *X(taint)(R *p, INT s);
1054 R *X(join_taint)(R *p1, R *p2);
1055 #define TAINT(p, s) X(taint)(p, s)
1056 #define UNTAINT(p) ((R *) (((uintptr_t) (p)) & ~(uintptr_t)3))
1057 #define TAINTOF(p) (((uintptr_t)(p)) & 3)
1058 #define JOIN_TAINT(p1, p2) X(join_taint)(p1, p2)
1059 #else
1060 #define TAINT(p, s) (p)
1061 #define UNTAINT(p) (p)
1062 #define TAINTOF(p) 0
1063 #define JOIN_TAINT(p1, p2) p1
1064 #endif
1065
1066 #ifdef FFTW_DEBUG_ALIGNMENT
1067 # define ASSERT_ALIGNED_DOUBLE { \
1068 double __foo; \
1069 CK(!(((uintptr_t) &__foo) & 0x7)); \
1070 }
1071 #else
1072 # define ASSERT_ALIGNED_DOUBLE
1073 #endif /* FFTW_DEBUG_ALIGNMENT */
1074
1075
1076
1077 /*-----------------------------------------------------------------------*/
1078 /* macros used in codelets to reduce source code size */
1079
1080 typedef R E; /* internal precision of codelets. */
1081
1082 #if defined(FFTW_LDOUBLE)
1083 # define K(x) ((E) x##L)
1084 #elif defined(FFTW_QUAD)
1085 # define K(x) ((E) x##Q)
1086 #else
1087 # define K(x) ((E) x)
1088 #endif
1089 #define DK(name, value) const E name = K(value)
1090
1091 /* FMA macros */
1092
1093 #if defined(__GNUC__) && (defined(__powerpc__) || defined(__ppc__) || defined(_POWER))
1094 /* The obvious expression a * b + c does not work. If both x = a * b
1095 + c and y = a * b - c appear in the source, gcc computes t = a * b,
1096 x = t + c, y = t - c, thus destroying the fma.
1097
1098 This peculiar coding seems to do the right thing on all of
1099 gcc-2.95, gcc-3.1, gcc-3.2, and gcc-3.3. It does the right thing
1100 on gcc-3.4 -fno-web (because the ``web'' pass splits the variable
1101 `x' for the single-assignment form).
1102
1103 However, gcc-4.0 is a formidable adversary which succeeds in
1104 pessimizing two fma's into one multiplication and two additions.
1105 It does it very early in the game---before the optimization passes
1106 even start. The only real workaround seems to use fake inline asm
1107 such as
1108
1109 asm ("# confuse gcc %0" : "=f"(a) : "0"(a));
1110 return a * b + c;
1111
1112 in each of the FMA, FMS, FNMA, and FNMS functions. However, this
1113 does not solve the problem either, because two equal asm statements
1114 count as a common subexpression! One must use *different* fake asm
1115 statements:
1116
1117 in FMA:
1118 asm ("# confuse gcc for fma %0" : "=f"(a) : "0"(a));
1119
1120 in FMS:
1121 asm ("# confuse gcc for fms %0" : "=f"(a) : "0"(a));
1122
1123 etc.
1124
1125 After these changes, gcc recalcitrantly generates the fma that was
1126 in the source to begin with. However, the extra asm() cruft
1127 confuses other passes of gcc, notably the instruction scheduler.
1128 (Of course, one could also generate the fma directly via inline
1129 asm, but this confuses the scheduler even more.)
1130
1131 Steven and I have submitted more than one bug report to the gcc
1132 mailing list over the past few years, to no effect. Thus, I give
1133 up. gcc-4.0 can go to hell. I'll wait at least until gcc-4.3 is
1134 out before touching this crap again.
1135 */
1136 static __inline__ E FMA(E a, E b, E c)
1137 {
1138 E x = a * b;
1139 x = x + c;
1140 return x;
1141 }
1142
1143 static __inline__ E FMS(E a, E b, E c)
1144 {
1145 E x = a * b;
1146 x = x - c;
1147 return x;
1148 }
1149
1150 static __inline__ E FNMA(E a, E b, E c)
1151 {
1152 E x = a * b;
1153 x = - (x + c);
1154 return x;
1155 }
1156
1157 static __inline__ E FNMS(E a, E b, E c)
1158 {
1159 E x = a * b;
1160 x = - (x - c);
1161 return x;
1162 }
1163 #else
1164 #define FMA(a, b, c) (((a) * (b)) + (c))
1165 #define FMS(a, b, c) (((a) * (b)) - (c))
1166 #define FNMA(a, b, c) (- (((a) * (b)) + (c)))
1167 #define FNMS(a, b, c) ((c) - ((a) * (b)))
1168 #endif
1169
1170 #ifdef __cplusplus
1171 } /* extern "C" */
1172 #endif /* __cplusplus */
1173
1174 #endif /* __IFFTW_H__ */