comparison src/fftw-3.3.3/kernel/ifftw.h @ 95:89f5e221ed7b

Add FFTW3
author Chris Cannam <cannam@all-day-breakfast.com>
date Wed, 20 Mar 2013 15:35:50 +0000
parents
children
comparison
equal deleted inserted replaced
94:d278df1123f9 95:89f5e221ed7b
1 /*
2 * Copyright (c) 2003, 2007-11 Matteo Frigo
3 * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
4 *
5 * This program is free software; you can redistribute it and/or modify
6 * it under the terms of the GNU General Public License as published by
7 * the Free Software Foundation; either version 2 of the License, or
8 * (at your option) any later version.
9 *
10 * This program is distributed in the hope that it will be useful,
11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 * GNU General Public License for more details.
14 *
15 * You should have received a copy of the GNU General Public License
16 * along with this program; if not, write to the Free Software
17 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
18 *
19 */
20
21
22 /* FFTW internal header file */
23 #ifndef __IFFTW_H__
24 #define __IFFTW_H__
25
26 #include "config.h"
27
28 #include <stdlib.h> /* size_t */
29 #include <stdarg.h> /* va_list */
30 #include <stddef.h> /* ptrdiff_t */
31
32 #if HAVE_SYS_TYPES_H
33 # include <sys/types.h>
34 #endif
35
36 #if HAVE_STDINT_H
37 # include <stdint.h> /* uintptr_t, maybe */
38 #endif
39
40 #if HAVE_INTTYPES_H
41 # include <inttypes.h> /* uintptr_t, maybe */
42 #endif
43
44 #ifdef __cplusplus
45 extern "C"
46 {
47 #endif /* __cplusplus */
48
49 /* Windows annoyances -- since tests/hook.c uses some internal
50 FFTW functions, we need to given them the dllexport attribute
51 under Windows when compiling as a DLL (see api/fftw3.h). */
52 #if defined(FFTW_EXTERN)
53 # define IFFTW_EXTERN FFTW_EXTERN
54 #elif (defined(FFTW_DLL) || defined(DLL_EXPORT)) \
55 && (defined(_WIN32) || defined(__WIN32__))
56 # define IFFTW_EXTERN extern __declspec(dllexport)
57 #else
58 # define IFFTW_EXTERN extern
59 #endif
60
61 /* determine precision and name-mangling scheme */
62 #define CONCAT(prefix, name) prefix ## name
63 #if defined(FFTW_SINGLE)
64 typedef float R;
65 # define X(name) CONCAT(fftwf_, name)
66 #elif defined(FFTW_LDOUBLE)
67 typedef long double R;
68 # define X(name) CONCAT(fftwl_, name)
69 # define TRIGREAL_IS_LONG_DOUBLE
70 #elif defined(FFTW_QUAD)
71 typedef __float128 R;
72 # define X(name) CONCAT(fftwq_, name)
73 # define TRIGREAL_IS_QUAD
74 #else
75 typedef double R;
76 # define X(name) CONCAT(fftw_, name)
77 #endif
78
79 /*
80 integral type large enough to contain a stride (what ``int'' should
81 have been in the first place.
82 */
83 typedef ptrdiff_t INT;
84
85 /* dummy use of unused parameters to silence compiler warnings */
86 #define UNUSED(x) (void)x
87
88 #define NELEM(array) ((int) (sizeof(array) / sizeof((array)[0])))
89
90 #define FFT_SIGN (-1) /* sign convention for forward transforms */
91 extern void X(extract_reim)(int sign, R *c, R **r, R **i);
92
93 #define REGISTER_SOLVER(p, s) X(solver_register)(p, s)
94
95 #define STRINGIZEx(x) #x
96 #define STRINGIZE(x) STRINGIZEx(x)
97 #define CIMPLIES(ante, post) (!(ante) || (post))
98
99 /* define HAVE_SIMD if any simd extensions are supported */
100 #if defined(HAVE_SSE) || defined(HAVE_SSE2) || defined(HAVE_ALTIVEC) || \
101 defined(HAVE_MIPS_PS) || defined(HAVE_AVX)
102 #define HAVE_SIMD 1
103 #else
104 #define HAVE_SIMD 0
105 #endif
106
107 extern int X(have_simd_sse2)(void);
108 extern int X(have_simd_avx)(void);
109 extern int X(have_simd_altivec)(void);
110 extern int X(have_simd_neon)(void);
111
112 /* forward declarations */
113 typedef struct problem_s problem;
114 typedef struct plan_s plan;
115 typedef struct solver_s solver;
116 typedef struct planner_s planner;
117 typedef struct printer_s printer;
118 typedef struct scanner_s scanner;
119
120 /*-----------------------------------------------------------------------*/
121 /* alloca: */
122 #if HAVE_SIMD
123 # ifdef HAVE_AVX
124 # define MIN_ALIGNMENT 32 /* best alignment for AVX, conservative for
125 * everything else */
126 # else
127 /* Note that we cannot use 32-byte alignment for all SIMD. For
128 example, MacOS X malloc is 16-byte aligned, but there was no
129 posix_memalign in MacOS X until version 10.6. */
130 # define MIN_ALIGNMENT 16
131 # endif
132 #endif
133
134 #if defined(HAVE_ALLOCA) && defined(FFTW_ENABLE_ALLOCA)
135 /* use alloca if available */
136
137 #ifndef alloca
138 #ifdef __GNUC__
139 # define alloca __builtin_alloca
140 #else
141 # ifdef _MSC_VER
142 # include <malloc.h>
143 # define alloca _alloca
144 # else
145 # if HAVE_ALLOCA_H
146 # include <alloca.h>
147 # else
148 # ifdef _AIX
149 #pragma alloca
150 # else
151 # ifndef alloca /* predefined by HP cc +Olibcalls */
152 void *alloca(size_t);
153 # endif
154 # endif
155 # endif
156 # endif
157 #endif
158 #endif
159
160 # ifdef MIN_ALIGNMENT
161 # define STACK_MALLOC(T, p, n) \
162 { \
163 p = (T)alloca((n) + MIN_ALIGNMENT); \
164 p = (T)(((uintptr_t)p + (MIN_ALIGNMENT - 1)) & \
165 (~(uintptr_t)(MIN_ALIGNMENT - 1))); \
166 }
167 # define STACK_FREE(n)
168 # else /* HAVE_ALLOCA && !defined(MIN_ALIGNMENT) */
169 # define STACK_MALLOC(T, p, n) p = (T)alloca(n)
170 # define STACK_FREE(n)
171 # endif
172
173 #else /* ! HAVE_ALLOCA */
174 /* use malloc instead of alloca */
175 # define STACK_MALLOC(T, p, n) p = (T)MALLOC(n, OTHER)
176 # define STACK_FREE(n) X(ifree)(n)
177 #endif /* ! HAVE_ALLOCA */
178
179 /* allocation of buffers. If these grow too large use malloc(), else
180 use STACK_MALLOC (hopefully reducing to alloca()). */
181
182 /* 64KiB ought to be enough for anybody */
183 #define MAX_STACK_ALLOC ((size_t)64 * 1024)
184
185 #define BUF_ALLOC(T, p, n) \
186 { \
187 if (n < MAX_STACK_ALLOC) { \
188 STACK_MALLOC(T, p, n); \
189 } else { \
190 p = (T)MALLOC(n, BUFFERS); \
191 } \
192 }
193
194 #define BUF_FREE(p, n) \
195 { \
196 if (n < MAX_STACK_ALLOC) { \
197 STACK_FREE(p); \
198 } else { \
199 X(ifree)(p); \
200 } \
201 }
202
203 /*-----------------------------------------------------------------------*/
204 /* define uintptr_t if it is not already defined */
205
206 #ifndef HAVE_UINTPTR_T
207 # if SIZEOF_VOID_P == 0
208 # error sizeof void* is unknown!
209 # elif SIZEOF_UNSIGNED_INT == SIZEOF_VOID_P
210 typedef unsigned int uintptr_t;
211 # elif SIZEOF_UNSIGNED_LONG == SIZEOF_VOID_P
212 typedef unsigned long uintptr_t;
213 # elif SIZEOF_UNSIGNED_LONG_LONG == SIZEOF_VOID_P
214 typedef unsigned long long uintptr_t;
215 # else
216 # error no unsigned integer type matches void* sizeof!
217 # endif
218 #endif
219
220 /*-----------------------------------------------------------------------*/
221 /* We can do an optimization for copying pairs of (aligned) floats
222 when in single precision if 2*float = double. */
223
224 #define FFTW_2R_IS_DOUBLE (defined(FFTW_SINGLE) \
225 && SIZEOF_FLOAT != 0 \
226 && SIZEOF_DOUBLE == 2*SIZEOF_FLOAT)
227
228 #define DOUBLE_ALIGNED(p) ((((uintptr_t)(p)) % sizeof(double)) == 0)
229
230 /*-----------------------------------------------------------------------*/
231 /* assert.c: */
232 IFFTW_EXTERN void X(assertion_failed)(const char *s,
233 int line, const char *file);
234
235 /* always check */
236 #define CK(ex) \
237 (void)((ex) || (X(assertion_failed)(#ex, __LINE__, __FILE__), 0))
238
239 #ifdef FFTW_DEBUG
240 /* check only if debug enabled */
241 #define A(ex) \
242 (void)((ex) || (X(assertion_failed)(#ex, __LINE__, __FILE__), 0))
243 #else
244 #define A(ex) /* nothing */
245 #endif
246
247 extern void X(debug)(const char *format, ...);
248 #define D X(debug)
249
250 /*-----------------------------------------------------------------------*/
251 /* kalloc.c: */
252 extern void *X(kernel_malloc)(size_t n);
253 extern void X(kernel_free)(void *p);
254
255 /*-----------------------------------------------------------------------*/
256 /* alloc.c: */
257
258 /* objects allocated by malloc, for statistical purposes */
259 enum malloc_tag {
260 EVERYTHING,
261 PLANS,
262 SOLVERS,
263 PROBLEMS,
264 BUFFERS,
265 HASHT,
266 TENSORS,
267 PLANNERS,
268 SLVDESCS,
269 TWIDDLES,
270 STRIDES,
271 OTHER,
272 MALLOC_WHAT_LAST /* must be last */
273 };
274
275 IFFTW_EXTERN void X(ifree)(void *ptr);
276 extern void X(ifree0)(void *ptr);
277
278 #ifdef FFTW_DEBUG_MALLOC
279
280 IFFTW_EXTERN void *X(malloc_debug)(size_t n, enum malloc_tag what,
281 const char *file, int line);
282 #define MALLOC(n, what) X(malloc_debug)(n, what, __FILE__, __LINE__)
283 IFFTW_EXTERN void X(malloc_print_minfo)(int vrbose);
284
285 #else /* ! FFTW_DEBUG_MALLOC */
286
287 IFFTW_EXTERN void *X(malloc_plain)(size_t sz);
288 #define MALLOC(n, what) X(malloc_plain)(n)
289
290 #endif
291
292 #if defined(FFTW_DEBUG) && defined(FFTW_DEBUG_MALLOC) && (defined(HAVE_THREADS) || defined(HAVE_OPENMP))
293 extern int X(in_thread);
294 # define IN_THREAD X(in_thread)
295 # define THREAD_ON { int in_thread_save = X(in_thread); X(in_thread) = 1
296 # define THREAD_OFF X(in_thread) = in_thread_save; }
297 #else
298 # define IN_THREAD 0
299 # define THREAD_ON
300 # define THREAD_OFF
301 #endif
302
303 /*-----------------------------------------------------------------------*/
304 /* low-resolution clock */
305
306 #ifdef FAKE_CRUDE_TIME
307 typedef int crude_time;
308 #else
309 # if TIME_WITH_SYS_TIME
310 # include <sys/time.h>
311 # include <time.h>
312 # else
313 # if HAVE_SYS_TIME_H
314 # include <sys/time.h>
315 # else
316 # include <time.h>
317 # endif
318 # endif
319
320 # ifdef HAVE_BSDGETTIMEOFDAY
321 # ifndef HAVE_GETTIMEOFDAY
322 # define gettimeofday BSDgettimeofday
323 # define HAVE_GETTIMEOFDAY 1
324 # endif
325 # endif
326
327 # if defined(HAVE_GETTIMEOFDAY)
328 typedef struct timeval crude_time;
329 # else
330 typedef clock_t crude_time;
331 # endif
332 #endif /* else FAKE_CRUDE_TIME */
333
334 crude_time X(get_crude_time)(void);
335 double X(elapsed_since)(const planner *plnr, const problem *p,
336 crude_time t0); /* time in seconds since t0 */
337
338 /*-----------------------------------------------------------------------*/
339 /* ops.c: */
340 /*
341 * ops counter. The total number of additions is add + fma
342 * and the total number of multiplications is mul + fma.
343 * Total flops = add + mul + 2 * fma
344 */
345 typedef struct {
346 double add;
347 double mul;
348 double fma;
349 double other;
350 } opcnt;
351
352 void X(ops_zero)(opcnt *dst);
353 void X(ops_other)(INT o, opcnt *dst);
354 void X(ops_cpy)(const opcnt *src, opcnt *dst);
355
356 void X(ops_add)(const opcnt *a, const opcnt *b, opcnt *dst);
357 void X(ops_add2)(const opcnt *a, opcnt *dst);
358
359 /* dst = m * a + b */
360 void X(ops_madd)(INT m, const opcnt *a, const opcnt *b, opcnt *dst);
361
362 /* dst += m * a */
363 void X(ops_madd2)(INT m, const opcnt *a, opcnt *dst);
364
365
366 /*-----------------------------------------------------------------------*/
367 /* minmax.c: */
368 INT X(imax)(INT a, INT b);
369 INT X(imin)(INT a, INT b);
370
371 /*-----------------------------------------------------------------------*/
372 /* iabs.c: */
373 INT X(iabs)(INT a);
374
375 /* inline version */
376 #define IABS(x) (((x) < 0) ? (0 - (x)) : (x))
377
378 /*-----------------------------------------------------------------------*/
379 /* md5.c */
380
381 #if SIZEOF_UNSIGNED_INT >= 4
382 typedef unsigned int md5uint;
383 #else
384 typedef unsigned long md5uint; /* at least 32 bits as per C standard */
385 #endif
386
387 typedef md5uint md5sig[4];
388
389 typedef struct {
390 md5sig s; /* state and signature */
391
392 /* fields not meant to be used outside md5.c: */
393 unsigned char c[64]; /* stuff not yet processed */
394 unsigned l; /* total length. Should be 64 bits long, but this is
395 good enough for us */
396 } md5;
397
398 void X(md5begin)(md5 *p);
399 void X(md5putb)(md5 *p, const void *d_, size_t len);
400 void X(md5puts)(md5 *p, const char *s);
401 void X(md5putc)(md5 *p, unsigned char c);
402 void X(md5int)(md5 *p, int i);
403 void X(md5INT)(md5 *p, INT i);
404 void X(md5unsigned)(md5 *p, unsigned i);
405 void X(md5end)(md5 *p);
406
407 /*-----------------------------------------------------------------------*/
408 /* tensor.c: */
409 #define STRUCT_HACK_KR
410 #undef STRUCT_HACK_C99
411
412 typedef struct {
413 INT n;
414 INT is; /* input stride */
415 INT os; /* output stride */
416 } iodim;
417
418 typedef struct {
419 int rnk;
420 #if defined(STRUCT_HACK_KR)
421 iodim dims[1];
422 #elif defined(STRUCT_HACK_C99)
423 iodim dims[];
424 #else
425 iodim *dims;
426 #endif
427 } tensor;
428
429 /*
430 Definition of rank -infinity.
431 This definition has the property that if you want rank 0 or 1,
432 you can simply test for rank <= 1. This is a common case.
433
434 A tensor of rank -infinity has size 0.
435 */
436 #define RNK_MINFTY ((int)(((unsigned) -1) >> 1))
437 #define FINITE_RNK(rnk) ((rnk) != RNK_MINFTY)
438
439 typedef enum { INPLACE_IS, INPLACE_OS } inplace_kind;
440
441 tensor *X(mktensor)(int rnk);
442 tensor *X(mktensor_0d)(void);
443 tensor *X(mktensor_1d)(INT n, INT is, INT os);
444 tensor *X(mktensor_2d)(INT n0, INT is0, INT os0,
445 INT n1, INT is1, INT os1);
446 tensor *X(mktensor_3d)(INT n0, INT is0, INT os0,
447 INT n1, INT is1, INT os1,
448 INT n2, INT is2, INT os2);
449 tensor *X(mktensor_4d)(INT n0, INT is0, INT os0,
450 INT n1, INT is1, INT os1,
451 INT n2, INT is2, INT os2,
452 INT n3, INT is3, INT os3);
453 tensor *X(mktensor_5d)(INT n0, INT is0, INT os0,
454 INT n1, INT is1, INT os1,
455 INT n2, INT is2, INT os2,
456 INT n3, INT is3, INT os3,
457 INT n4, INT is4, INT os4);
458 INT X(tensor_sz)(const tensor *sz);
459 void X(tensor_md5)(md5 *p, const tensor *t);
460 INT X(tensor_max_index)(const tensor *sz);
461 INT X(tensor_min_istride)(const tensor *sz);
462 INT X(tensor_min_ostride)(const tensor *sz);
463 INT X(tensor_min_stride)(const tensor *sz);
464 int X(tensor_inplace_strides)(const tensor *sz);
465 int X(tensor_inplace_strides2)(const tensor *a, const tensor *b);
466 int X(tensor_strides_decrease)(const tensor *sz, const tensor *vecsz,
467 inplace_kind k);
468 tensor *X(tensor_copy)(const tensor *sz);
469 int X(tensor_kosherp)(const tensor *x);
470
471 tensor *X(tensor_copy_inplace)(const tensor *sz, inplace_kind k);
472 tensor *X(tensor_copy_except)(const tensor *sz, int except_dim);
473 tensor *X(tensor_copy_sub)(const tensor *sz, int start_dim, int rnk);
474 tensor *X(tensor_compress)(const tensor *sz);
475 tensor *X(tensor_compress_contiguous)(const tensor *sz);
476 tensor *X(tensor_append)(const tensor *a, const tensor *b);
477 void X(tensor_split)(const tensor *sz, tensor **a, int a_rnk, tensor **b);
478 int X(tensor_tornk1)(const tensor *t, INT *n, INT *is, INT *os);
479 void X(tensor_destroy)(tensor *sz);
480 void X(tensor_destroy2)(tensor *a, tensor *b);
481 void X(tensor_destroy4)(tensor *a, tensor *b, tensor *c, tensor *d);
482 void X(tensor_print)(const tensor *sz, printer *p);
483 int X(dimcmp)(const iodim *a, const iodim *b);
484 int X(tensor_equal)(const tensor *a, const tensor *b);
485 int X(tensor_inplace_locations)(const tensor *sz, const tensor *vecsz);
486
487 /*-----------------------------------------------------------------------*/
488 /* problem.c: */
489 enum {
490 /* a problem that cannot be solved */
491 PROBLEM_UNSOLVABLE,
492
493 PROBLEM_DFT,
494 PROBLEM_RDFT,
495 PROBLEM_RDFT2,
496
497 /* for mpi/ subdirectory */
498 PROBLEM_MPI_DFT,
499 PROBLEM_MPI_RDFT,
500 PROBLEM_MPI_RDFT2,
501 PROBLEM_MPI_TRANSPOSE,
502
503 PROBLEM_LAST
504 };
505
506 typedef struct {
507 int problem_kind;
508 void (*hash) (const problem *ego, md5 *p);
509 void (*zero) (const problem *ego);
510 void (*print) (const problem *ego, printer *p);
511 void (*destroy) (problem *ego);
512 } problem_adt;
513
514 struct problem_s {
515 const problem_adt *adt;
516 };
517
518 problem *X(mkproblem)(size_t sz, const problem_adt *adt);
519 void X(problem_destroy)(problem *ego);
520 problem *X(mkproblem_unsolvable)(void);
521
522 /*-----------------------------------------------------------------------*/
523 /* print.c */
524 struct printer_s {
525 void (*print)(printer *p, const char *format, ...);
526 void (*vprint)(printer *p, const char *format, va_list ap);
527 void (*putchr)(printer *p, char c);
528 void (*cleanup)(printer *p);
529 int indent;
530 int indent_incr;
531 };
532
533 printer *X(mkprinter)(size_t size,
534 void (*putchr)(printer *p, char c),
535 void (*cleanup)(printer *p));
536 IFFTW_EXTERN void X(printer_destroy)(printer *p);
537
538 /*-----------------------------------------------------------------------*/
539 /* scan.c */
540 struct scanner_s {
541 int (*scan)(scanner *sc, const char *format, ...);
542 int (*vscan)(scanner *sc, const char *format, va_list ap);
543 int (*getchr)(scanner *sc);
544 int ungotc;
545 };
546
547 scanner *X(mkscanner)(size_t size, int (*getchr)(scanner *sc));
548 void X(scanner_destroy)(scanner *sc);
549
550 /*-----------------------------------------------------------------------*/
551 /* plan.c: */
552
553 enum wakefulness {
554 SLEEPY,
555 AWAKE_ZERO,
556 AWAKE_SQRTN_TABLE,
557 AWAKE_SINCOS
558 };
559
560 typedef struct {
561 void (*solve)(const plan *ego, const problem *p);
562 void (*awake)(plan *ego, enum wakefulness wakefulness);
563 void (*print)(const plan *ego, printer *p);
564 void (*destroy)(plan *ego);
565 } plan_adt;
566
567 struct plan_s {
568 const plan_adt *adt;
569 opcnt ops;
570 double pcost;
571 enum wakefulness wakefulness; /* used for debugging only */
572 int could_prune_now_p;
573 };
574
575 plan *X(mkplan)(size_t size, const plan_adt *adt);
576 void X(plan_destroy_internal)(plan *ego);
577 IFFTW_EXTERN void X(plan_awake)(plan *ego, enum wakefulness wakefulness);
578 void X(plan_null_destroy)(plan *ego);
579
580 /*-----------------------------------------------------------------------*/
581 /* solver.c: */
582 typedef struct {
583 int problem_kind;
584 plan *(*mkplan)(const solver *ego, const problem *p, planner *plnr);
585 void (*destroy)(solver *ego);
586 } solver_adt;
587
588 struct solver_s {
589 const solver_adt *adt;
590 int refcnt;
591 };
592
593 solver *X(mksolver)(size_t size, const solver_adt *adt);
594 void X(solver_use)(solver *ego);
595 void X(solver_destroy)(solver *ego);
596 void X(solver_register)(planner *plnr, solver *s);
597
598 /* shorthand */
599 #define MKSOLVER(type, adt) (type *)X(mksolver)(sizeof(type), adt)
600
601 /*-----------------------------------------------------------------------*/
602 /* planner.c */
603
604 typedef struct slvdesc_s {
605 solver *slv;
606 const char *reg_nam;
607 unsigned nam_hash;
608 int reg_id;
609 int next_for_same_problem_kind;
610 } slvdesc;
611
612 typedef struct solution_s solution; /* opaque */
613
614 /* interpretation of L and U:
615
616 - if it returns a plan, the planner guarantees that all applicable
617 plans at least as impatient as U have been tried, and that each
618 plan in the solution is at least as impatient as L.
619
620 - if it returns 0, the planner guarantees to have tried all solvers
621 at least as impatient as L, and that none of them was applicable.
622
623 The structure is packed to fit into 64 bits.
624 */
625
626 typedef struct {
627 unsigned l:20;
628 unsigned hash_info:3;
629 # define BITS_FOR_TIMELIMIT 9
630 unsigned timelimit_impatience:BITS_FOR_TIMELIMIT;
631 unsigned u:20;
632
633 /* abstraction break: we store the solver here to pad the
634 structure to 64 bits. Otherwise, the struct is padded to 64
635 bits anyway, and another word is allocated for slvndx. */
636 # define BITS_FOR_SLVNDX 12
637 unsigned slvndx:BITS_FOR_SLVNDX;
638 } flags_t;
639
640 /* impatience flags */
641 enum {
642 BELIEVE_PCOST = 0x0001,
643 ESTIMATE = 0x0002,
644 NO_DFT_R2HC = 0x0004,
645 NO_SLOW = 0x0008,
646 NO_VRECURSE = 0x0010,
647 NO_INDIRECT_OP = 0x0020,
648 NO_LARGE_GENERIC = 0x0040,
649 NO_RANK_SPLITS = 0x0080,
650 NO_VRANK_SPLITS = 0x0100,
651 NO_NONTHREADED = 0x0200,
652 NO_BUFFERING = 0x0400,
653 NO_FIXED_RADIX_LARGE_N = 0x0800,
654 NO_DESTROY_INPUT = 0x1000,
655 NO_SIMD = 0x2000,
656 CONSERVE_MEMORY = 0x4000,
657 NO_DHT_R2HC = 0x8000,
658 NO_UGLY = 0x10000,
659 ALLOW_PRUNING = 0x20000
660 };
661
662 /* hashtable information */
663 enum {
664 BLESSING = 0x1, /* save this entry */
665 H_VALID = 0x2, /* valid hastable entry */
666 H_LIVE = 0x4 /* entry is nonempty, implies H_VALID */
667 };
668
669 #define PLNR_L(plnr) ((plnr)->flags.l)
670 #define PLNR_U(plnr) ((plnr)->flags.u)
671 #define PLNR_TIMELIMIT_IMPATIENCE(plnr) ((plnr)->flags.timelimit_impatience)
672
673 #define ESTIMATEP(plnr) (PLNR_U(plnr) & ESTIMATE)
674 #define BELIEVE_PCOSTP(plnr) (PLNR_U(plnr) & BELIEVE_PCOST)
675 #define ALLOW_PRUNINGP(plnr) (PLNR_U(plnr) & ALLOW_PRUNING)
676
677 #define NO_INDIRECT_OP_P(plnr) (PLNR_L(plnr) & NO_INDIRECT_OP)
678 #define NO_LARGE_GENERICP(plnr) (PLNR_L(plnr) & NO_LARGE_GENERIC)
679 #define NO_RANK_SPLITSP(plnr) (PLNR_L(plnr) & NO_RANK_SPLITS)
680 #define NO_VRANK_SPLITSP(plnr) (PLNR_L(plnr) & NO_VRANK_SPLITS)
681 #define NO_VRECURSEP(plnr) (PLNR_L(plnr) & NO_VRECURSE)
682 #define NO_DFT_R2HCP(plnr) (PLNR_L(plnr) & NO_DFT_R2HC)
683 #define NO_SLOWP(plnr) (PLNR_L(plnr) & NO_SLOW)
684 #define NO_UGLYP(plnr) (PLNR_L(plnr) & NO_UGLY)
685 #define NO_FIXED_RADIX_LARGE_NP(plnr) \
686 (PLNR_L(plnr) & NO_FIXED_RADIX_LARGE_N)
687 #define NO_NONTHREADEDP(plnr) \
688 ((PLNR_L(plnr) & NO_NONTHREADED) && (plnr)->nthr > 1)
689
690 #define NO_DESTROY_INPUTP(plnr) (PLNR_L(plnr) & NO_DESTROY_INPUT)
691 #define NO_SIMDP(plnr) (PLNR_L(plnr) & NO_SIMD)
692 #define CONSERVE_MEMORYP(plnr) (PLNR_L(plnr) & CONSERVE_MEMORY)
693 #define NO_DHT_R2HCP(plnr) (PLNR_L(plnr) & NO_DHT_R2HC)
694 #define NO_BUFFERINGP(plnr) (PLNR_L(plnr) & NO_BUFFERING)
695
696 typedef enum { FORGET_ACCURSED, FORGET_EVERYTHING } amnesia;
697
698 typedef enum {
699 /* WISDOM_NORMAL: planner may or may not use wisdom */
700 WISDOM_NORMAL,
701
702 /* WISDOM_ONLY: planner must use wisdom and must avoid searching */
703 WISDOM_ONLY,
704
705 /* WISDOM_IS_BOGUS: planner must return 0 as quickly as possible */
706 WISDOM_IS_BOGUS,
707
708 /* WISDOM_IGNORE_INFEASIBLE: planner ignores infeasible wisdom */
709 WISDOM_IGNORE_INFEASIBLE,
710
711 /* WISDOM_IGNORE_ALL: planner ignores all */
712 WISDOM_IGNORE_ALL
713 } wisdom_state_t;
714
715 typedef struct {
716 void (*register_solver)(planner *ego, solver *s);
717 plan *(*mkplan)(planner *ego, const problem *p);
718 void (*forget)(planner *ego, amnesia a);
719 void (*exprt)(planner *ego, printer *p); /* ``export'' is a reserved
720 word in C++. */
721 int (*imprt)(planner *ego, scanner *sc);
722 } planner_adt;
723
724 /* hash table of solutions */
725 typedef struct {
726 solution *solutions;
727 unsigned hashsiz, nelem;
728
729 /* statistics */
730 int lookup, succ_lookup, lookup_iter;
731 int insert, insert_iter, insert_unknown;
732 int nrehash;
733 } hashtab;
734
735 typedef enum { COST_SUM, COST_MAX } cost_kind;
736
737 struct planner_s {
738 const planner_adt *adt;
739 void (*hook)(struct planner_s *plnr, plan *pln,
740 const problem *p, int optimalp);
741 double (*cost_hook)(const problem *p, double t, cost_kind k);
742 int (*wisdom_ok_hook)(const problem *p, flags_t flags);
743 void (*nowisdom_hook)(const problem *p);
744 wisdom_state_t (*bogosity_hook)(wisdom_state_t state, const problem *p);
745
746 /* solver descriptors */
747 slvdesc *slvdescs;
748 unsigned nslvdesc, slvdescsiz;
749 const char *cur_reg_nam;
750 int cur_reg_id;
751 int slvdescs_for_problem_kind[PROBLEM_LAST];
752
753 wisdom_state_t wisdom_state;
754
755 hashtab htab_blessed;
756 hashtab htab_unblessed;
757
758 int nthr;
759 flags_t flags;
760
761 crude_time start_time;
762 double timelimit; /* elapsed_since(start_time) at which to bail out */
763 int timed_out; /* whether most recent search timed out */
764 int need_timeout_check;
765
766 /* various statistics */
767 int nplan; /* number of plans evaluated */
768 double pcost, epcost; /* total pcost of measured/estimated plans */
769 int nprob; /* number of problems evaluated */
770 };
771
772 planner *X(mkplanner)(void);
773 void X(planner_destroy)(planner *ego);
774
775 /*
776 Iterate over all solvers. Read:
777
778 @article{ baker93iterators,
779 author = "Henry G. Baker, Jr.",
780 title = "Iterators: Signs of Weakness in Object-Oriented Languages",
781 journal = "{ACM} {OOPS} Messenger",
782 volume = "4",
783 number = "3",
784 pages = "18--25"
785 }
786 */
787 #define FORALL_SOLVERS(ego, s, p, what) \
788 { \
789 unsigned _cnt; \
790 for (_cnt = 0; _cnt < ego->nslvdesc; ++_cnt) { \
791 slvdesc *p = ego->slvdescs + _cnt; \
792 solver *s = p->slv; \
793 what; \
794 } \
795 }
796
797 #define FORALL_SOLVERS_OF_KIND(kind, ego, s, p, what) \
798 { \
799 int _cnt = ego->slvdescs_for_problem_kind[kind]; \
800 while (_cnt >= 0) { \
801 slvdesc *p = ego->slvdescs + _cnt; \
802 solver *s = p->slv; \
803 what; \
804 _cnt = p->next_for_same_problem_kind; \
805 } \
806 }
807
808
809 /* make plan, destroy problem */
810 plan *X(mkplan_d)(planner *ego, problem *p);
811 plan *X(mkplan_f_d)(planner *ego, problem *p,
812 unsigned l_set, unsigned u_set, unsigned u_reset);
813
814 /*-----------------------------------------------------------------------*/
815 /* stride.c: */
816
817 /* If PRECOMPUTE_ARRAY_INDICES is defined, precompute all strides. */
818 #if (defined(__i386__) || defined(__x86_64__) || _M_IX86 >= 500) && !defined(FFTW_LDOUBLE)
819 #define PRECOMPUTE_ARRAY_INDICES
820 #endif
821
822 extern const INT X(an_INT_guaranteed_to_be_zero);
823
824 #ifdef PRECOMPUTE_ARRAY_INDICES
825 typedef INT *stride;
826 #define WS(stride, i) (stride[i])
827 extern stride X(mkstride)(INT n, INT s);
828 void X(stride_destroy)(stride p);
829 /* hackery to prevent the compiler from copying the strides array
830 onto the stack */
831 #define MAKE_VOLATILE_STRIDE(nptr, x) (x) = (x) + X(an_INT_guaranteed_to_be_zero)
832 #else
833
834 typedef INT stride;
835 #define WS(stride, i) (stride * i)
836 #define fftwf_mkstride(n, stride) stride
837 #define fftw_mkstride(n, stride) stride
838 #define fftwl_mkstride(n, stride) stride
839 #define fftwf_stride_destroy(p) ((void) p)
840 #define fftw_stride_destroy(p) ((void) p)
841 #define fftwl_stride_destroy(p) ((void) p)
842
843 /* hackery to prevent the compiler from ``optimizing'' induction
844 variables in codelet loops. The problem is that for each K and for
845 each expression of the form P[I + STRIDE * K] in a loop, most
846 compilers will try to lift an induction variable PK := &P[I + STRIDE * K].
847 For large values of K this behavior overflows the
848 register set, which is likely worse than doing the index computation
849 in the first place.
850
851 If we guess that there are more than
852 ESTIMATED_AVAILABLE_INDEX_REGISTERS such pointers, we deliberately confuse
853 the compiler by setting STRIDE ^= ZERO, where ZERO is a value guaranteed to
854 be 0, but the compiler does not know this.
855
856 16 registers ought to be enough for anybody, or so the amd64 and ARM ISA's
857 seem to imply.
858 */
859 #define ESTIMATED_AVAILABLE_INDEX_REGISTERS 16
860 #define MAKE_VOLATILE_STRIDE(nptr, x) \
861 (nptr <= ESTIMATED_AVAILABLE_INDEX_REGISTERS ? \
862 0 : \
863 ((x) = (x) ^ X(an_INT_guaranteed_to_be_zero)))
864 #endif /* PRECOMPUTE_ARRAY_INDICES */
865
866 /*-----------------------------------------------------------------------*/
867 /* solvtab.c */
868
869 struct solvtab_s { void (*reg)(planner *); const char *reg_nam; };
870 typedef struct solvtab_s solvtab[];
871 void X(solvtab_exec)(const solvtab tbl, planner *p);
872 #define SOLVTAB(s) { s, STRINGIZE(s) }
873 #define SOLVTAB_END { 0, 0 }
874
875 /*-----------------------------------------------------------------------*/
876 /* pickdim.c */
877 int X(pickdim)(int which_dim, const int *buddies, int nbuddies,
878 const tensor *sz, int oop, int *dp);
879
880 /*-----------------------------------------------------------------------*/
881 /* twiddle.c */
882 /* little language to express twiddle factors computation */
883 enum { TW_COS = 0, TW_SIN = 1, TW_CEXP = 2, TW_NEXT = 3,
884 TW_FULL = 4, TW_HALF = 5 };
885
886 typedef struct {
887 unsigned char op;
888 signed char v;
889 short i;
890 } tw_instr;
891
892 typedef struct twid_s {
893 R *W; /* array of twiddle factors */
894 INT n, r, m; /* transform order, radix, # twiddle rows */
895 int refcnt;
896 const tw_instr *instr;
897 struct twid_s *cdr;
898 enum wakefulness wakefulness;
899 } twid;
900
901 INT X(twiddle_length)(INT r, const tw_instr *p);
902 void X(twiddle_awake)(enum wakefulness wakefulness,
903 twid **pp, const tw_instr *instr, INT n, INT r, INT m);
904
905 /*-----------------------------------------------------------------------*/
906 /* trig.c */
907 #if defined(TRIGREAL_IS_LONG_DOUBLE)
908 typedef long double trigreal;
909 #elif defined(TRIGREAL_IS_QUAD)
910 typedef __float128 trigreal;
911 #else
912 typedef double trigreal;
913 #endif
914
915 typedef struct triggen_s triggen;
916
917 struct triggen_s {
918 void (*cexp)(triggen *t, INT m, R *result);
919 void (*cexpl)(triggen *t, INT m, trigreal *result);
920 void (*rotate)(triggen *p, INT m, R xr, R xi, R *res);
921
922 INT twshft;
923 INT twradix;
924 INT twmsk;
925 trigreal *W0, *W1;
926 INT n;
927 };
928
929 triggen *X(mktriggen)(enum wakefulness wakefulness, INT n);
930 void X(triggen_destroy)(triggen *p);
931
932 /*-----------------------------------------------------------------------*/
933 /* primes.c: */
934
935 #define MULMOD(x, y, p) \
936 (((x) <= 92681 - (y)) ? ((x) * (y)) % (p) : X(safe_mulmod)(x, y, p))
937
938 INT X(safe_mulmod)(INT x, INT y, INT p);
939 INT X(power_mod)(INT n, INT m, INT p);
940 INT X(find_generator)(INT p);
941 INT X(first_divisor)(INT n);
942 int X(is_prime)(INT n);
943 INT X(next_prime)(INT n);
944 int X(factors_into)(INT n, const INT *primes);
945 int X(factors_into_small_primes)(INT n);
946 INT X(choose_radix)(INT r, INT n);
947 INT X(isqrt)(INT n);
948 INT X(modulo)(INT a, INT n);
949
950 #define GENERIC_MIN_BAD 173 /* min prime for which generic becomes bad */
951
952 /* thresholds below which certain solvers are considered SLOW. These are guesses
953 believed to be conservative */
954 #define GENERIC_MAX_SLOW 16
955 #define RADER_MAX_SLOW 32
956 #define BLUESTEIN_MAX_SLOW 24
957
958 /*-----------------------------------------------------------------------*/
959 /* rader.c: */
960 typedef struct rader_tls rader_tl;
961
962 void X(rader_tl_insert)(INT k1, INT k2, INT k3, R *W, rader_tl **tl);
963 R *X(rader_tl_find)(INT k1, INT k2, INT k3, rader_tl *t);
964 void X(rader_tl_delete)(R *W, rader_tl **tl);
965
966 /*-----------------------------------------------------------------------*/
967 /* copy/transposition routines */
968
969 /* lower bound to the cache size, for tiled routines */
970 #define CACHESIZE 8192
971
972 INT X(compute_tilesz)(INT vl, int how_many_tiles_in_cache);
973
974 void X(tile2d)(INT n0l, INT n0u, INT n1l, INT n1u, INT tilesz,
975 void (*f)(INT n0l, INT n0u, INT n1l, INT n1u, void *args),
976 void *args);
977 void X(cpy1d)(R *I, R *O, INT n0, INT is0, INT os0, INT vl);
978 void X(cpy2d)(R *I, R *O,
979 INT n0, INT is0, INT os0,
980 INT n1, INT is1, INT os1,
981 INT vl);
982 void X(cpy2d_ci)(R *I, R *O,
983 INT n0, INT is0, INT os0,
984 INT n1, INT is1, INT os1,
985 INT vl);
986 void X(cpy2d_co)(R *I, R *O,
987 INT n0, INT is0, INT os0,
988 INT n1, INT is1, INT os1,
989 INT vl);
990 void X(cpy2d_tiled)(R *I, R *O,
991 INT n0, INT is0, INT os0,
992 INT n1, INT is1, INT os1,
993 INT vl);
994 void X(cpy2d_tiledbuf)(R *I, R *O,
995 INT n0, INT is0, INT os0,
996 INT n1, INT is1, INT os1,
997 INT vl);
998 void X(cpy2d_pair)(R *I0, R *I1, R *O0, R *O1,
999 INT n0, INT is0, INT os0,
1000 INT n1, INT is1, INT os1);
1001 void X(cpy2d_pair_ci)(R *I0, R *I1, R *O0, R *O1,
1002 INT n0, INT is0, INT os0,
1003 INT n1, INT is1, INT os1);
1004 void X(cpy2d_pair_co)(R *I0, R *I1, R *O0, R *O1,
1005 INT n0, INT is0, INT os0,
1006 INT n1, INT is1, INT os1);
1007
1008 void X(transpose)(R *I, INT n, INT s0, INT s1, INT vl);
1009 void X(transpose_tiled)(R *I, INT n, INT s0, INT s1, INT vl);
1010 void X(transpose_tiledbuf)(R *I, INT n, INT s0, INT s1, INT vl);
1011
1012 typedef void (*transpose_func)(R *I, INT n, INT s0, INT s1, INT vl);
1013 typedef void (*cpy2d_func)(R *I, R *O,
1014 INT n0, INT is0, INT os0,
1015 INT n1, INT is1, INT os1,
1016 INT vl);
1017
1018 /*-----------------------------------------------------------------------*/
1019 /* misc stuff */
1020 void X(null_awake)(plan *ego, enum wakefulness wakefulness);
1021 double X(iestimate_cost)(const planner *, const plan *, const problem *);
1022
1023 #ifdef FFTW_RANDOM_ESTIMATOR
1024 extern unsigned X(random_estimate_seed);
1025 #endif
1026
1027 double X(measure_execution_time)(const planner *plnr,
1028 plan *pln, const problem *p);
1029 int X(alignment_of)(R *p);
1030 unsigned X(hash)(const char *s);
1031 INT X(nbuf)(INT n, INT vl, INT maxnbuf);
1032 int X(nbuf_redundant)(INT n, INT vl, int which,
1033 const INT *maxnbuf, int nmaxnbuf);
1034 INT X(bufdist)(INT n, INT vl);
1035 int X(toobig)(INT n);
1036 int X(ct_uglyp)(INT min_n, INT v, INT n, INT r);
1037
1038 #if HAVE_SIMD
1039 R *X(taint)(R *p, INT s);
1040 R *X(join_taint)(R *p1, R *p2);
1041 #define TAINT(p, s) X(taint)(p, s)
1042 #define UNTAINT(p) ((R *) (((uintptr_t) (p)) & ~(uintptr_t)3))
1043 #define TAINTOF(p) (((uintptr_t)(p)) & 3)
1044 #define JOIN_TAINT(p1, p2) X(join_taint)(p1, p2)
1045 #else
1046 #define TAINT(p, s) (p)
1047 #define UNTAINT(p) (p)
1048 #define TAINTOF(p) 0
1049 #define JOIN_TAINT(p1, p2) p1
1050 #endif
1051
1052 #ifdef FFTW_DEBUG_ALIGNMENT
1053 # define ASSERT_ALIGNED_DOUBLE { \
1054 double __foo; \
1055 CK(!(((uintptr_t) &__foo) & 0x7)); \
1056 }
1057 #else
1058 # define ASSERT_ALIGNED_DOUBLE
1059 #endif /* FFTW_DEBUG_ALIGNMENT */
1060
1061
1062
1063 /*-----------------------------------------------------------------------*/
1064 /* macros used in codelets to reduce source code size */
1065
1066 typedef R E; /* internal precision of codelets. */
1067
1068 #if defined(FFTW_LDOUBLE)
1069 # define K(x) ((E) x##L)
1070 #elif defined(FFTW_QUAD)
1071 # define K(x) ((E) x##Q)
1072 #else
1073 # define K(x) ((E) x)
1074 #endif
1075 #define DK(name, value) const E name = K(value)
1076
1077 /* FMA macros */
1078
1079 #if defined(__GNUC__) && (defined(__powerpc__) || defined(__ppc__) || defined(_POWER))
1080 /* The obvious expression a * b + c does not work. If both x = a * b
1081 + c and y = a * b - c appear in the source, gcc computes t = a * b,
1082 x = t + c, y = t - c, thus destroying the fma.
1083
1084 This peculiar coding seems to do the right thing on all of
1085 gcc-2.95, gcc-3.1, gcc-3.2, and gcc-3.3. It does the right thing
1086 on gcc-3.4 -fno-web (because the ``web'' pass splits the variable
1087 `x' for the single-assignment form).
1088
1089 However, gcc-4.0 is a formidable adversary which succeeds in
1090 pessimizing two fma's into one multiplication and two additions.
1091 It does it very early in the game---before the optimization passes
1092 even start. The only real workaround seems to use fake inline asm
1093 such as
1094
1095 asm ("# confuse gcc %0" : "=f"(a) : "0"(a));
1096 return a * b + c;
1097
1098 in each of the FMA, FMS, FNMA, and FNMS functions. However, this
1099 does not solve the problem either, because two equal asm statements
1100 count as a common subexpression! One must use *different* fake asm
1101 statements:
1102
1103 in FMA:
1104 asm ("# confuse gcc for fma %0" : "=f"(a) : "0"(a));
1105
1106 in FMS:
1107 asm ("# confuse gcc for fms %0" : "=f"(a) : "0"(a));
1108
1109 etc.
1110
1111 After these changes, gcc recalcitrantly generates the fma that was
1112 in the source to begin with. However, the extra asm() cruft
1113 confuses other passes of gcc, notably the instruction scheduler.
1114 (Of course, one could also generate the fma directly via inline
1115 asm, but this confuses the scheduler even more.)
1116
1117 Steven and I have submitted more than one bug report to the gcc
1118 mailing list over the past few years, to no effect. Thus, I give
1119 up. gcc-4.0 can go to hell. I'll wait at least until gcc-4.3 is
1120 out before touching this crap again.
1121 */
1122 static __inline__ E FMA(E a, E b, E c)
1123 {
1124 E x = a * b;
1125 x = x + c;
1126 return x;
1127 }
1128
1129 static __inline__ E FMS(E a, E b, E c)
1130 {
1131 E x = a * b;
1132 x = x - c;
1133 return x;
1134 }
1135
1136 static __inline__ E FNMA(E a, E b, E c)
1137 {
1138 E x = a * b;
1139 x = - (x + c);
1140 return x;
1141 }
1142
1143 static __inline__ E FNMS(E a, E b, E c)
1144 {
1145 E x = a * b;
1146 x = - (x - c);
1147 return x;
1148 }
1149 #else
1150 #define FMA(a, b, c) (((a) * (b)) + (c))
1151 #define FMS(a, b, c) (((a) * (b)) - (c))
1152 #define FNMA(a, b, c) (- (((a) * (b)) + (c)))
1153 #define FNMS(a, b, c) ((c) - ((a) * (b)))
1154 #endif
1155
1156 #ifdef __cplusplus
1157 } /* extern "C" */
1158 #endif /* __cplusplus */
1159
1160 #endif /* __IFFTW_H__ */