Mercurial > hg > sv-dependency-builds
comparison src/fftw-3.3.5/kernel/ifftw.h @ 127:7867fa7e1b6b
Current fftw source
author | Chris Cannam <cannam@all-day-breakfast.com> |
---|---|
date | Tue, 18 Oct 2016 13:40:26 +0100 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
126:4a7071416412 | 127:7867fa7e1b6b |
---|---|
1 /* | |
2 * Copyright (c) 2003, 2007-14 Matteo Frigo | |
3 * Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology | |
4 * | |
5 * This program is free software; you can redistribute it and/or modify | |
6 * it under the terms of the GNU General Public License as published by | |
7 * the Free Software Foundation; either version 2 of the License, or | |
8 * (at your option) any later version. | |
9 * | |
10 * This program is distributed in the hope that it will be useful, | |
11 * but WITHOUT ANY WARRANTY; without even the implied warranty of | |
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | |
13 * GNU General Public License for more details. | |
14 * | |
15 * You should have received a copy of the GNU General Public License | |
16 * along with this program; if not, write to the Free Software | |
17 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA | |
18 * | |
19 */ | |
20 | |
21 | |
22 /* FFTW internal header file */ | |
23 #ifndef __IFFTW_H__ | |
24 #define __IFFTW_H__ | |
25 | |
26 #include "config.h" | |
27 | |
28 #include <stdlib.h> /* size_t */ | |
29 #include <stdarg.h> /* va_list */ | |
30 #include <stddef.h> /* ptrdiff_t */ | |
31 #include <limits.h> /* INT_MAX */ | |
32 | |
33 #if HAVE_SYS_TYPES_H | |
34 # include <sys/types.h> | |
35 #endif | |
36 | |
37 #if HAVE_STDINT_H | |
38 # include <stdint.h> /* uintptr_t, maybe */ | |
39 #endif | |
40 | |
41 #if HAVE_INTTYPES_H | |
42 # include <inttypes.h> /* uintptr_t, maybe */ | |
43 #endif | |
44 | |
45 #ifdef __cplusplus | |
46 extern "C" | |
47 { | |
48 #endif /* __cplusplus */ | |
49 | |
50 /* Windows annoyances -- since tests/hook.c uses some internal | |
51 FFTW functions, we need to given them the dllexport attribute | |
52 under Windows when compiling as a DLL (see api/fftw3.h). */ | |
53 #if defined(FFTW_EXTERN) | |
54 # define IFFTW_EXTERN FFTW_EXTERN | |
55 #elif (defined(FFTW_DLL) || defined(DLL_EXPORT)) \ | |
56 && (defined(_WIN32) || defined(__WIN32__)) | |
57 # define IFFTW_EXTERN extern __declspec(dllexport) | |
58 #else | |
59 # define IFFTW_EXTERN extern | |
60 #endif | |
61 | |
62 /* determine precision and name-mangling scheme */ | |
63 #define CONCAT(prefix, name) prefix ## name | |
64 #if defined(FFTW_SINGLE) | |
65 typedef float R; | |
66 # define X(name) CONCAT(fftwf_, name) | |
67 #elif defined(FFTW_LDOUBLE) | |
68 typedef long double R; | |
69 # define X(name) CONCAT(fftwl_, name) | |
70 # define TRIGREAL_IS_LONG_DOUBLE | |
71 #elif defined(FFTW_QUAD) | |
72 typedef __float128 R; | |
73 # define X(name) CONCAT(fftwq_, name) | |
74 # define TRIGREAL_IS_QUAD | |
75 #else | |
76 typedef double R; | |
77 # define X(name) CONCAT(fftw_, name) | |
78 #endif | |
79 | |
80 /* | |
81 integral type large enough to contain a stride (what ``int'' should | |
82 have been in the first place. | |
83 */ | |
84 typedef ptrdiff_t INT; | |
85 | |
86 /* dummy use of unused parameters to silence compiler warnings */ | |
87 #define UNUSED(x) (void)x | |
88 | |
89 #define NELEM(array) ((sizeof(array) / sizeof((array)[0]))) | |
90 | |
91 #define FFT_SIGN (-1) /* sign convention for forward transforms */ | |
92 extern void X(extract_reim)(int sign, R *c, R **r, R **i); | |
93 | |
94 #define REGISTER_SOLVER(p, s) X(solver_register)(p, s) | |
95 | |
96 #define STRINGIZEx(x) #x | |
97 #define STRINGIZE(x) STRINGIZEx(x) | |
98 #define CIMPLIES(ante, post) (!(ante) || (post)) | |
99 | |
100 /* define HAVE_SIMD if any simd extensions are supported */ | |
101 #if defined(HAVE_SSE) || defined(HAVE_SSE2) || \ | |
102 defined(HAVE_AVX) || defined(HAVE_AVX_128_FMA) || \ | |
103 defined(HAVE_AVX2) || defined(HAVE_AVX512) || \ | |
104 defined(HAVE_KCVI) || \ | |
105 defined(HAVE_ALTIVEC) || defined(HAVE_VSX) || \ | |
106 defined(HAVE_MIPS_PS) || \ | |
107 defined(HAVE_GENERIC_SIMD128) || defined(HAVE_GENERIC_SIMD256) | |
108 #define HAVE_SIMD 1 | |
109 #else | |
110 #define HAVE_SIMD 0 | |
111 #endif | |
112 | |
113 extern int X(have_simd_sse2)(void); | |
114 extern int X(have_simd_avx)(void); | |
115 extern int X(have_simd_avx_128_fma)(void); | |
116 extern int X(have_simd_avx2)(void); | |
117 extern int X(have_simd_avx2_128)(void); | |
118 extern int X(have_simd_avx512)(void); | |
119 extern int X(have_simd_altivec)(void); | |
120 extern int X(have_simd_vsx)(void); | |
121 extern int X(have_simd_neon)(void); | |
122 | |
123 /* forward declarations */ | |
124 typedef struct problem_s problem; | |
125 typedef struct plan_s plan; | |
126 typedef struct solver_s solver; | |
127 typedef struct planner_s planner; | |
128 typedef struct printer_s printer; | |
129 typedef struct scanner_s scanner; | |
130 | |
131 /*-----------------------------------------------------------------------*/ | |
132 /* alloca: */ | |
133 #if HAVE_SIMD | |
134 # if defined(HAVE_KCVI) || defined(HAVE_AVX512) | |
135 # define MIN_ALIGNMENT 64 | |
136 # elif defined(HAVE_AVX) || defined(HAVE_AVX2) || defined(HAVE_GENERIC_SIMD256) | |
137 # define MIN_ALIGNMENT 32 /* best alignment for AVX, conservative for | |
138 * everything else */ | |
139 # else | |
140 /* Note that we cannot use 32-byte alignment for all SIMD. For | |
141 example, MacOS X malloc is 16-byte aligned, but there was no | |
142 posix_memalign in MacOS X until version 10.6. */ | |
143 # define MIN_ALIGNMENT 16 | |
144 # endif | |
145 #endif | |
146 | |
147 #if defined(HAVE_ALLOCA) && defined(FFTW_ENABLE_ALLOCA) | |
148 /* use alloca if available */ | |
149 | |
150 #ifndef alloca | |
151 #ifdef __GNUC__ | |
152 # define alloca __builtin_alloca | |
153 #else | |
154 # ifdef _MSC_VER | |
155 # include <malloc.h> | |
156 # define alloca _alloca | |
157 # else | |
158 # if HAVE_ALLOCA_H | |
159 # include <alloca.h> | |
160 # else | |
161 # ifdef _AIX | |
162 #pragma alloca | |
163 # else | |
164 # ifndef alloca /* predefined by HP cc +Olibcalls */ | |
165 void *alloca(size_t); | |
166 # endif | |
167 # endif | |
168 # endif | |
169 # endif | |
170 #endif | |
171 #endif | |
172 | |
173 # ifdef MIN_ALIGNMENT | |
174 # define STACK_MALLOC(T, p, n) \ | |
175 { \ | |
176 p = (T)alloca((n) + MIN_ALIGNMENT); \ | |
177 p = (T)(((uintptr_t)p + (MIN_ALIGNMENT - 1)) & \ | |
178 (~(uintptr_t)(MIN_ALIGNMENT - 1))); \ | |
179 } | |
180 # define STACK_FREE(n) | |
181 # else /* HAVE_ALLOCA && !defined(MIN_ALIGNMENT) */ | |
182 # define STACK_MALLOC(T, p, n) p = (T)alloca(n) | |
183 # define STACK_FREE(n) | |
184 # endif | |
185 | |
186 #else /* ! HAVE_ALLOCA */ | |
187 /* use malloc instead of alloca */ | |
188 # define STACK_MALLOC(T, p, n) p = (T)MALLOC(n, OTHER) | |
189 # define STACK_FREE(n) X(ifree)(n) | |
190 #endif /* ! HAVE_ALLOCA */ | |
191 | |
192 /* allocation of buffers. If these grow too large use malloc(), else | |
193 use STACK_MALLOC (hopefully reducing to alloca()). */ | |
194 | |
195 /* 64KiB ought to be enough for anybody */ | |
196 #define MAX_STACK_ALLOC ((size_t)64 * 1024) | |
197 | |
198 #define BUF_ALLOC(T, p, n) \ | |
199 { \ | |
200 if (n < MAX_STACK_ALLOC) { \ | |
201 STACK_MALLOC(T, p, n); \ | |
202 } else { \ | |
203 p = (T)MALLOC(n, BUFFERS); \ | |
204 } \ | |
205 } | |
206 | |
207 #define BUF_FREE(p, n) \ | |
208 { \ | |
209 if (n < MAX_STACK_ALLOC) { \ | |
210 STACK_FREE(p); \ | |
211 } else { \ | |
212 X(ifree)(p); \ | |
213 } \ | |
214 } | |
215 | |
216 /*-----------------------------------------------------------------------*/ | |
217 /* define uintptr_t if it is not already defined */ | |
218 | |
219 #ifndef HAVE_UINTPTR_T | |
220 # if SIZEOF_VOID_P == 0 | |
221 # error sizeof void* is unknown! | |
222 # elif SIZEOF_UNSIGNED_INT == SIZEOF_VOID_P | |
223 typedef unsigned int uintptr_t; | |
224 # elif SIZEOF_UNSIGNED_LONG == SIZEOF_VOID_P | |
225 typedef unsigned long uintptr_t; | |
226 # elif SIZEOF_UNSIGNED_LONG_LONG == SIZEOF_VOID_P | |
227 typedef unsigned long long uintptr_t; | |
228 # else | |
229 # error no unsigned integer type matches void* sizeof! | |
230 # endif | |
231 #endif | |
232 | |
233 /*-----------------------------------------------------------------------*/ | |
234 /* We can do an optimization for copying pairs of (aligned) floats | |
235 when in single precision if 2*float = double. */ | |
236 | |
237 #define FFTW_2R_IS_DOUBLE (defined(FFTW_SINGLE) \ | |
238 && SIZEOF_FLOAT != 0 \ | |
239 && SIZEOF_DOUBLE == 2*SIZEOF_FLOAT) | |
240 | |
241 #define DOUBLE_ALIGNED(p) ((((uintptr_t)(p)) % sizeof(double)) == 0) | |
242 | |
243 /*-----------------------------------------------------------------------*/ | |
244 /* assert.c: */ | |
245 IFFTW_EXTERN void X(assertion_failed)(const char *s, | |
246 int line, const char *file); | |
247 | |
248 /* always check */ | |
249 #define CK(ex) \ | |
250 (void)((ex) || (X(assertion_failed)(#ex, __LINE__, __FILE__), 0)) | |
251 | |
252 #ifdef FFTW_DEBUG | |
253 /* check only if debug enabled */ | |
254 #define A(ex) \ | |
255 (void)((ex) || (X(assertion_failed)(#ex, __LINE__, __FILE__), 0)) | |
256 #else | |
257 #define A(ex) /* nothing */ | |
258 #endif | |
259 | |
260 extern void X(debug)(const char *format, ...); | |
261 #define D X(debug) | |
262 | |
263 /*-----------------------------------------------------------------------*/ | |
264 /* kalloc.c: */ | |
265 extern void *X(kernel_malloc)(size_t n); | |
266 extern void X(kernel_free)(void *p); | |
267 | |
268 /*-----------------------------------------------------------------------*/ | |
269 /* alloc.c: */ | |
270 | |
271 /* objects allocated by malloc, for statistical purposes */ | |
272 enum malloc_tag { | |
273 EVERYTHING, | |
274 PLANS, | |
275 SOLVERS, | |
276 PROBLEMS, | |
277 BUFFERS, | |
278 HASHT, | |
279 TENSORS, | |
280 PLANNERS, | |
281 SLVDESCS, | |
282 TWIDDLES, | |
283 STRIDES, | |
284 OTHER, | |
285 MALLOC_WHAT_LAST /* must be last */ | |
286 }; | |
287 | |
288 IFFTW_EXTERN void X(ifree)(void *ptr); | |
289 extern void X(ifree0)(void *ptr); | |
290 | |
291 #ifdef FFTW_DEBUG_MALLOC | |
292 | |
293 IFFTW_EXTERN void *X(malloc_debug)(size_t n, enum malloc_tag what, | |
294 const char *file, int line); | |
295 #define MALLOC(n, what) X(malloc_debug)(n, what, __FILE__, __LINE__) | |
296 IFFTW_EXTERN void X(malloc_print_minfo)(int vrbose); | |
297 | |
298 #else /* ! FFTW_DEBUG_MALLOC */ | |
299 | |
300 IFFTW_EXTERN void *X(malloc_plain)(size_t sz); | |
301 #define MALLOC(n, what) X(malloc_plain)(n) | |
302 | |
303 #endif | |
304 | |
305 #if defined(FFTW_DEBUG) && defined(FFTW_DEBUG_MALLOC) && (defined(HAVE_THREADS) || defined(HAVE_OPENMP)) | |
306 extern int X(in_thread); | |
307 # define IN_THREAD X(in_thread) | |
308 # define THREAD_ON { int in_thread_save = X(in_thread); X(in_thread) = 1 | |
309 # define THREAD_OFF X(in_thread) = in_thread_save; } | |
310 #else | |
311 # define IN_THREAD 0 | |
312 # define THREAD_ON | |
313 # define THREAD_OFF | |
314 #endif | |
315 | |
316 /*-----------------------------------------------------------------------*/ | |
317 /* low-resolution clock */ | |
318 | |
319 #ifdef FAKE_CRUDE_TIME | |
320 typedef int crude_time; | |
321 #else | |
322 # if TIME_WITH_SYS_TIME | |
323 # include <sys/time.h> | |
324 # include <time.h> | |
325 # else | |
326 # if HAVE_SYS_TIME_H | |
327 # include <sys/time.h> | |
328 # else | |
329 # include <time.h> | |
330 # endif | |
331 # endif | |
332 | |
333 # ifdef HAVE_BSDGETTIMEOFDAY | |
334 # ifndef HAVE_GETTIMEOFDAY | |
335 # define gettimeofday BSDgettimeofday | |
336 # define HAVE_GETTIMEOFDAY 1 | |
337 # endif | |
338 # endif | |
339 | |
340 # if defined(HAVE_GETTIMEOFDAY) | |
341 typedef struct timeval crude_time; | |
342 # else | |
343 typedef clock_t crude_time; | |
344 # endif | |
345 #endif /* else FAKE_CRUDE_TIME */ | |
346 | |
347 crude_time X(get_crude_time)(void); | |
348 double X(elapsed_since)(const planner *plnr, const problem *p, | |
349 crude_time t0); /* time in seconds since t0 */ | |
350 | |
351 /*-----------------------------------------------------------------------*/ | |
352 /* ops.c: */ | |
353 /* | |
354 * ops counter. The total number of additions is add + fma | |
355 * and the total number of multiplications is mul + fma. | |
356 * Total flops = add + mul + 2 * fma | |
357 */ | |
358 typedef struct { | |
359 double add; | |
360 double mul; | |
361 double fma; | |
362 double other; | |
363 } opcnt; | |
364 | |
365 void X(ops_zero)(opcnt *dst); | |
366 void X(ops_other)(INT o, opcnt *dst); | |
367 void X(ops_cpy)(const opcnt *src, opcnt *dst); | |
368 | |
369 void X(ops_add)(const opcnt *a, const opcnt *b, opcnt *dst); | |
370 void X(ops_add2)(const opcnt *a, opcnt *dst); | |
371 | |
372 /* dst = m * a + b */ | |
373 void X(ops_madd)(INT m, const opcnt *a, const opcnt *b, opcnt *dst); | |
374 | |
375 /* dst += m * a */ | |
376 void X(ops_madd2)(INT m, const opcnt *a, opcnt *dst); | |
377 | |
378 | |
379 /*-----------------------------------------------------------------------*/ | |
380 /* minmax.c: */ | |
381 INT X(imax)(INT a, INT b); | |
382 INT X(imin)(INT a, INT b); | |
383 | |
384 /*-----------------------------------------------------------------------*/ | |
385 /* iabs.c: */ | |
386 INT X(iabs)(INT a); | |
387 | |
388 /* inline version */ | |
389 #define IABS(x) (((x) < 0) ? (0 - (x)) : (x)) | |
390 | |
391 /*-----------------------------------------------------------------------*/ | |
392 /* md5.c */ | |
393 | |
394 #if SIZEOF_UNSIGNED_INT >= 4 | |
395 typedef unsigned int md5uint; | |
396 #else | |
397 typedef unsigned long md5uint; /* at least 32 bits as per C standard */ | |
398 #endif | |
399 | |
400 typedef md5uint md5sig[4]; | |
401 | |
402 typedef struct { | |
403 md5sig s; /* state and signature */ | |
404 | |
405 /* fields not meant to be used outside md5.c: */ | |
406 unsigned char c[64]; /* stuff not yet processed */ | |
407 unsigned l; /* total length. Should be 64 bits long, but this is | |
408 good enough for us */ | |
409 } md5; | |
410 | |
411 void X(md5begin)(md5 *p); | |
412 void X(md5putb)(md5 *p, const void *d_, size_t len); | |
413 void X(md5puts)(md5 *p, const char *s); | |
414 void X(md5putc)(md5 *p, unsigned char c); | |
415 void X(md5int)(md5 *p, int i); | |
416 void X(md5INT)(md5 *p, INT i); | |
417 void X(md5unsigned)(md5 *p, unsigned i); | |
418 void X(md5end)(md5 *p); | |
419 | |
420 /*-----------------------------------------------------------------------*/ | |
421 /* tensor.c: */ | |
422 #define STRUCT_HACK_KR | |
423 #undef STRUCT_HACK_C99 | |
424 | |
425 typedef struct { | |
426 INT n; | |
427 INT is; /* input stride */ | |
428 INT os; /* output stride */ | |
429 } iodim; | |
430 | |
431 typedef struct { | |
432 int rnk; | |
433 #if defined(STRUCT_HACK_KR) | |
434 iodim dims[1]; | |
435 #elif defined(STRUCT_HACK_C99) | |
436 iodim dims[]; | |
437 #else | |
438 iodim *dims; | |
439 #endif | |
440 } tensor; | |
441 | |
442 /* | |
443 Definition of rank -infinity. | |
444 This definition has the property that if you want rank 0 or 1, | |
445 you can simply test for rank <= 1. This is a common case. | |
446 | |
447 A tensor of rank -infinity has size 0. | |
448 */ | |
449 #define RNK_MINFTY INT_MAX | |
450 #define FINITE_RNK(rnk) ((rnk) != RNK_MINFTY) | |
451 | |
452 typedef enum { INPLACE_IS, INPLACE_OS } inplace_kind; | |
453 | |
454 tensor *X(mktensor)(int rnk); | |
455 tensor *X(mktensor_0d)(void); | |
456 tensor *X(mktensor_1d)(INT n, INT is, INT os); | |
457 tensor *X(mktensor_2d)(INT n0, INT is0, INT os0, | |
458 INT n1, INT is1, INT os1); | |
459 tensor *X(mktensor_3d)(INT n0, INT is0, INT os0, | |
460 INT n1, INT is1, INT os1, | |
461 INT n2, INT is2, INT os2); | |
462 tensor *X(mktensor_4d)(INT n0, INT is0, INT os0, | |
463 INT n1, INT is1, INT os1, | |
464 INT n2, INT is2, INT os2, | |
465 INT n3, INT is3, INT os3); | |
466 tensor *X(mktensor_5d)(INT n0, INT is0, INT os0, | |
467 INT n1, INT is1, INT os1, | |
468 INT n2, INT is2, INT os2, | |
469 INT n3, INT is3, INT os3, | |
470 INT n4, INT is4, INT os4); | |
471 INT X(tensor_sz)(const tensor *sz); | |
472 void X(tensor_md5)(md5 *p, const tensor *t); | |
473 INT X(tensor_max_index)(const tensor *sz); | |
474 INT X(tensor_min_istride)(const tensor *sz); | |
475 INT X(tensor_min_ostride)(const tensor *sz); | |
476 INT X(tensor_min_stride)(const tensor *sz); | |
477 int X(tensor_inplace_strides)(const tensor *sz); | |
478 int X(tensor_inplace_strides2)(const tensor *a, const tensor *b); | |
479 int X(tensor_strides_decrease)(const tensor *sz, const tensor *vecsz, | |
480 inplace_kind k); | |
481 tensor *X(tensor_copy)(const tensor *sz); | |
482 int X(tensor_kosherp)(const tensor *x); | |
483 | |
484 tensor *X(tensor_copy_inplace)(const tensor *sz, inplace_kind k); | |
485 tensor *X(tensor_copy_except)(const tensor *sz, int except_dim); | |
486 tensor *X(tensor_copy_sub)(const tensor *sz, int start_dim, int rnk); | |
487 tensor *X(tensor_compress)(const tensor *sz); | |
488 tensor *X(tensor_compress_contiguous)(const tensor *sz); | |
489 tensor *X(tensor_append)(const tensor *a, const tensor *b); | |
490 void X(tensor_split)(const tensor *sz, tensor **a, int a_rnk, tensor **b); | |
491 int X(tensor_tornk1)(const tensor *t, INT *n, INT *is, INT *os); | |
492 void X(tensor_destroy)(tensor *sz); | |
493 void X(tensor_destroy2)(tensor *a, tensor *b); | |
494 void X(tensor_destroy4)(tensor *a, tensor *b, tensor *c, tensor *d); | |
495 void X(tensor_print)(const tensor *sz, printer *p); | |
496 int X(dimcmp)(const iodim *a, const iodim *b); | |
497 int X(tensor_equal)(const tensor *a, const tensor *b); | |
498 int X(tensor_inplace_locations)(const tensor *sz, const tensor *vecsz); | |
499 | |
500 /*-----------------------------------------------------------------------*/ | |
501 /* problem.c: */ | |
502 enum { | |
503 /* a problem that cannot be solved */ | |
504 PROBLEM_UNSOLVABLE, | |
505 | |
506 PROBLEM_DFT, | |
507 PROBLEM_RDFT, | |
508 PROBLEM_RDFT2, | |
509 | |
510 /* for mpi/ subdirectory */ | |
511 PROBLEM_MPI_DFT, | |
512 PROBLEM_MPI_RDFT, | |
513 PROBLEM_MPI_RDFT2, | |
514 PROBLEM_MPI_TRANSPOSE, | |
515 | |
516 PROBLEM_LAST | |
517 }; | |
518 | |
519 typedef struct { | |
520 int problem_kind; | |
521 void (*hash) (const problem *ego, md5 *p); | |
522 void (*zero) (const problem *ego); | |
523 void (*print) (const problem *ego, printer *p); | |
524 void (*destroy) (problem *ego); | |
525 } problem_adt; | |
526 | |
527 struct problem_s { | |
528 const problem_adt *adt; | |
529 }; | |
530 | |
531 problem *X(mkproblem)(size_t sz, const problem_adt *adt); | |
532 void X(problem_destroy)(problem *ego); | |
533 problem *X(mkproblem_unsolvable)(void); | |
534 | |
535 /*-----------------------------------------------------------------------*/ | |
536 /* print.c */ | |
537 struct printer_s { | |
538 void (*print)(printer *p, const char *format, ...); | |
539 void (*vprint)(printer *p, const char *format, va_list ap); | |
540 void (*putchr)(printer *p, char c); | |
541 void (*cleanup)(printer *p); | |
542 int indent; | |
543 int indent_incr; | |
544 }; | |
545 | |
546 printer *X(mkprinter)(size_t size, | |
547 void (*putchr)(printer *p, char c), | |
548 void (*cleanup)(printer *p)); | |
549 IFFTW_EXTERN void X(printer_destroy)(printer *p); | |
550 | |
551 /*-----------------------------------------------------------------------*/ | |
552 /* scan.c */ | |
553 struct scanner_s { | |
554 int (*scan)(scanner *sc, const char *format, ...); | |
555 int (*vscan)(scanner *sc, const char *format, va_list ap); | |
556 int (*getchr)(scanner *sc); | |
557 int ungotc; | |
558 }; | |
559 | |
560 scanner *X(mkscanner)(size_t size, int (*getchr)(scanner *sc)); | |
561 void X(scanner_destroy)(scanner *sc); | |
562 | |
563 /*-----------------------------------------------------------------------*/ | |
564 /* plan.c: */ | |
565 | |
566 enum wakefulness { | |
567 SLEEPY, | |
568 AWAKE_ZERO, | |
569 AWAKE_SQRTN_TABLE, | |
570 AWAKE_SINCOS | |
571 }; | |
572 | |
573 typedef struct { | |
574 void (*solve)(const plan *ego, const problem *p); | |
575 void (*awake)(plan *ego, enum wakefulness wakefulness); | |
576 void (*print)(const plan *ego, printer *p); | |
577 void (*destroy)(plan *ego); | |
578 } plan_adt; | |
579 | |
580 struct plan_s { | |
581 const plan_adt *adt; | |
582 opcnt ops; | |
583 double pcost; | |
584 enum wakefulness wakefulness; /* used for debugging only */ | |
585 int could_prune_now_p; | |
586 }; | |
587 | |
588 plan *X(mkplan)(size_t size, const plan_adt *adt); | |
589 void X(plan_destroy_internal)(plan *ego); | |
590 IFFTW_EXTERN void X(plan_awake)(plan *ego, enum wakefulness wakefulness); | |
591 void X(plan_null_destroy)(plan *ego); | |
592 | |
593 /*-----------------------------------------------------------------------*/ | |
594 /* solver.c: */ | |
595 typedef struct { | |
596 int problem_kind; | |
597 plan *(*mkplan)(const solver *ego, const problem *p, planner *plnr); | |
598 void (*destroy)(solver *ego); | |
599 } solver_adt; | |
600 | |
601 struct solver_s { | |
602 const solver_adt *adt; | |
603 int refcnt; | |
604 }; | |
605 | |
606 solver *X(mksolver)(size_t size, const solver_adt *adt); | |
607 void X(solver_use)(solver *ego); | |
608 void X(solver_destroy)(solver *ego); | |
609 void X(solver_register)(planner *plnr, solver *s); | |
610 | |
611 /* shorthand */ | |
612 #define MKSOLVER(type, adt) (type *)X(mksolver)(sizeof(type), adt) | |
613 | |
614 /*-----------------------------------------------------------------------*/ | |
615 /* planner.c */ | |
616 | |
617 typedef struct slvdesc_s { | |
618 solver *slv; | |
619 const char *reg_nam; | |
620 unsigned nam_hash; | |
621 int reg_id; | |
622 int next_for_same_problem_kind; | |
623 } slvdesc; | |
624 | |
625 typedef struct solution_s solution; /* opaque */ | |
626 | |
627 /* interpretation of L and U: | |
628 | |
629 - if it returns a plan, the planner guarantees that all applicable | |
630 plans at least as impatient as U have been tried, and that each | |
631 plan in the solution is at least as impatient as L. | |
632 | |
633 - if it returns 0, the planner guarantees to have tried all solvers | |
634 at least as impatient as L, and that none of them was applicable. | |
635 | |
636 The structure is packed to fit into 64 bits. | |
637 */ | |
638 | |
639 typedef struct { | |
640 unsigned l:20; | |
641 unsigned hash_info:3; | |
642 # define BITS_FOR_TIMELIMIT 9 | |
643 unsigned timelimit_impatience:BITS_FOR_TIMELIMIT; | |
644 unsigned u:20; | |
645 | |
646 /* abstraction break: we store the solver here to pad the | |
647 structure to 64 bits. Otherwise, the struct is padded to 64 | |
648 bits anyway, and another word is allocated for slvndx. */ | |
649 # define BITS_FOR_SLVNDX 12 | |
650 unsigned slvndx:BITS_FOR_SLVNDX; | |
651 } flags_t; | |
652 | |
653 /* impatience flags */ | |
654 enum { | |
655 BELIEVE_PCOST = 0x0001, | |
656 ESTIMATE = 0x0002, | |
657 NO_DFT_R2HC = 0x0004, | |
658 NO_SLOW = 0x0008, | |
659 NO_VRECURSE = 0x0010, | |
660 NO_INDIRECT_OP = 0x0020, | |
661 NO_LARGE_GENERIC = 0x0040, | |
662 NO_RANK_SPLITS = 0x0080, | |
663 NO_VRANK_SPLITS = 0x0100, | |
664 NO_NONTHREADED = 0x0200, | |
665 NO_BUFFERING = 0x0400, | |
666 NO_FIXED_RADIX_LARGE_N = 0x0800, | |
667 NO_DESTROY_INPUT = 0x1000, | |
668 NO_SIMD = 0x2000, | |
669 CONSERVE_MEMORY = 0x4000, | |
670 NO_DHT_R2HC = 0x8000, | |
671 NO_UGLY = 0x10000, | |
672 ALLOW_PRUNING = 0x20000 | |
673 }; | |
674 | |
675 /* hashtable information */ | |
676 enum { | |
677 BLESSING = 0x1u, /* save this entry */ | |
678 H_VALID = 0x2u, /* valid hastable entry */ | |
679 H_LIVE = 0x4u /* entry is nonempty, implies H_VALID */ | |
680 }; | |
681 | |
682 #define PLNR_L(plnr) ((plnr)->flags.l) | |
683 #define PLNR_U(plnr) ((plnr)->flags.u) | |
684 #define PLNR_TIMELIMIT_IMPATIENCE(plnr) ((plnr)->flags.timelimit_impatience) | |
685 | |
686 #define ESTIMATEP(plnr) (PLNR_U(plnr) & ESTIMATE) | |
687 #define BELIEVE_PCOSTP(plnr) (PLNR_U(plnr) & BELIEVE_PCOST) | |
688 #define ALLOW_PRUNINGP(plnr) (PLNR_U(plnr) & ALLOW_PRUNING) | |
689 | |
690 #define NO_INDIRECT_OP_P(plnr) (PLNR_L(plnr) & NO_INDIRECT_OP) | |
691 #define NO_LARGE_GENERICP(plnr) (PLNR_L(plnr) & NO_LARGE_GENERIC) | |
692 #define NO_RANK_SPLITSP(plnr) (PLNR_L(plnr) & NO_RANK_SPLITS) | |
693 #define NO_VRANK_SPLITSP(plnr) (PLNR_L(plnr) & NO_VRANK_SPLITS) | |
694 #define NO_VRECURSEP(plnr) (PLNR_L(plnr) & NO_VRECURSE) | |
695 #define NO_DFT_R2HCP(plnr) (PLNR_L(plnr) & NO_DFT_R2HC) | |
696 #define NO_SLOWP(plnr) (PLNR_L(plnr) & NO_SLOW) | |
697 #define NO_UGLYP(plnr) (PLNR_L(plnr) & NO_UGLY) | |
698 #define NO_FIXED_RADIX_LARGE_NP(plnr) \ | |
699 (PLNR_L(plnr) & NO_FIXED_RADIX_LARGE_N) | |
700 #define NO_NONTHREADEDP(plnr) \ | |
701 ((PLNR_L(plnr) & NO_NONTHREADED) && (plnr)->nthr > 1) | |
702 | |
703 #define NO_DESTROY_INPUTP(plnr) (PLNR_L(plnr) & NO_DESTROY_INPUT) | |
704 #define NO_SIMDP(plnr) (PLNR_L(plnr) & NO_SIMD) | |
705 #define CONSERVE_MEMORYP(plnr) (PLNR_L(plnr) & CONSERVE_MEMORY) | |
706 #define NO_DHT_R2HCP(plnr) (PLNR_L(plnr) & NO_DHT_R2HC) | |
707 #define NO_BUFFERINGP(plnr) (PLNR_L(plnr) & NO_BUFFERING) | |
708 | |
709 typedef enum { FORGET_ACCURSED, FORGET_EVERYTHING } amnesia; | |
710 | |
711 typedef enum { | |
712 /* WISDOM_NORMAL: planner may or may not use wisdom */ | |
713 WISDOM_NORMAL, | |
714 | |
715 /* WISDOM_ONLY: planner must use wisdom and must avoid searching */ | |
716 WISDOM_ONLY, | |
717 | |
718 /* WISDOM_IS_BOGUS: planner must return 0 as quickly as possible */ | |
719 WISDOM_IS_BOGUS, | |
720 | |
721 /* WISDOM_IGNORE_INFEASIBLE: planner ignores infeasible wisdom */ | |
722 WISDOM_IGNORE_INFEASIBLE, | |
723 | |
724 /* WISDOM_IGNORE_ALL: planner ignores all */ | |
725 WISDOM_IGNORE_ALL | |
726 } wisdom_state_t; | |
727 | |
728 typedef struct { | |
729 void (*register_solver)(planner *ego, solver *s); | |
730 plan *(*mkplan)(planner *ego, const problem *p); | |
731 void (*forget)(planner *ego, amnesia a); | |
732 void (*exprt)(planner *ego, printer *p); /* ``export'' is a reserved | |
733 word in C++. */ | |
734 int (*imprt)(planner *ego, scanner *sc); | |
735 } planner_adt; | |
736 | |
737 /* hash table of solutions */ | |
738 typedef struct { | |
739 solution *solutions; | |
740 unsigned hashsiz, nelem; | |
741 | |
742 /* statistics */ | |
743 int lookup, succ_lookup, lookup_iter; | |
744 int insert, insert_iter, insert_unknown; | |
745 int nrehash; | |
746 } hashtab; | |
747 | |
748 typedef enum { COST_SUM, COST_MAX } cost_kind; | |
749 | |
750 struct planner_s { | |
751 const planner_adt *adt; | |
752 void (*hook)(struct planner_s *plnr, plan *pln, | |
753 const problem *p, int optimalp); | |
754 double (*cost_hook)(const problem *p, double t, cost_kind k); | |
755 int (*wisdom_ok_hook)(const problem *p, flags_t flags); | |
756 void (*nowisdom_hook)(const problem *p); | |
757 wisdom_state_t (*bogosity_hook)(wisdom_state_t state, const problem *p); | |
758 | |
759 /* solver descriptors */ | |
760 slvdesc *slvdescs; | |
761 unsigned nslvdesc, slvdescsiz; | |
762 const char *cur_reg_nam; | |
763 int cur_reg_id; | |
764 int slvdescs_for_problem_kind[PROBLEM_LAST]; | |
765 | |
766 wisdom_state_t wisdom_state; | |
767 | |
768 hashtab htab_blessed; | |
769 hashtab htab_unblessed; | |
770 | |
771 int nthr; | |
772 flags_t flags; | |
773 | |
774 crude_time start_time; | |
775 double timelimit; /* elapsed_since(start_time) at which to bail out */ | |
776 int timed_out; /* whether most recent search timed out */ | |
777 int need_timeout_check; | |
778 | |
779 /* various statistics */ | |
780 int nplan; /* number of plans evaluated */ | |
781 double pcost, epcost; /* total pcost of measured/estimated plans */ | |
782 int nprob; /* number of problems evaluated */ | |
783 }; | |
784 | |
785 planner *X(mkplanner)(void); | |
786 void X(planner_destroy)(planner *ego); | |
787 | |
788 /* | |
789 Iterate over all solvers. Read: | |
790 | |
791 @article{ baker93iterators, | |
792 author = "Henry G. Baker, Jr.", | |
793 title = "Iterators: Signs of Weakness in Object-Oriented Languages", | |
794 journal = "{ACM} {OOPS} Messenger", | |
795 volume = "4", | |
796 number = "3", | |
797 pages = "18--25" | |
798 } | |
799 */ | |
800 #define FORALL_SOLVERS(ego, s, p, what) \ | |
801 { \ | |
802 unsigned _cnt; \ | |
803 for (_cnt = 0; _cnt < ego->nslvdesc; ++_cnt) { \ | |
804 slvdesc *p = ego->slvdescs + _cnt; \ | |
805 solver *s = p->slv; \ | |
806 what; \ | |
807 } \ | |
808 } | |
809 | |
810 #define FORALL_SOLVERS_OF_KIND(kind, ego, s, p, what) \ | |
811 { \ | |
812 int _cnt = ego->slvdescs_for_problem_kind[kind]; \ | |
813 while (_cnt >= 0) { \ | |
814 slvdesc *p = ego->slvdescs + _cnt; \ | |
815 solver *s = p->slv; \ | |
816 what; \ | |
817 _cnt = p->next_for_same_problem_kind; \ | |
818 } \ | |
819 } | |
820 | |
821 | |
822 /* make plan, destroy problem */ | |
823 plan *X(mkplan_d)(planner *ego, problem *p); | |
824 plan *X(mkplan_f_d)(planner *ego, problem *p, | |
825 unsigned l_set, unsigned u_set, unsigned u_reset); | |
826 | |
827 /*-----------------------------------------------------------------------*/ | |
828 /* stride.c: */ | |
829 | |
830 /* If PRECOMPUTE_ARRAY_INDICES is defined, precompute all strides. */ | |
831 #if (defined(__i386__) || defined(__x86_64__) || _M_IX86 >= 500) && !defined(FFTW_LDOUBLE) | |
832 #define PRECOMPUTE_ARRAY_INDICES | |
833 #endif | |
834 | |
835 extern const INT X(an_INT_guaranteed_to_be_zero); | |
836 | |
837 #ifdef PRECOMPUTE_ARRAY_INDICES | |
838 typedef INT *stride; | |
839 #define WS(stride, i) (stride[i]) | |
840 extern stride X(mkstride)(INT n, INT s); | |
841 void X(stride_destroy)(stride p); | |
842 /* hackery to prevent the compiler from copying the strides array | |
843 onto the stack */ | |
844 #define MAKE_VOLATILE_STRIDE(nptr, x) (x) = (x) + X(an_INT_guaranteed_to_be_zero) | |
845 #else | |
846 | |
847 typedef INT stride; | |
848 #define WS(stride, i) (stride * i) | |
849 #define fftwf_mkstride(n, stride) stride | |
850 #define fftw_mkstride(n, stride) stride | |
851 #define fftwl_mkstride(n, stride) stride | |
852 #define fftwf_stride_destroy(p) ((void) p) | |
853 #define fftw_stride_destroy(p) ((void) p) | |
854 #define fftwl_stride_destroy(p) ((void) p) | |
855 | |
856 /* hackery to prevent the compiler from ``optimizing'' induction | |
857 variables in codelet loops. The problem is that for each K and for | |
858 each expression of the form P[I + STRIDE * K] in a loop, most | |
859 compilers will try to lift an induction variable PK := &P[I + STRIDE * K]. | |
860 For large values of K this behavior overflows the | |
861 register set, which is likely worse than doing the index computation | |
862 in the first place. | |
863 | |
864 If we guess that there are more than | |
865 ESTIMATED_AVAILABLE_INDEX_REGISTERS such pointers, we deliberately confuse | |
866 the compiler by setting STRIDE ^= ZERO, where ZERO is a value guaranteed to | |
867 be 0, but the compiler does not know this. | |
868 | |
869 16 registers ought to be enough for anybody, or so the amd64 and ARM ISA's | |
870 seem to imply. | |
871 */ | |
872 #define ESTIMATED_AVAILABLE_INDEX_REGISTERS 16 | |
873 #define MAKE_VOLATILE_STRIDE(nptr, x) \ | |
874 (nptr <= ESTIMATED_AVAILABLE_INDEX_REGISTERS ? \ | |
875 0 : \ | |
876 ((x) = (x) ^ X(an_INT_guaranteed_to_be_zero))) | |
877 #endif /* PRECOMPUTE_ARRAY_INDICES */ | |
878 | |
879 /*-----------------------------------------------------------------------*/ | |
880 /* solvtab.c */ | |
881 | |
882 struct solvtab_s { void (*reg)(planner *); const char *reg_nam; }; | |
883 typedef struct solvtab_s solvtab[]; | |
884 void X(solvtab_exec)(const solvtab tbl, planner *p); | |
885 #define SOLVTAB(s) { s, STRINGIZE(s) } | |
886 #define SOLVTAB_END { 0, 0 } | |
887 | |
888 /*-----------------------------------------------------------------------*/ | |
889 /* pickdim.c */ | |
890 int X(pickdim)(int which_dim, const int *buddies, size_t nbuddies, | |
891 const tensor *sz, int oop, int *dp); | |
892 | |
893 /*-----------------------------------------------------------------------*/ | |
894 /* twiddle.c */ | |
895 /* little language to express twiddle factors computation */ | |
896 enum { TW_COS = 0, TW_SIN = 1, TW_CEXP = 2, TW_NEXT = 3, | |
897 TW_FULL = 4, TW_HALF = 5 }; | |
898 | |
899 typedef struct { | |
900 unsigned char op; | |
901 signed char v; | |
902 short i; | |
903 } tw_instr; | |
904 | |
905 typedef struct twid_s { | |
906 R *W; /* array of twiddle factors */ | |
907 INT n, r, m; /* transform order, radix, # twiddle rows */ | |
908 int refcnt; | |
909 const tw_instr *instr; | |
910 struct twid_s *cdr; | |
911 enum wakefulness wakefulness; | |
912 } twid; | |
913 | |
914 INT X(twiddle_length)(INT r, const tw_instr *p); | |
915 void X(twiddle_awake)(enum wakefulness wakefulness, | |
916 twid **pp, const tw_instr *instr, INT n, INT r, INT m); | |
917 | |
918 /*-----------------------------------------------------------------------*/ | |
919 /* trig.c */ | |
920 #if defined(TRIGREAL_IS_LONG_DOUBLE) | |
921 typedef long double trigreal; | |
922 #elif defined(TRIGREAL_IS_QUAD) | |
923 typedef __float128 trigreal; | |
924 #else | |
925 typedef double trigreal; | |
926 #endif | |
927 | |
928 typedef struct triggen_s triggen; | |
929 | |
930 struct triggen_s { | |
931 void (*cexp)(triggen *t, INT m, R *result); | |
932 void (*cexpl)(triggen *t, INT m, trigreal *result); | |
933 void (*rotate)(triggen *p, INT m, R xr, R xi, R *res); | |
934 | |
935 INT twshft; | |
936 INT twradix; | |
937 INT twmsk; | |
938 trigreal *W0, *W1; | |
939 INT n; | |
940 }; | |
941 | |
942 triggen *X(mktriggen)(enum wakefulness wakefulness, INT n); | |
943 void X(triggen_destroy)(triggen *p); | |
944 | |
945 /*-----------------------------------------------------------------------*/ | |
946 /* primes.c: */ | |
947 | |
948 #define MULMOD(x, y, p) \ | |
949 (((x) <= 92681 - (y)) ? ((x) * (y)) % (p) : X(safe_mulmod)(x, y, p)) | |
950 | |
951 INT X(safe_mulmod)(INT x, INT y, INT p); | |
952 INT X(power_mod)(INT n, INT m, INT p); | |
953 INT X(find_generator)(INT p); | |
954 INT X(first_divisor)(INT n); | |
955 int X(is_prime)(INT n); | |
956 INT X(next_prime)(INT n); | |
957 int X(factors_into)(INT n, const INT *primes); | |
958 int X(factors_into_small_primes)(INT n); | |
959 INT X(choose_radix)(INT r, INT n); | |
960 INT X(isqrt)(INT n); | |
961 INT X(modulo)(INT a, INT n); | |
962 | |
963 #define GENERIC_MIN_BAD 173 /* min prime for which generic becomes bad */ | |
964 | |
965 /* thresholds below which certain solvers are considered SLOW. These are guesses | |
966 believed to be conservative */ | |
967 #define GENERIC_MAX_SLOW 16 | |
968 #define RADER_MAX_SLOW 32 | |
969 #define BLUESTEIN_MAX_SLOW 24 | |
970 | |
971 /*-----------------------------------------------------------------------*/ | |
972 /* rader.c: */ | |
973 typedef struct rader_tls rader_tl; | |
974 | |
975 void X(rader_tl_insert)(INT k1, INT k2, INT k3, R *W, rader_tl **tl); | |
976 R *X(rader_tl_find)(INT k1, INT k2, INT k3, rader_tl *t); | |
977 void X(rader_tl_delete)(R *W, rader_tl **tl); | |
978 | |
979 /*-----------------------------------------------------------------------*/ | |
980 /* copy/transposition routines */ | |
981 | |
982 /* lower bound to the cache size, for tiled routines */ | |
983 #define CACHESIZE 8192 | |
984 | |
985 INT X(compute_tilesz)(INT vl, int how_many_tiles_in_cache); | |
986 | |
987 void X(tile2d)(INT n0l, INT n0u, INT n1l, INT n1u, INT tilesz, | |
988 void (*f)(INT n0l, INT n0u, INT n1l, INT n1u, void *args), | |
989 void *args); | |
990 void X(cpy1d)(R *I, R *O, INT n0, INT is0, INT os0, INT vl); | |
991 void X(zero1d_pair)(R *O0, R *O1, INT n0, INT os0); | |
992 void X(cpy2d)(R *I, R *O, | |
993 INT n0, INT is0, INT os0, | |
994 INT n1, INT is1, INT os1, | |
995 INT vl); | |
996 void X(cpy2d_ci)(R *I, R *O, | |
997 INT n0, INT is0, INT os0, | |
998 INT n1, INT is1, INT os1, | |
999 INT vl); | |
1000 void X(cpy2d_co)(R *I, R *O, | |
1001 INT n0, INT is0, INT os0, | |
1002 INT n1, INT is1, INT os1, | |
1003 INT vl); | |
1004 void X(cpy2d_tiled)(R *I, R *O, | |
1005 INT n0, INT is0, INT os0, | |
1006 INT n1, INT is1, INT os1, | |
1007 INT vl); | |
1008 void X(cpy2d_tiledbuf)(R *I, R *O, | |
1009 INT n0, INT is0, INT os0, | |
1010 INT n1, INT is1, INT os1, | |
1011 INT vl); | |
1012 void X(cpy2d_pair)(R *I0, R *I1, R *O0, R *O1, | |
1013 INT n0, INT is0, INT os0, | |
1014 INT n1, INT is1, INT os1); | |
1015 void X(cpy2d_pair_ci)(R *I0, R *I1, R *O0, R *O1, | |
1016 INT n0, INT is0, INT os0, | |
1017 INT n1, INT is1, INT os1); | |
1018 void X(cpy2d_pair_co)(R *I0, R *I1, R *O0, R *O1, | |
1019 INT n0, INT is0, INT os0, | |
1020 INT n1, INT is1, INT os1); | |
1021 | |
1022 void X(transpose)(R *I, INT n, INT s0, INT s1, INT vl); | |
1023 void X(transpose_tiled)(R *I, INT n, INT s0, INT s1, INT vl); | |
1024 void X(transpose_tiledbuf)(R *I, INT n, INT s0, INT s1, INT vl); | |
1025 | |
1026 typedef void (*transpose_func)(R *I, INT n, INT s0, INT s1, INT vl); | |
1027 typedef void (*cpy2d_func)(R *I, R *O, | |
1028 INT n0, INT is0, INT os0, | |
1029 INT n1, INT is1, INT os1, | |
1030 INT vl); | |
1031 | |
1032 /*-----------------------------------------------------------------------*/ | |
1033 /* misc stuff */ | |
1034 void X(null_awake)(plan *ego, enum wakefulness wakefulness); | |
1035 double X(iestimate_cost)(const planner *, const plan *, const problem *); | |
1036 | |
1037 #ifdef FFTW_RANDOM_ESTIMATOR | |
1038 extern unsigned X(random_estimate_seed); | |
1039 #endif | |
1040 | |
1041 double X(measure_execution_time)(const planner *plnr, | |
1042 plan *pln, const problem *p); | |
1043 IFFTW_EXTERN int X(ialignment_of)(R *p); | |
1044 unsigned X(hash)(const char *s); | |
1045 INT X(nbuf)(INT n, INT vl, INT maxnbuf); | |
1046 int X(nbuf_redundant)(INT n, INT vl, size_t which, | |
1047 const INT *maxnbuf, size_t nmaxnbuf); | |
1048 INT X(bufdist)(INT n, INT vl); | |
1049 int X(toobig)(INT n); | |
1050 int X(ct_uglyp)(INT min_n, INT v, INT n, INT r); | |
1051 | |
1052 #if HAVE_SIMD | |
1053 R *X(taint)(R *p, INT s); | |
1054 R *X(join_taint)(R *p1, R *p2); | |
1055 #define TAINT(p, s) X(taint)(p, s) | |
1056 #define UNTAINT(p) ((R *) (((uintptr_t) (p)) & ~(uintptr_t)3)) | |
1057 #define TAINTOF(p) (((uintptr_t)(p)) & 3) | |
1058 #define JOIN_TAINT(p1, p2) X(join_taint)(p1, p2) | |
1059 #else | |
1060 #define TAINT(p, s) (p) | |
1061 #define UNTAINT(p) (p) | |
1062 #define TAINTOF(p) 0 | |
1063 #define JOIN_TAINT(p1, p2) p1 | |
1064 #endif | |
1065 | |
1066 #ifdef FFTW_DEBUG_ALIGNMENT | |
1067 # define ASSERT_ALIGNED_DOUBLE { \ | |
1068 double __foo; \ | |
1069 CK(!(((uintptr_t) &__foo) & 0x7)); \ | |
1070 } | |
1071 #else | |
1072 # define ASSERT_ALIGNED_DOUBLE | |
1073 #endif /* FFTW_DEBUG_ALIGNMENT */ | |
1074 | |
1075 | |
1076 | |
1077 /*-----------------------------------------------------------------------*/ | |
1078 /* macros used in codelets to reduce source code size */ | |
1079 | |
1080 typedef R E; /* internal precision of codelets. */ | |
1081 | |
1082 #if defined(FFTW_LDOUBLE) | |
1083 # define K(x) ((E) x##L) | |
1084 #elif defined(FFTW_QUAD) | |
1085 # define K(x) ((E) x##Q) | |
1086 #else | |
1087 # define K(x) ((E) x) | |
1088 #endif | |
1089 #define DK(name, value) const E name = K(value) | |
1090 | |
1091 /* FMA macros */ | |
1092 | |
1093 #if defined(__GNUC__) && (defined(__powerpc__) || defined(__ppc__) || defined(_POWER)) | |
1094 /* The obvious expression a * b + c does not work. If both x = a * b | |
1095 + c and y = a * b - c appear in the source, gcc computes t = a * b, | |
1096 x = t + c, y = t - c, thus destroying the fma. | |
1097 | |
1098 This peculiar coding seems to do the right thing on all of | |
1099 gcc-2.95, gcc-3.1, gcc-3.2, and gcc-3.3. It does the right thing | |
1100 on gcc-3.4 -fno-web (because the ``web'' pass splits the variable | |
1101 `x' for the single-assignment form). | |
1102 | |
1103 However, gcc-4.0 is a formidable adversary which succeeds in | |
1104 pessimizing two fma's into one multiplication and two additions. | |
1105 It does it very early in the game---before the optimization passes | |
1106 even start. The only real workaround seems to use fake inline asm | |
1107 such as | |
1108 | |
1109 asm ("# confuse gcc %0" : "=f"(a) : "0"(a)); | |
1110 return a * b + c; | |
1111 | |
1112 in each of the FMA, FMS, FNMA, and FNMS functions. However, this | |
1113 does not solve the problem either, because two equal asm statements | |
1114 count as a common subexpression! One must use *different* fake asm | |
1115 statements: | |
1116 | |
1117 in FMA: | |
1118 asm ("# confuse gcc for fma %0" : "=f"(a) : "0"(a)); | |
1119 | |
1120 in FMS: | |
1121 asm ("# confuse gcc for fms %0" : "=f"(a) : "0"(a)); | |
1122 | |
1123 etc. | |
1124 | |
1125 After these changes, gcc recalcitrantly generates the fma that was | |
1126 in the source to begin with. However, the extra asm() cruft | |
1127 confuses other passes of gcc, notably the instruction scheduler. | |
1128 (Of course, one could also generate the fma directly via inline | |
1129 asm, but this confuses the scheduler even more.) | |
1130 | |
1131 Steven and I have submitted more than one bug report to the gcc | |
1132 mailing list over the past few years, to no effect. Thus, I give | |
1133 up. gcc-4.0 can go to hell. I'll wait at least until gcc-4.3 is | |
1134 out before touching this crap again. | |
1135 */ | |
1136 static __inline__ E FMA(E a, E b, E c) | |
1137 { | |
1138 E x = a * b; | |
1139 x = x + c; | |
1140 return x; | |
1141 } | |
1142 | |
1143 static __inline__ E FMS(E a, E b, E c) | |
1144 { | |
1145 E x = a * b; | |
1146 x = x - c; | |
1147 return x; | |
1148 } | |
1149 | |
1150 static __inline__ E FNMA(E a, E b, E c) | |
1151 { | |
1152 E x = a * b; | |
1153 x = - (x + c); | |
1154 return x; | |
1155 } | |
1156 | |
1157 static __inline__ E FNMS(E a, E b, E c) | |
1158 { | |
1159 E x = a * b; | |
1160 x = - (x - c); | |
1161 return x; | |
1162 } | |
1163 #else | |
1164 #define FMA(a, b, c) (((a) * (b)) + (c)) | |
1165 #define FMS(a, b, c) (((a) * (b)) - (c)) | |
1166 #define FNMA(a, b, c) (- (((a) * (b)) + (c))) | |
1167 #define FNMS(a, b, c) ((c) - ((a) * (b))) | |
1168 #endif | |
1169 | |
1170 #ifdef __cplusplus | |
1171 } /* extern "C" */ | |
1172 #endif /* __cplusplus */ | |
1173 | |
1174 #endif /* __IFFTW_H__ */ |