Mercurial > hg > sv-dependency-builds
comparison src/fftw-3.3.8/libbench2/mp.c @ 82:d0c2a83c1364
Add FFTW 3.3.8 source, and a Linux build
author | Chris Cannam |
---|---|
date | Tue, 19 Nov 2019 14:52:55 +0000 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
81:7029a4916348 | 82:d0c2a83c1364 |
---|---|
1 #include "config.h" | |
2 #include "libbench2/bench.h" | |
3 #include <math.h> | |
4 | |
5 #define DG unsigned short | |
6 #define ACC unsigned long | |
7 #define REAL bench_real | |
8 #define BITS_IN_REAL 53 /* mantissa */ | |
9 | |
10 #define SHFT 16 | |
11 #define RADIX 65536L | |
12 #define IRADIX (1.0 / RADIX) | |
13 #define LO(x) ((x) & (RADIX - 1)) | |
14 #define HI(x) ((x) >> SHFT) | |
15 #define HI_SIGNED(x) \ | |
16 ((((x) + (ACC)(RADIX >> 1) * RADIX) >> SHFT) - (RADIX >> 1)) | |
17 #define ZEROEXP (-32768) | |
18 | |
19 #define LEN 10 | |
20 | |
21 typedef struct { | |
22 short sign; | |
23 short expt; | |
24 DG d[LEN]; | |
25 } N[1]; | |
26 | |
27 #define EXA a->expt | |
28 #define EXB b->expt | |
29 #define EXC c->expt | |
30 | |
31 #define AD a->d | |
32 #define BD b->d | |
33 | |
34 #define SGNA a->sign | |
35 #define SGNB b->sign | |
36 | |
37 static const N zero = {{ 1, ZEROEXP, {0} }}; | |
38 | |
39 static void cpy(const N a, N b) | |
40 { | |
41 *b = *a; | |
42 } | |
43 | |
44 static void fromreal(REAL x, N a) | |
45 { | |
46 int i, e; | |
47 | |
48 cpy(zero, a); | |
49 if (x == 0.0) return; | |
50 | |
51 if (x >= 0) { SGNA = 1; } | |
52 else { SGNA = -1; x = -x; } | |
53 | |
54 e = 0; | |
55 while (x >= 1.0) { x *= IRADIX; ++e; } | |
56 while (x < IRADIX) { x *= RADIX; --e; } | |
57 EXA = e; | |
58 | |
59 for (i = LEN - 1; i >= 0 && x != 0.0; --i) { | |
60 REAL y; | |
61 | |
62 x *= RADIX; | |
63 y = (REAL) ((int) x); | |
64 AD[i] = (DG)y; | |
65 x -= y; | |
66 } | |
67 } | |
68 | |
69 static void fromshort(int x, N a) | |
70 { | |
71 cpy(zero, a); | |
72 | |
73 if (x < 0) { x = -x; SGNA = -1; } | |
74 else { SGNA = 1; } | |
75 EXA = 1; | |
76 AD[LEN - 1] = x; | |
77 } | |
78 | |
79 static void pack(DG *d, int e, int s, int l, N a) | |
80 { | |
81 int i, j; | |
82 | |
83 for (i = l - 1; i >= 0; --i, --e) | |
84 if (d[i] != 0) | |
85 break; | |
86 | |
87 if (i < 0) { | |
88 /* number is zero */ | |
89 cpy(zero, a); | |
90 } else { | |
91 EXA = e; | |
92 SGNA = s; | |
93 | |
94 if (i >= LEN - 1) { | |
95 for (j = LEN - 1; j >= 0; --i, --j) | |
96 AD[j] = d[i]; | |
97 } else { | |
98 for (j = LEN - 1; i >= 0; --i, --j) | |
99 AD[j] = d[i]; | |
100 for ( ; j >= 0; --j) | |
101 AD[j] = 0; | |
102 } | |
103 } | |
104 } | |
105 | |
106 | |
107 /* compare absolute values */ | |
108 static int abscmp(const N a, const N b) | |
109 { | |
110 int i; | |
111 if (EXA > EXB) return 1; | |
112 if (EXA < EXB) return -1; | |
113 for (i = LEN - 1; i >= 0; --i) { | |
114 if (AD[i] > BD[i]) | |
115 return 1; | |
116 if (AD[i] < BD[i]) | |
117 return -1; | |
118 } | |
119 return 0; | |
120 } | |
121 | |
122 static int eq(const N a, const N b) | |
123 { | |
124 return (SGNA == SGNB) && (abscmp(a, b) == 0); | |
125 } | |
126 | |
127 /* add magnitudes, for |a| >= |b| */ | |
128 static void addmag0(int s, const N a, const N b, N c) | |
129 { | |
130 int ia, ib; | |
131 ACC r = 0; | |
132 DG d[LEN + 1]; | |
133 | |
134 for (ia = 0, ib = EXA - EXB; ib < LEN; ++ia, ++ib) { | |
135 r += (ACC)AD[ia] + (ACC)BD[ib]; | |
136 d[ia] = LO(r); | |
137 r = HI(r); | |
138 } | |
139 for (; ia < LEN; ++ia) { | |
140 r += (ACC)AD[ia]; | |
141 d[ia] = LO(r); | |
142 r = HI(r); | |
143 } | |
144 d[ia] = LO(r); | |
145 pack(d, EXA + 1, s * SGNA, LEN + 1, c); | |
146 } | |
147 | |
148 static void addmag(int s, const N a, const N b, N c) | |
149 { | |
150 if (abscmp(a, b) > 0) addmag0(1, a, b, c); else addmag0(s, b, a, c); | |
151 } | |
152 | |
153 /* subtract magnitudes, for |a| >= |b| */ | |
154 static void submag0(int s, const N a, const N b, N c) | |
155 { | |
156 int ia, ib; | |
157 ACC r = 0; | |
158 DG d[LEN]; | |
159 | |
160 for (ia = 0, ib = EXA - EXB; ib < LEN; ++ia, ++ib) { | |
161 r += (ACC)AD[ia] - (ACC)BD[ib]; | |
162 d[ia] = LO(r); | |
163 r = HI_SIGNED(r); | |
164 } | |
165 for (; ia < LEN; ++ia) { | |
166 r += (ACC)AD[ia]; | |
167 d[ia] = LO(r); | |
168 r = HI_SIGNED(r); | |
169 } | |
170 | |
171 pack(d, EXA, s * SGNA, LEN, c); | |
172 } | |
173 | |
174 static void submag(int s, const N a, const N b, N c) | |
175 { | |
176 if (abscmp(a, b) > 0) submag0(1, a, b, c); else submag0(s, b, a, c); | |
177 } | |
178 | |
179 /* c = a + b */ | |
180 static void add(const N a, const N b, N c) | |
181 { | |
182 if (SGNA == SGNB) addmag(1, a, b, c); else submag(1, a, b, c); | |
183 } | |
184 | |
185 static void sub(const N a, const N b, N c) | |
186 { | |
187 if (SGNA == SGNB) submag(-1, a, b, c); else addmag(-1, a, b, c); | |
188 } | |
189 | |
190 static void mul(const N a, const N b, N c) | |
191 { | |
192 DG d[2 * LEN]; | |
193 int i, j, k; | |
194 ACC r; | |
195 | |
196 for (i = 0; i < LEN; ++i) | |
197 d[2 * i] = d[2 * i + 1] = 0; | |
198 | |
199 for (i = 0; i < LEN; ++i) { | |
200 ACC ai = AD[i]; | |
201 if (ai) { | |
202 r = 0; | |
203 for (j = 0, k = i; j < LEN; ++j, ++k) { | |
204 r += ai * (ACC)BD[j] + (ACC)d[k]; | |
205 d[k] = LO(r); | |
206 r = HI(r); | |
207 } | |
208 d[k] = LO(r); | |
209 } | |
210 } | |
211 | |
212 pack(d, EXA + EXB, SGNA * SGNB, 2 * LEN, c); | |
213 } | |
214 | |
215 static REAL toreal(const N a) | |
216 { | |
217 REAL h, l, f; | |
218 int i, bits; | |
219 ACC r; | |
220 DG sticky; | |
221 | |
222 if (EXA != ZEROEXP) { | |
223 f = IRADIX; | |
224 i = LEN; | |
225 | |
226 bits = 0; | |
227 h = (r = AD[--i]) * f; f *= IRADIX; | |
228 for (bits = 0; r > 0; ++bits) | |
229 r >>= 1; | |
230 | |
231 /* first digit */ | |
232 while (bits + SHFT <= BITS_IN_REAL) { | |
233 h += AD[--i] * f; f *= IRADIX; bits += SHFT; | |
234 } | |
235 | |
236 /* guard digit (leave one bit for sticky bit, hence `<' instead | |
237 of `<=') */ | |
238 bits = 0; l = 0.0; | |
239 while (bits + SHFT < BITS_IN_REAL) { | |
240 l += AD[--i] * f; f *= IRADIX; bits += SHFT; | |
241 } | |
242 | |
243 /* sticky bit */ | |
244 sticky = 0; | |
245 while (i > 0) | |
246 sticky |= AD[--i]; | |
247 | |
248 if (sticky) | |
249 l += (RADIX / 2) * f; | |
250 | |
251 h += l; | |
252 | |
253 for (i = 0; i < EXA; ++i) h *= (REAL)RADIX; | |
254 for (i = 0; i > EXA; --i) h *= IRADIX; | |
255 if (SGNA == -1) h = -h; | |
256 return h; | |
257 } else { | |
258 return 0.0; | |
259 } | |
260 } | |
261 | |
262 static void neg(N a) | |
263 { | |
264 SGNA = -SGNA; | |
265 } | |
266 | |
267 static void inv(const N a, N x) | |
268 { | |
269 N w, z, one, two; | |
270 | |
271 fromreal(1.0 / toreal(a), x); /* initial guess */ | |
272 fromshort(1, one); | |
273 fromshort(2, two); | |
274 | |
275 for (;;) { | |
276 /* Newton */ | |
277 mul(a, x, w); | |
278 sub(two, w, z); | |
279 if (eq(one, z)) break; | |
280 mul(x, z, x); | |
281 } | |
282 } | |
283 | |
284 | |
285 /* 2 pi */ | |
286 static const N n2pi = {{ | |
287 1, 1, | |
288 {18450, 59017, 1760, 5212, 9779, 4518, 2886, 54545, 18558, 6} | |
289 }}; | |
290 | |
291 /* 1 / 31! */ | |
292 static const N i31fac = {{ | |
293 1, -7, | |
294 {28087, 45433, 51357, 24545, 14291, 3954, 57879, 8109, 38716, 41382} | |
295 }}; | |
296 | |
297 | |
298 /* 1 / 32! */ | |
299 static const N i32fac = {{ | |
300 1, -7, | |
301 {52078, 60811, 3652, 39679, 37310, 47227, 28432, 57597, 13497, 1293} | |
302 }}; | |
303 | |
304 static void msin(const N a, N b) | |
305 { | |
306 N a2, g, k; | |
307 int i; | |
308 | |
309 cpy(i31fac, g); | |
310 cpy(g, b); | |
311 mul(a, a, a2); | |
312 | |
313 /* Taylor */ | |
314 for (i = 31; i > 1; i -= 2) { | |
315 fromshort(i * (i - 1), k); | |
316 mul(k, g, g); | |
317 mul(a2, b, k); | |
318 sub(g, k, b); | |
319 } | |
320 mul(a, b, b); | |
321 } | |
322 | |
323 static void mcos(const N a, N b) | |
324 { | |
325 N a2, g, k; | |
326 int i; | |
327 | |
328 cpy(i32fac, g); | |
329 cpy(g, b); | |
330 mul(a, a, a2); | |
331 | |
332 /* Taylor */ | |
333 for (i = 32; i > 0; i -= 2) { | |
334 fromshort(i * (i - 1), k); | |
335 mul(k, g, g); | |
336 mul(a2, b, k); | |
337 sub(g, k, b); | |
338 } | |
339 } | |
340 | |
341 static void by2pi(REAL m, REAL n, N a) | |
342 { | |
343 N b; | |
344 | |
345 fromreal(n, b); | |
346 inv(b, a); | |
347 fromreal(m, b); | |
348 mul(a, b, a); | |
349 mul(n2pi, a, a); | |
350 } | |
351 | |
352 static void sin2pi(REAL m, REAL n, N a); | |
353 static void cos2pi(REAL m, REAL n, N a) | |
354 { | |
355 N b; | |
356 if (m < 0) cos2pi(-m, n, a); | |
357 else if (m > n * 0.5) cos2pi(n - m, n, a); | |
358 else if (m > n * 0.25) {sin2pi(m - n * 0.25, n, a); neg(a);} | |
359 else if (m > n * 0.125) sin2pi(n * 0.25 - m, n, a); | |
360 else { by2pi(m, n, b); mcos(b, a); } | |
361 } | |
362 | |
363 static void sin2pi(REAL m, REAL n, N a) | |
364 { | |
365 N b; | |
366 if (m < 0) {sin2pi(-m, n, a); neg(a);} | |
367 else if (m > n * 0.5) {sin2pi(n - m, n, a); neg(a);} | |
368 else if (m > n * 0.25) {cos2pi(m - n * 0.25, n, a);} | |
369 else if (m > n * 0.125) {cos2pi(n * 0.25 - m, n, a);} | |
370 else {by2pi(m, n, b); msin(b, a);} | |
371 } | |
372 | |
373 /*----------------------------------------------------------------------*/ | |
374 /* FFT stuff */ | |
375 | |
376 /* (r0 + i i0)(r1 + i i1) */ | |
377 static void cmul(N r0, N i0, N r1, N i1, N r2, N i2) | |
378 { | |
379 N s, t, q; | |
380 mul(r0, r1, s); | |
381 mul(i0, i1, t); | |
382 sub(s, t, q); | |
383 mul(r0, i1, s); | |
384 mul(i0, r1, t); | |
385 add(s, t, i2); | |
386 cpy(q, r2); | |
387 } | |
388 | |
389 /* (r0 - i i0)(r1 + i i1) */ | |
390 static void cmulj(N r0, N i0, N r1, N i1, N r2, N i2) | |
391 { | |
392 N s, t, q; | |
393 mul(r0, r1, s); | |
394 mul(i0, i1, t); | |
395 add(s, t, q); | |
396 mul(r0, i1, s); | |
397 mul(i0, r1, t); | |
398 sub(s, t, i2); | |
399 cpy(q, r2); | |
400 } | |
401 | |
402 static void mcexp(int m, int n, N r, N i) | |
403 { | |
404 static int cached_n = -1; | |
405 static N w[64][2]; | |
406 int k, j; | |
407 if (n != cached_n) { | |
408 for (j = 1, k = 0; j < n; j += j, ++k) { | |
409 cos2pi(j, n, w[k][0]); | |
410 sin2pi(j, n, w[k][1]); | |
411 } | |
412 cached_n = n; | |
413 } | |
414 | |
415 fromshort(1, r); | |
416 fromshort(0, i); | |
417 if (m > 0) { | |
418 for (k = 0; m; ++k, m >>= 1) | |
419 if (m & 1) | |
420 cmul(w[k][0], w[k][1], r, i, r, i); | |
421 } else { | |
422 m = -m; | |
423 for (k = 0; m; ++k, m >>= 1) | |
424 if (m & 1) | |
425 cmulj(w[k][0], w[k][1], r, i, r, i); | |
426 } | |
427 } | |
428 | |
429 static void bitrev(int n, N *a) | |
430 { | |
431 int i, j, m; | |
432 for (i = j = 0; i < n - 1; ++i) { | |
433 if (i < j) { | |
434 N t; | |
435 cpy(a[2*i], t); cpy(a[2*j], a[2*i]); cpy(t, a[2*j]); | |
436 cpy(a[2*i+1], t); cpy(a[2*j+1], a[2*i+1]); cpy(t, a[2*j+1]); | |
437 } | |
438 | |
439 /* bit reversed counter */ | |
440 m = n; do { m >>= 1; j ^= m; } while (!(j & m)); | |
441 } | |
442 } | |
443 | |
444 static void fft0(int n, N *a, int sign) | |
445 { | |
446 int i, j, k; | |
447 | |
448 bitrev(n, a); | |
449 for (i = 1; i < n; i = 2 * i) { | |
450 for (j = 0; j < i; ++j) { | |
451 N wr, wi; | |
452 mcexp(sign * (int)j, 2 * i, wr, wi); | |
453 for (k = j; k < n; k += 2 * i) { | |
454 N *a0 = a + 2 * k; | |
455 N *a1 = a0 + 2 * i; | |
456 N r0, i0, r1, i1, t0, t1, xr, xi; | |
457 cpy(a0[0], r0); cpy(a0[1], i0); | |
458 cpy(a1[0], r1); cpy(a1[1], i1); | |
459 mul(r1, wr, t0); mul(i1, wi, t1); sub(t0, t1, xr); | |
460 mul(r1, wi, t0); mul(i1, wr, t1); add(t0, t1, xi); | |
461 add(r0, xr, a0[0]); add(i0, xi, a0[1]); | |
462 sub(r0, xr, a1[0]); sub(i0, xi, a1[1]); | |
463 } | |
464 } | |
465 } | |
466 } | |
467 | |
468 /* a[2*k]+i*a[2*k+1] = exp(2*pi*i*k^2/(2*n)) */ | |
469 static void bluestein_sequence(int n, N *a) | |
470 { | |
471 int k, ksq, n2 = 2 * n; | |
472 | |
473 ksq = 1; /* (-1)^2 */ | |
474 for (k = 0; k < n; ++k) { | |
475 /* careful with overflow */ | |
476 ksq = ksq + 2*k - 1; while (ksq > n2) ksq -= n2; | |
477 mcexp(ksq, n2, a[2*k], a[2*k+1]); | |
478 } | |
479 } | |
480 | |
481 static int pow2_atleast(int x) | |
482 { | |
483 int h; | |
484 for (h = 1; h < x; h = 2 * h) | |
485 ; | |
486 return h; | |
487 } | |
488 | |
489 static N *cached_bluestein_w = 0; | |
490 static N *cached_bluestein_y = 0; | |
491 static int cached_bluestein_n = -1; | |
492 | |
493 static void bluestein(int n, N *a) | |
494 { | |
495 int nb = pow2_atleast(2 * n); | |
496 N *b = (N *)bench_malloc(2 * nb * sizeof(N)); | |
497 N *w = cached_bluestein_w; | |
498 N *y = cached_bluestein_y; | |
499 N nbinv; | |
500 int i; | |
501 | |
502 fromreal(1.0 / nb, nbinv); /* exact because nb = 2^k */ | |
503 | |
504 if (cached_bluestein_n != n) { | |
505 if (w) bench_free(w); | |
506 if (y) bench_free(y); | |
507 w = (N *)bench_malloc(2 * n * sizeof(N)); | |
508 y = (N *)bench_malloc(2 * nb * sizeof(N)); | |
509 cached_bluestein_n = n; | |
510 cached_bluestein_w = w; | |
511 cached_bluestein_y = y; | |
512 | |
513 bluestein_sequence(n, w); | |
514 for (i = 0; i < 2*nb; ++i) cpy(zero, y[i]); | |
515 | |
516 for (i = 0; i < n; ++i) { | |
517 cpy(w[2*i], y[2*i]); | |
518 cpy(w[2*i+1], y[2*i+1]); | |
519 } | |
520 for (i = 1; i < n; ++i) { | |
521 cpy(w[2*i], y[2*(nb-i)]); | |
522 cpy(w[2*i+1], y[2*(nb-i)+1]); | |
523 } | |
524 | |
525 fft0(nb, y, -1); | |
526 } | |
527 | |
528 for (i = 0; i < 2*nb; ++i) cpy(zero, b[i]); | |
529 | |
530 for (i = 0; i < n; ++i) | |
531 cmulj(w[2*i], w[2*i+1], a[2*i], a[2*i+1], b[2*i], b[2*i+1]); | |
532 | |
533 /* scaled convolution b * y */ | |
534 fft0(nb, b, -1); | |
535 | |
536 for (i = 0; i < nb; ++i) | |
537 cmul(b[2*i], b[2*i+1], y[2*i], y[2*i+1], b[2*i], b[2*i+1]); | |
538 fft0(nb, b, 1); | |
539 | |
540 for (i = 0; i < n; ++i) { | |
541 cmulj(w[2*i], w[2*i+1], b[2*i], b[2*i+1], a[2*i], a[2*i+1]); | |
542 mul(nbinv, a[2*i], a[2*i]); | |
543 mul(nbinv, a[2*i+1], a[2*i+1]); | |
544 } | |
545 | |
546 bench_free(b); | |
547 } | |
548 | |
549 static void swapri(int n, N *a) | |
550 { | |
551 int i; | |
552 for (i = 0; i < n; ++i) { | |
553 N t; | |
554 cpy(a[2 * i], t); | |
555 cpy(a[2 * i + 1], a[2 * i]); | |
556 cpy(t, a[2 * i + 1]); | |
557 } | |
558 } | |
559 | |
560 static void fft1(int n, N *a, int sign) | |
561 { | |
562 if (power_of_two(n)) { | |
563 fft0(n, a, sign); | |
564 } else { | |
565 if (sign == 1) swapri(n, a); | |
566 bluestein(n, a); | |
567 if (sign == 1) swapri(n, a); | |
568 } | |
569 } | |
570 | |
571 static void fromrealv(int n, bench_complex *a, N *b) | |
572 { | |
573 int i; | |
574 | |
575 for (i = 0; i < n; ++i) { | |
576 fromreal(c_re(a[i]), b[2 * i]); | |
577 fromreal(c_im(a[i]), b[2 * i + 1]); | |
578 } | |
579 } | |
580 | |
581 static void compare(int n, N *a, N *b, double *err) | |
582 { | |
583 int i; | |
584 double e1, e2, einf; | |
585 double n1, n2, ninf; | |
586 | |
587 e1 = e2 = einf = 0.0; | |
588 n1 = n2 = ninf = 0.0; | |
589 | |
590 # define DO(x1, x2, xinf, var) { \ | |
591 double d = var; \ | |
592 if (d < 0) d = -d; \ | |
593 x1 += d; x2 += d * d; if (d > xinf) xinf = d; \ | |
594 } | |
595 | |
596 for (i = 0; i < 2 * n; ++i) { | |
597 N dd; | |
598 sub(a[i], b[i], dd); | |
599 DO(n1, n2, ninf, toreal(a[i])); | |
600 DO(e1, e2, einf, toreal(dd)); | |
601 } | |
602 | |
603 # undef DO | |
604 err[0] = e1 / n1; | |
605 err[1] = sqrt(e2 / n2); | |
606 err[2] = einf / ninf; | |
607 } | |
608 | |
609 void fftaccuracy(int n, bench_complex *a, bench_complex *ffta, | |
610 int sign, double err[6]) | |
611 { | |
612 N *b = (N *)bench_malloc(2 * n * sizeof(N)); | |
613 N *fftb = (N *)bench_malloc(2 * n * sizeof(N)); | |
614 N mn, ninv; | |
615 int i; | |
616 | |
617 fromreal(n, mn); inv(mn, ninv); | |
618 | |
619 /* forward error */ | |
620 fromrealv(n, a, b); fromrealv(n, ffta, fftb); | |
621 fft1(n, b, sign); | |
622 compare(n, b, fftb, err); | |
623 | |
624 /* backward error */ | |
625 fromrealv(n, a, b); fromrealv(n, ffta, fftb); | |
626 for (i = 0; i < 2 * n; ++i) mul(fftb[i], ninv, fftb[i]); | |
627 fft1(n, fftb, -sign); | |
628 compare(n, b, fftb, err + 3); | |
629 | |
630 bench_free(fftb); | |
631 bench_free(b); | |
632 } | |
633 | |
634 void fftaccuracy_done(void) | |
635 { | |
636 if (cached_bluestein_w) bench_free(cached_bluestein_w); | |
637 if (cached_bluestein_y) bench_free(cached_bluestein_y); | |
638 cached_bluestein_w = 0; | |
639 cached_bluestein_y = 0; | |
640 cached_bluestein_n = -1; | |
641 } |