cannam@127
|
1 #include "config.h"
|
cannam@127
|
2 #include "bench.h"
|
cannam@127
|
3 #include <math.h>
|
cannam@127
|
4
|
cannam@127
|
5 #define DG unsigned short
|
cannam@127
|
6 #define ACC unsigned long
|
cannam@127
|
7 #define REAL bench_real
|
cannam@127
|
8 #define BITS_IN_REAL 53 /* mantissa */
|
cannam@127
|
9
|
cannam@127
|
10 #define SHFT 16
|
cannam@127
|
11 #define RADIX 65536L
|
cannam@127
|
12 #define IRADIX (1.0 / RADIX)
|
cannam@127
|
13 #define LO(x) ((x) & (RADIX - 1))
|
cannam@127
|
14 #define HI(x) ((x) >> SHFT)
|
cannam@127
|
15 #define HI_SIGNED(x) \
|
cannam@127
|
16 ((((x) + (ACC)(RADIX >> 1) * RADIX) >> SHFT) - (RADIX >> 1))
|
cannam@127
|
17 #define ZEROEXP (-32768)
|
cannam@127
|
18
|
cannam@127
|
19 #define LEN 10
|
cannam@127
|
20
|
cannam@127
|
21 typedef struct {
|
cannam@127
|
22 short sign;
|
cannam@127
|
23 short expt;
|
cannam@127
|
24 DG d[LEN];
|
cannam@127
|
25 } N[1];
|
cannam@127
|
26
|
cannam@127
|
27 #define EXA a->expt
|
cannam@127
|
28 #define EXB b->expt
|
cannam@127
|
29 #define EXC c->expt
|
cannam@127
|
30
|
cannam@127
|
31 #define AD a->d
|
cannam@127
|
32 #define BD b->d
|
cannam@127
|
33
|
cannam@127
|
34 #define SGNA a->sign
|
cannam@127
|
35 #define SGNB b->sign
|
cannam@127
|
36
|
cannam@127
|
37 static const N zero = {{ 1, ZEROEXP, {0} }};
|
cannam@127
|
38
|
cannam@127
|
39 static void cpy(const N a, N b)
|
cannam@127
|
40 {
|
cannam@127
|
41 *b = *a;
|
cannam@127
|
42 }
|
cannam@127
|
43
|
cannam@127
|
44 static void fromreal(REAL x, N a)
|
cannam@127
|
45 {
|
cannam@127
|
46 int i, e;
|
cannam@127
|
47
|
cannam@127
|
48 cpy(zero, a);
|
cannam@127
|
49 if (x == 0.0) return;
|
cannam@127
|
50
|
cannam@127
|
51 if (x >= 0) { SGNA = 1; }
|
cannam@127
|
52 else { SGNA = -1; x = -x; }
|
cannam@127
|
53
|
cannam@127
|
54 e = 0;
|
cannam@127
|
55 while (x >= 1.0) { x *= IRADIX; ++e; }
|
cannam@127
|
56 while (x < IRADIX) { x *= RADIX; --e; }
|
cannam@127
|
57 EXA = e;
|
cannam@127
|
58
|
cannam@127
|
59 for (i = LEN - 1; i >= 0 && x != 0.0; --i) {
|
cannam@127
|
60 REAL y;
|
cannam@127
|
61
|
cannam@127
|
62 x *= RADIX;
|
cannam@127
|
63 y = (REAL) ((int) x);
|
cannam@127
|
64 AD[i] = (DG)y;
|
cannam@127
|
65 x -= y;
|
cannam@127
|
66 }
|
cannam@127
|
67 }
|
cannam@127
|
68
|
cannam@127
|
69 static void fromshort(int x, N a)
|
cannam@127
|
70 {
|
cannam@127
|
71 cpy(zero, a);
|
cannam@127
|
72
|
cannam@127
|
73 if (x < 0) { x = -x; SGNA = -1; }
|
cannam@127
|
74 else { SGNA = 1; }
|
cannam@127
|
75 EXA = 1;
|
cannam@127
|
76 AD[LEN - 1] = x;
|
cannam@127
|
77 }
|
cannam@127
|
78
|
cannam@127
|
79 static void pack(DG *d, int e, int s, int l, N a)
|
cannam@127
|
80 {
|
cannam@127
|
81 int i, j;
|
cannam@127
|
82
|
cannam@127
|
83 for (i = l - 1; i >= 0; --i, --e)
|
cannam@127
|
84 if (d[i] != 0)
|
cannam@127
|
85 break;
|
cannam@127
|
86
|
cannam@127
|
87 if (i < 0) {
|
cannam@127
|
88 /* number is zero */
|
cannam@127
|
89 cpy(zero, a);
|
cannam@127
|
90 } else {
|
cannam@127
|
91 EXA = e;
|
cannam@127
|
92 SGNA = s;
|
cannam@127
|
93
|
cannam@127
|
94 if (i >= LEN - 1) {
|
cannam@127
|
95 for (j = LEN - 1; j >= 0; --i, --j)
|
cannam@127
|
96 AD[j] = d[i];
|
cannam@127
|
97 } else {
|
cannam@127
|
98 for (j = LEN - 1; i >= 0; --i, --j)
|
cannam@127
|
99 AD[j] = d[i];
|
cannam@127
|
100 for ( ; j >= 0; --j)
|
cannam@127
|
101 AD[j] = 0;
|
cannam@127
|
102 }
|
cannam@127
|
103 }
|
cannam@127
|
104 }
|
cannam@127
|
105
|
cannam@127
|
106
|
cannam@127
|
107 /* compare absolute values */
|
cannam@127
|
108 static int abscmp(const N a, const N b)
|
cannam@127
|
109 {
|
cannam@127
|
110 int i;
|
cannam@127
|
111 if (EXA > EXB) return 1;
|
cannam@127
|
112 if (EXA < EXB) return -1;
|
cannam@127
|
113 for (i = LEN - 1; i >= 0; --i) {
|
cannam@127
|
114 if (AD[i] > BD[i])
|
cannam@127
|
115 return 1;
|
cannam@127
|
116 if (AD[i] < BD[i])
|
cannam@127
|
117 return -1;
|
cannam@127
|
118 }
|
cannam@127
|
119 return 0;
|
cannam@127
|
120 }
|
cannam@127
|
121
|
cannam@127
|
122 static int eq(const N a, const N b)
|
cannam@127
|
123 {
|
cannam@127
|
124 return (SGNA == SGNB) && (abscmp(a, b) == 0);
|
cannam@127
|
125 }
|
cannam@127
|
126
|
cannam@127
|
127 /* add magnitudes, for |a| >= |b| */
|
cannam@127
|
128 static void addmag0(int s, const N a, const N b, N c)
|
cannam@127
|
129 {
|
cannam@127
|
130 int ia, ib;
|
cannam@127
|
131 ACC r = 0;
|
cannam@127
|
132 DG d[LEN + 1];
|
cannam@127
|
133
|
cannam@127
|
134 for (ia = 0, ib = EXA - EXB; ib < LEN; ++ia, ++ib) {
|
cannam@127
|
135 r += (ACC)AD[ia] + (ACC)BD[ib];
|
cannam@127
|
136 d[ia] = LO(r);
|
cannam@127
|
137 r = HI(r);
|
cannam@127
|
138 }
|
cannam@127
|
139 for (; ia < LEN; ++ia) {
|
cannam@127
|
140 r += (ACC)AD[ia];
|
cannam@127
|
141 d[ia] = LO(r);
|
cannam@127
|
142 r = HI(r);
|
cannam@127
|
143 }
|
cannam@127
|
144 d[ia] = LO(r);
|
cannam@127
|
145 pack(d, EXA + 1, s * SGNA, LEN + 1, c);
|
cannam@127
|
146 }
|
cannam@127
|
147
|
cannam@127
|
148 static void addmag(int s, const N a, const N b, N c)
|
cannam@127
|
149 {
|
cannam@127
|
150 if (abscmp(a, b) > 0) addmag0(1, a, b, c); else addmag0(s, b, a, c);
|
cannam@127
|
151 }
|
cannam@127
|
152
|
cannam@127
|
153 /* subtract magnitudes, for |a| >= |b| */
|
cannam@127
|
154 static void submag0(int s, const N a, const N b, N c)
|
cannam@127
|
155 {
|
cannam@127
|
156 int ia, ib;
|
cannam@127
|
157 ACC r = 0;
|
cannam@127
|
158 DG d[LEN];
|
cannam@127
|
159
|
cannam@127
|
160 for (ia = 0, ib = EXA - EXB; ib < LEN; ++ia, ++ib) {
|
cannam@127
|
161 r += (ACC)AD[ia] - (ACC)BD[ib];
|
cannam@127
|
162 d[ia] = LO(r);
|
cannam@127
|
163 r = HI_SIGNED(r);
|
cannam@127
|
164 }
|
cannam@127
|
165 for (; ia < LEN; ++ia) {
|
cannam@127
|
166 r += (ACC)AD[ia];
|
cannam@127
|
167 d[ia] = LO(r);
|
cannam@127
|
168 r = HI_SIGNED(r);
|
cannam@127
|
169 }
|
cannam@127
|
170
|
cannam@127
|
171 pack(d, EXA, s * SGNA, LEN, c);
|
cannam@127
|
172 }
|
cannam@127
|
173
|
cannam@127
|
174 static void submag(int s, const N a, const N b, N c)
|
cannam@127
|
175 {
|
cannam@127
|
176 if (abscmp(a, b) > 0) submag0(1, a, b, c); else submag0(s, b, a, c);
|
cannam@127
|
177 }
|
cannam@127
|
178
|
cannam@127
|
179 /* c = a + b */
|
cannam@127
|
180 static void add(const N a, const N b, N c)
|
cannam@127
|
181 {
|
cannam@127
|
182 if (SGNA == SGNB) addmag(1, a, b, c); else submag(1, a, b, c);
|
cannam@127
|
183 }
|
cannam@127
|
184
|
cannam@127
|
185 static void sub(const N a, const N b, N c)
|
cannam@127
|
186 {
|
cannam@127
|
187 if (SGNA == SGNB) submag(-1, a, b, c); else addmag(-1, a, b, c);
|
cannam@127
|
188 }
|
cannam@127
|
189
|
cannam@127
|
190 static void mul(const N a, const N b, N c)
|
cannam@127
|
191 {
|
cannam@127
|
192 DG d[2 * LEN];
|
cannam@127
|
193 int i, j, k;
|
cannam@127
|
194 ACC r;
|
cannam@127
|
195
|
cannam@127
|
196 for (i = 0; i < LEN; ++i)
|
cannam@127
|
197 d[2 * i] = d[2 * i + 1] = 0;
|
cannam@127
|
198
|
cannam@127
|
199 for (i = 0; i < LEN; ++i) {
|
cannam@127
|
200 ACC ai = AD[i];
|
cannam@127
|
201 if (ai) {
|
cannam@127
|
202 r = 0;
|
cannam@127
|
203 for (j = 0, k = i; j < LEN; ++j, ++k) {
|
cannam@127
|
204 r += ai * (ACC)BD[j] + (ACC)d[k];
|
cannam@127
|
205 d[k] = LO(r);
|
cannam@127
|
206 r = HI(r);
|
cannam@127
|
207 }
|
cannam@127
|
208 d[k] = LO(r);
|
cannam@127
|
209 }
|
cannam@127
|
210 }
|
cannam@127
|
211
|
cannam@127
|
212 pack(d, EXA + EXB, SGNA * SGNB, 2 * LEN, c);
|
cannam@127
|
213 }
|
cannam@127
|
214
|
cannam@127
|
215 static REAL toreal(const N a)
|
cannam@127
|
216 {
|
cannam@127
|
217 REAL h, l, f;
|
cannam@127
|
218 int i, bits;
|
cannam@127
|
219 ACC r;
|
cannam@127
|
220 DG sticky;
|
cannam@127
|
221
|
cannam@127
|
222 if (EXA != ZEROEXP) {
|
cannam@127
|
223 f = IRADIX;
|
cannam@127
|
224 i = LEN;
|
cannam@127
|
225
|
cannam@127
|
226 bits = 0;
|
cannam@127
|
227 h = (r = AD[--i]) * f; f *= IRADIX;
|
cannam@127
|
228 for (bits = 0; r > 0; ++bits)
|
cannam@127
|
229 r >>= 1;
|
cannam@127
|
230
|
cannam@127
|
231 /* first digit */
|
cannam@127
|
232 while (bits + SHFT <= BITS_IN_REAL) {
|
cannam@127
|
233 h += AD[--i] * f; f *= IRADIX; bits += SHFT;
|
cannam@127
|
234 }
|
cannam@127
|
235
|
cannam@127
|
236 /* guard digit (leave one bit for sticky bit, hence `<' instead
|
cannam@127
|
237 of `<=') */
|
cannam@127
|
238 bits = 0; l = 0.0;
|
cannam@127
|
239 while (bits + SHFT < BITS_IN_REAL) {
|
cannam@127
|
240 l += AD[--i] * f; f *= IRADIX; bits += SHFT;
|
cannam@127
|
241 }
|
cannam@127
|
242
|
cannam@127
|
243 /* sticky bit */
|
cannam@127
|
244 sticky = 0;
|
cannam@127
|
245 while (i > 0)
|
cannam@127
|
246 sticky |= AD[--i];
|
cannam@127
|
247
|
cannam@127
|
248 if (sticky)
|
cannam@127
|
249 l += (RADIX / 2) * f;
|
cannam@127
|
250
|
cannam@127
|
251 h += l;
|
cannam@127
|
252
|
cannam@127
|
253 for (i = 0; i < EXA; ++i) h *= (REAL)RADIX;
|
cannam@127
|
254 for (i = 0; i > EXA; --i) h *= IRADIX;
|
cannam@127
|
255 if (SGNA == -1) h = -h;
|
cannam@127
|
256 return h;
|
cannam@127
|
257 } else {
|
cannam@127
|
258 return 0.0;
|
cannam@127
|
259 }
|
cannam@127
|
260 }
|
cannam@127
|
261
|
cannam@127
|
262 static void neg(N a)
|
cannam@127
|
263 {
|
cannam@127
|
264 SGNA = -SGNA;
|
cannam@127
|
265 }
|
cannam@127
|
266
|
cannam@127
|
267 static void inv(const N a, N x)
|
cannam@127
|
268 {
|
cannam@127
|
269 N w, z, one, two;
|
cannam@127
|
270
|
cannam@127
|
271 fromreal(1.0 / toreal(a), x); /* initial guess */
|
cannam@127
|
272 fromshort(1, one);
|
cannam@127
|
273 fromshort(2, two);
|
cannam@127
|
274
|
cannam@127
|
275 for (;;) {
|
cannam@127
|
276 /* Newton */
|
cannam@127
|
277 mul(a, x, w);
|
cannam@127
|
278 sub(two, w, z);
|
cannam@127
|
279 if (eq(one, z)) break;
|
cannam@127
|
280 mul(x, z, x);
|
cannam@127
|
281 }
|
cannam@127
|
282 }
|
cannam@127
|
283
|
cannam@127
|
284
|
cannam@127
|
285 /* 2 pi */
|
cannam@127
|
286 static const N n2pi = {{
|
cannam@127
|
287 1, 1,
|
cannam@127
|
288 {18450, 59017, 1760, 5212, 9779, 4518, 2886, 54545, 18558, 6}
|
cannam@127
|
289 }};
|
cannam@127
|
290
|
cannam@127
|
291 /* 1 / 31! */
|
cannam@127
|
292 static const N i31fac = {{
|
cannam@127
|
293 1, -7,
|
cannam@127
|
294 {28087, 45433, 51357, 24545, 14291, 3954, 57879, 8109, 38716, 41382}
|
cannam@127
|
295 }};
|
cannam@127
|
296
|
cannam@127
|
297
|
cannam@127
|
298 /* 1 / 32! */
|
cannam@127
|
299 static const N i32fac = {{
|
cannam@127
|
300 1, -7,
|
cannam@127
|
301 {52078, 60811, 3652, 39679, 37310, 47227, 28432, 57597, 13497, 1293}
|
cannam@127
|
302 }};
|
cannam@127
|
303
|
cannam@127
|
304 static void msin(const N a, N b)
|
cannam@127
|
305 {
|
cannam@127
|
306 N a2, g, k;
|
cannam@127
|
307 int i;
|
cannam@127
|
308
|
cannam@127
|
309 cpy(i31fac, g);
|
cannam@127
|
310 cpy(g, b);
|
cannam@127
|
311 mul(a, a, a2);
|
cannam@127
|
312
|
cannam@127
|
313 /* Taylor */
|
cannam@127
|
314 for (i = 31; i > 1; i -= 2) {
|
cannam@127
|
315 fromshort(i * (i - 1), k);
|
cannam@127
|
316 mul(k, g, g);
|
cannam@127
|
317 mul(a2, b, k);
|
cannam@127
|
318 sub(g, k, b);
|
cannam@127
|
319 }
|
cannam@127
|
320 mul(a, b, b);
|
cannam@127
|
321 }
|
cannam@127
|
322
|
cannam@127
|
323 static void mcos(const N a, N b)
|
cannam@127
|
324 {
|
cannam@127
|
325 N a2, g, k;
|
cannam@127
|
326 int i;
|
cannam@127
|
327
|
cannam@127
|
328 cpy(i32fac, g);
|
cannam@127
|
329 cpy(g, b);
|
cannam@127
|
330 mul(a, a, a2);
|
cannam@127
|
331
|
cannam@127
|
332 /* Taylor */
|
cannam@127
|
333 for (i = 32; i > 0; i -= 2) {
|
cannam@127
|
334 fromshort(i * (i - 1), k);
|
cannam@127
|
335 mul(k, g, g);
|
cannam@127
|
336 mul(a2, b, k);
|
cannam@127
|
337 sub(g, k, b);
|
cannam@127
|
338 }
|
cannam@127
|
339 }
|
cannam@127
|
340
|
cannam@127
|
341 static void by2pi(REAL m, REAL n, N a)
|
cannam@127
|
342 {
|
cannam@127
|
343 N b;
|
cannam@127
|
344
|
cannam@127
|
345 fromreal(n, b);
|
cannam@127
|
346 inv(b, a);
|
cannam@127
|
347 fromreal(m, b);
|
cannam@127
|
348 mul(a, b, a);
|
cannam@127
|
349 mul(n2pi, a, a);
|
cannam@127
|
350 }
|
cannam@127
|
351
|
cannam@127
|
352 static void sin2pi(REAL m, REAL n, N a);
|
cannam@127
|
353 static void cos2pi(REAL m, REAL n, N a)
|
cannam@127
|
354 {
|
cannam@127
|
355 N b;
|
cannam@127
|
356 if (m < 0) cos2pi(-m, n, a);
|
cannam@127
|
357 else if (m > n * 0.5) cos2pi(n - m, n, a);
|
cannam@127
|
358 else if (m > n * 0.25) {sin2pi(m - n * 0.25, n, a); neg(a);}
|
cannam@127
|
359 else if (m > n * 0.125) sin2pi(n * 0.25 - m, n, a);
|
cannam@127
|
360 else { by2pi(m, n, b); mcos(b, a); }
|
cannam@127
|
361 }
|
cannam@127
|
362
|
cannam@127
|
363 static void sin2pi(REAL m, REAL n, N a)
|
cannam@127
|
364 {
|
cannam@127
|
365 N b;
|
cannam@127
|
366 if (m < 0) {sin2pi(-m, n, a); neg(a);}
|
cannam@127
|
367 else if (m > n * 0.5) {sin2pi(n - m, n, a); neg(a);}
|
cannam@127
|
368 else if (m > n * 0.25) {cos2pi(m - n * 0.25, n, a);}
|
cannam@127
|
369 else if (m > n * 0.125) {cos2pi(n * 0.25 - m, n, a);}
|
cannam@127
|
370 else {by2pi(m, n, b); msin(b, a);}
|
cannam@127
|
371 }
|
cannam@127
|
372
|
cannam@127
|
373 /*----------------------------------------------------------------------*/
|
cannam@127
|
374 /* FFT stuff */
|
cannam@127
|
375
|
cannam@127
|
376 /* (r0 + i i0)(r1 + i i1) */
|
cannam@127
|
377 static void cmul(N r0, N i0, N r1, N i1, N r2, N i2)
|
cannam@127
|
378 {
|
cannam@127
|
379 N s, t, q;
|
cannam@127
|
380 mul(r0, r1, s);
|
cannam@127
|
381 mul(i0, i1, t);
|
cannam@127
|
382 sub(s, t, q);
|
cannam@127
|
383 mul(r0, i1, s);
|
cannam@127
|
384 mul(i0, r1, t);
|
cannam@127
|
385 add(s, t, i2);
|
cannam@127
|
386 cpy(q, r2);
|
cannam@127
|
387 }
|
cannam@127
|
388
|
cannam@127
|
389 /* (r0 - i i0)(r1 + i i1) */
|
cannam@127
|
390 static void cmulj(N r0, N i0, N r1, N i1, N r2, N i2)
|
cannam@127
|
391 {
|
cannam@127
|
392 N s, t, q;
|
cannam@127
|
393 mul(r0, r1, s);
|
cannam@127
|
394 mul(i0, i1, t);
|
cannam@127
|
395 add(s, t, q);
|
cannam@127
|
396 mul(r0, i1, s);
|
cannam@127
|
397 mul(i0, r1, t);
|
cannam@127
|
398 sub(s, t, i2);
|
cannam@127
|
399 cpy(q, r2);
|
cannam@127
|
400 }
|
cannam@127
|
401
|
cannam@127
|
402 static void mcexp(int m, int n, N r, N i)
|
cannam@127
|
403 {
|
cannam@127
|
404 static int cached_n = -1;
|
cannam@127
|
405 static N w[64][2];
|
cannam@127
|
406 int k, j;
|
cannam@127
|
407 if (n != cached_n) {
|
cannam@127
|
408 for (j = 1, k = 0; j < n; j += j, ++k) {
|
cannam@127
|
409 cos2pi(j, n, w[k][0]);
|
cannam@127
|
410 sin2pi(j, n, w[k][1]);
|
cannam@127
|
411 }
|
cannam@127
|
412 cached_n = n;
|
cannam@127
|
413 }
|
cannam@127
|
414
|
cannam@127
|
415 fromshort(1, r);
|
cannam@127
|
416 fromshort(0, i);
|
cannam@127
|
417 if (m > 0) {
|
cannam@127
|
418 for (k = 0; m; ++k, m >>= 1)
|
cannam@127
|
419 if (m & 1)
|
cannam@127
|
420 cmul(w[k][0], w[k][1], r, i, r, i);
|
cannam@127
|
421 } else {
|
cannam@127
|
422 m = -m;
|
cannam@127
|
423 for (k = 0; m; ++k, m >>= 1)
|
cannam@127
|
424 if (m & 1)
|
cannam@127
|
425 cmulj(w[k][0], w[k][1], r, i, r, i);
|
cannam@127
|
426 }
|
cannam@127
|
427 }
|
cannam@127
|
428
|
cannam@127
|
429 static void bitrev(int n, N *a)
|
cannam@127
|
430 {
|
cannam@127
|
431 int i, j, m;
|
cannam@127
|
432 for (i = j = 0; i < n - 1; ++i) {
|
cannam@127
|
433 if (i < j) {
|
cannam@127
|
434 N t;
|
cannam@127
|
435 cpy(a[2*i], t); cpy(a[2*j], a[2*i]); cpy(t, a[2*j]);
|
cannam@127
|
436 cpy(a[2*i+1], t); cpy(a[2*j+1], a[2*i+1]); cpy(t, a[2*j+1]);
|
cannam@127
|
437 }
|
cannam@127
|
438
|
cannam@127
|
439 /* bit reversed counter */
|
cannam@127
|
440 m = n; do { m >>= 1; j ^= m; } while (!(j & m));
|
cannam@127
|
441 }
|
cannam@127
|
442 }
|
cannam@127
|
443
|
cannam@127
|
444 static void fft0(int n, N *a, int sign)
|
cannam@127
|
445 {
|
cannam@127
|
446 int i, j, k;
|
cannam@127
|
447
|
cannam@127
|
448 bitrev(n, a);
|
cannam@127
|
449 for (i = 1; i < n; i = 2 * i) {
|
cannam@127
|
450 for (j = 0; j < i; ++j) {
|
cannam@127
|
451 N wr, wi;
|
cannam@127
|
452 mcexp(sign * (int)j, 2 * i, wr, wi);
|
cannam@127
|
453 for (k = j; k < n; k += 2 * i) {
|
cannam@127
|
454 N *a0 = a + 2 * k;
|
cannam@127
|
455 N *a1 = a0 + 2 * i;
|
cannam@127
|
456 N r0, i0, r1, i1, t0, t1, xr, xi;
|
cannam@127
|
457 cpy(a0[0], r0); cpy(a0[1], i0);
|
cannam@127
|
458 cpy(a1[0], r1); cpy(a1[1], i1);
|
cannam@127
|
459 mul(r1, wr, t0); mul(i1, wi, t1); sub(t0, t1, xr);
|
cannam@127
|
460 mul(r1, wi, t0); mul(i1, wr, t1); add(t0, t1, xi);
|
cannam@127
|
461 add(r0, xr, a0[0]); add(i0, xi, a0[1]);
|
cannam@127
|
462 sub(r0, xr, a1[0]); sub(i0, xi, a1[1]);
|
cannam@127
|
463 }
|
cannam@127
|
464 }
|
cannam@127
|
465 }
|
cannam@127
|
466 }
|
cannam@127
|
467
|
cannam@127
|
468 /* a[2*k]+i*a[2*k+1] = exp(2*pi*i*k^2/(2*n)) */
|
cannam@127
|
469 static void bluestein_sequence(int n, N *a)
|
cannam@127
|
470 {
|
cannam@127
|
471 int k, ksq, n2 = 2 * n;
|
cannam@127
|
472
|
cannam@127
|
473 ksq = 1; /* (-1)^2 */
|
cannam@127
|
474 for (k = 0; k < n; ++k) {
|
cannam@127
|
475 /* careful with overflow */
|
cannam@127
|
476 ksq = ksq + 2*k - 1; while (ksq > n2) ksq -= n2;
|
cannam@127
|
477 mcexp(ksq, n2, a[2*k], a[2*k+1]);
|
cannam@127
|
478 }
|
cannam@127
|
479 }
|
cannam@127
|
480
|
cannam@127
|
481 static int pow2_atleast(int x)
|
cannam@127
|
482 {
|
cannam@127
|
483 int h;
|
cannam@127
|
484 for (h = 1; h < x; h = 2 * h)
|
cannam@127
|
485 ;
|
cannam@127
|
486 return h;
|
cannam@127
|
487 }
|
cannam@127
|
488
|
cannam@127
|
489 static N *cached_bluestein_w = 0;
|
cannam@127
|
490 static N *cached_bluestein_y = 0;
|
cannam@127
|
491 static int cached_bluestein_n = -1;
|
cannam@127
|
492
|
cannam@127
|
493 static void bluestein(int n, N *a)
|
cannam@127
|
494 {
|
cannam@127
|
495 int nb = pow2_atleast(2 * n);
|
cannam@127
|
496 N *b = (N *)bench_malloc(2 * nb * sizeof(N));
|
cannam@127
|
497 N *w = cached_bluestein_w;
|
cannam@127
|
498 N *y = cached_bluestein_y;
|
cannam@127
|
499 N nbinv;
|
cannam@127
|
500 int i;
|
cannam@127
|
501
|
cannam@127
|
502 fromreal(1.0 / nb, nbinv); /* exact because nb = 2^k */
|
cannam@127
|
503
|
cannam@127
|
504 if (cached_bluestein_n != n) {
|
cannam@127
|
505 if (w) bench_free(w);
|
cannam@127
|
506 if (y) bench_free(y);
|
cannam@127
|
507 w = (N *)bench_malloc(2 * n * sizeof(N));
|
cannam@127
|
508 y = (N *)bench_malloc(2 * nb * sizeof(N));
|
cannam@127
|
509 cached_bluestein_n = n;
|
cannam@127
|
510 cached_bluestein_w = w;
|
cannam@127
|
511 cached_bluestein_y = y;
|
cannam@127
|
512
|
cannam@127
|
513 bluestein_sequence(n, w);
|
cannam@127
|
514 for (i = 0; i < 2*nb; ++i) cpy(zero, y[i]);
|
cannam@127
|
515
|
cannam@127
|
516 for (i = 0; i < n; ++i) {
|
cannam@127
|
517 cpy(w[2*i], y[2*i]);
|
cannam@127
|
518 cpy(w[2*i+1], y[2*i+1]);
|
cannam@127
|
519 }
|
cannam@127
|
520 for (i = 1; i < n; ++i) {
|
cannam@127
|
521 cpy(w[2*i], y[2*(nb-i)]);
|
cannam@127
|
522 cpy(w[2*i+1], y[2*(nb-i)+1]);
|
cannam@127
|
523 }
|
cannam@127
|
524
|
cannam@127
|
525 fft0(nb, y, -1);
|
cannam@127
|
526 }
|
cannam@127
|
527
|
cannam@127
|
528 for (i = 0; i < 2*nb; ++i) cpy(zero, b[i]);
|
cannam@127
|
529
|
cannam@127
|
530 for (i = 0; i < n; ++i)
|
cannam@127
|
531 cmulj(w[2*i], w[2*i+1], a[2*i], a[2*i+1], b[2*i], b[2*i+1]);
|
cannam@127
|
532
|
cannam@127
|
533 /* scaled convolution b * y */
|
cannam@127
|
534 fft0(nb, b, -1);
|
cannam@127
|
535
|
cannam@127
|
536 for (i = 0; i < nb; ++i)
|
cannam@127
|
537 cmul(b[2*i], b[2*i+1], y[2*i], y[2*i+1], b[2*i], b[2*i+1]);
|
cannam@127
|
538 fft0(nb, b, 1);
|
cannam@127
|
539
|
cannam@127
|
540 for (i = 0; i < n; ++i) {
|
cannam@127
|
541 cmulj(w[2*i], w[2*i+1], b[2*i], b[2*i+1], a[2*i], a[2*i+1]);
|
cannam@127
|
542 mul(nbinv, a[2*i], a[2*i]);
|
cannam@127
|
543 mul(nbinv, a[2*i+1], a[2*i+1]);
|
cannam@127
|
544 }
|
cannam@127
|
545
|
cannam@127
|
546 bench_free(b);
|
cannam@127
|
547 }
|
cannam@127
|
548
|
cannam@127
|
549 static void swapri(int n, N *a)
|
cannam@127
|
550 {
|
cannam@127
|
551 int i;
|
cannam@127
|
552 for (i = 0; i < n; ++i) {
|
cannam@127
|
553 N t;
|
cannam@127
|
554 cpy(a[2 * i], t);
|
cannam@127
|
555 cpy(a[2 * i + 1], a[2 * i]);
|
cannam@127
|
556 cpy(t, a[2 * i + 1]);
|
cannam@127
|
557 }
|
cannam@127
|
558 }
|
cannam@127
|
559
|
cannam@127
|
560 static void fft1(int n, N *a, int sign)
|
cannam@127
|
561 {
|
cannam@127
|
562 if (power_of_two(n)) {
|
cannam@127
|
563 fft0(n, a, sign);
|
cannam@127
|
564 } else {
|
cannam@127
|
565 if (sign == 1) swapri(n, a);
|
cannam@127
|
566 bluestein(n, a);
|
cannam@127
|
567 if (sign == 1) swapri(n, a);
|
cannam@127
|
568 }
|
cannam@127
|
569 }
|
cannam@127
|
570
|
cannam@127
|
571 static void fromrealv(int n, bench_complex *a, N *b)
|
cannam@127
|
572 {
|
cannam@127
|
573 int i;
|
cannam@127
|
574
|
cannam@127
|
575 for (i = 0; i < n; ++i) {
|
cannam@127
|
576 fromreal(c_re(a[i]), b[2 * i]);
|
cannam@127
|
577 fromreal(c_im(a[i]), b[2 * i + 1]);
|
cannam@127
|
578 }
|
cannam@127
|
579 }
|
cannam@127
|
580
|
cannam@127
|
581 static void compare(int n, N *a, N *b, double *err)
|
cannam@127
|
582 {
|
cannam@127
|
583 int i;
|
cannam@127
|
584 double e1, e2, einf;
|
cannam@127
|
585 double n1, n2, ninf;
|
cannam@127
|
586
|
cannam@127
|
587 e1 = e2 = einf = 0.0;
|
cannam@127
|
588 n1 = n2 = ninf = 0.0;
|
cannam@127
|
589
|
cannam@127
|
590 # define DO(x1, x2, xinf, var) { \
|
cannam@127
|
591 double d = var; \
|
cannam@127
|
592 if (d < 0) d = -d; \
|
cannam@127
|
593 x1 += d; x2 += d * d; if (d > xinf) xinf = d; \
|
cannam@127
|
594 }
|
cannam@127
|
595
|
cannam@127
|
596 for (i = 0; i < 2 * n; ++i) {
|
cannam@127
|
597 N dd;
|
cannam@127
|
598 sub(a[i], b[i], dd);
|
cannam@127
|
599 DO(n1, n2, ninf, toreal(a[i]));
|
cannam@127
|
600 DO(e1, e2, einf, toreal(dd));
|
cannam@127
|
601 }
|
cannam@127
|
602
|
cannam@127
|
603 # undef DO
|
cannam@127
|
604 err[0] = e1 / n1;
|
cannam@127
|
605 err[1] = sqrt(e2 / n2);
|
cannam@127
|
606 err[2] = einf / ninf;
|
cannam@127
|
607 }
|
cannam@127
|
608
|
cannam@127
|
609 void fftaccuracy(int n, bench_complex *a, bench_complex *ffta,
|
cannam@127
|
610 int sign, double err[6])
|
cannam@127
|
611 {
|
cannam@127
|
612 N *b = (N *)bench_malloc(2 * n * sizeof(N));
|
cannam@127
|
613 N *fftb = (N *)bench_malloc(2 * n * sizeof(N));
|
cannam@127
|
614 N mn, ninv;
|
cannam@127
|
615 int i;
|
cannam@127
|
616
|
cannam@127
|
617 fromreal(n, mn); inv(mn, ninv);
|
cannam@127
|
618
|
cannam@127
|
619 /* forward error */
|
cannam@127
|
620 fromrealv(n, a, b); fromrealv(n, ffta, fftb);
|
cannam@127
|
621 fft1(n, b, sign);
|
cannam@127
|
622 compare(n, b, fftb, err);
|
cannam@127
|
623
|
cannam@127
|
624 /* backward error */
|
cannam@127
|
625 fromrealv(n, a, b); fromrealv(n, ffta, fftb);
|
cannam@127
|
626 for (i = 0; i < 2 * n; ++i) mul(fftb[i], ninv, fftb[i]);
|
cannam@127
|
627 fft1(n, fftb, -sign);
|
cannam@127
|
628 compare(n, b, fftb, err + 3);
|
cannam@127
|
629
|
cannam@127
|
630 bench_free(fftb);
|
cannam@127
|
631 bench_free(b);
|
cannam@127
|
632 }
|
cannam@127
|
633
|
cannam@127
|
634 void fftaccuracy_done(void)
|
cannam@127
|
635 {
|
cannam@127
|
636 if (cached_bluestein_w) bench_free(cached_bluestein_w);
|
cannam@127
|
637 if (cached_bluestein_y) bench_free(cached_bluestein_y);
|
cannam@127
|
638 cached_bluestein_w = 0;
|
cannam@127
|
639 cached_bluestein_y = 0;
|
cannam@127
|
640 cached_bluestein_n = -1;
|
cannam@127
|
641 }
|