Mercurial > hg > js-dsp-test
comparison fft/fftw/fftw-3.3.4/rdft/scalar/r2cb/r2cbIII_20.c @ 19:26056e866c29
Add FFTW to comparison table
author | Chris Cannam |
---|---|
date | Tue, 06 Oct 2015 13:08:39 +0100 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
18:8db794ca3e0b | 19:26056e866c29 |
---|---|
1 /* | |
2 * Copyright (c) 2003, 2007-14 Matteo Frigo | |
3 * Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology | |
4 * | |
5 * This program is free software; you can redistribute it and/or modify | |
6 * it under the terms of the GNU General Public License as published by | |
7 * the Free Software Foundation; either version 2 of the License, or | |
8 * (at your option) any later version. | |
9 * | |
10 * This program is distributed in the hope that it will be useful, | |
11 * but WITHOUT ANY WARRANTY; without even the implied warranty of | |
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | |
13 * GNU General Public License for more details. | |
14 * | |
15 * You should have received a copy of the GNU General Public License | |
16 * along with this program; if not, write to the Free Software | |
17 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA | |
18 * | |
19 */ | |
20 | |
21 /* This file was automatically generated --- DO NOT EDIT */ | |
22 /* Generated on Tue Mar 4 13:50:36 EST 2014 */ | |
23 | |
24 #include "codelet-rdft.h" | |
25 | |
26 #ifdef HAVE_FMA | |
27 | |
28 /* Generated by: ../../../genfft/gen_r2cb.native -fma -reorder-insns -schedule-for-pipeline -compact -variables 4 -pipeline-latency 4 -sign 1 -n 20 -name r2cbIII_20 -dft-III -include r2cbIII.h */ | |
29 | |
30 /* | |
31 * This function contains 94 FP additions, 56 FP multiplications, | |
32 * (or, 58 additions, 20 multiplications, 36 fused multiply/add), | |
33 * 59 stack variables, 6 constants, and 40 memory accesses | |
34 */ | |
35 #include "r2cbIII.h" | |
36 | |
/*
 * r2cbIII_20, HAVE_FMA variant: generated size-20 kernel (see the genfft
 * command line in the comment above: -n 20, -dft-III, -sign 1).
 *
 * For each of the v iterations it reads ten values from Cr and ten from Ci
 * (indices 0..9, addressed through WS(csr, k) / WS(csi, k)) and writes ten
 * values to R0 and ten to R1 (indices 0..9, addressed through WS(rs, k)).
 *
 *   R0, R1       output arrays; advanced by ovs after every iteration
 *   Cr, Ci       input arrays; advanced by ivs after every iteration
 *   rs           stride used for both R0 and R1 indexing
 *   csr, csi     strides used for Cr and Ci indexing respectively
 *   v            number of iterations (the loop body is skipped if v <= 0)
 *   ivs, ovs     per-iteration pointer increments for inputs / outputs
 *
 * NOTE(review): presumably Cr/Ci hold the real and imaginary parts of a
 * half-complex spectrum and R0/R1 the even/odd real outputs, as in other
 * FFTW r2cb codelets -- confirm against codelet-rdft.h.
 */
37 static void r2cbIII_20(R *R0, R *R1, R *Cr, R *Ci, stride rs, stride csr, stride csi, INT v, INT ivs, INT ovs) | |
38 { | |
/*
 * Named constants. Numerically: KP1_414213562 = sqrt(2),
 * KP951056516 = sin(2*pi/5), KP559016994 = sqrt(5)/4,
 * KP618033988 = (sqrt(5) - 1)/2. DK is a project macro; presumably it
 * declares the constant -- see the codelet headers.
 */
39 DK(KP1_414213562, +1.414213562373095048801688724209698078569671875); | |
40 DK(KP951056516, +0.951056516295153572116439333379382143405698634); | |
41 DK(KP559016994, +0.559016994374947424102293417182819058860154590); | |
42 DK(KP2_000000000, +2.000000000000000000000000000000000000000000000); | |
43 DK(KP250000000, +0.250000000000000000000000000000000000000000000); | |
44 DK(KP618033988, +0.618033988749894848204586834365638117720309180); | |
45 { | |
46 INT i; | |
/* Count v down to zero; every pass steps the four data pointers. */
47 for (i = v; i > 0; i = i - 1, R0 = R0 + ovs, R1 = R1 + ovs, Cr = Cr + ivs, Ci = Ci + ivs, MAKE_VOLATILE_STRIDE(80, rs), MAKE_VOLATILE_STRIDE(80, csr), MAKE_VOLATILE_STRIDE(80, csi)) { | |
/* Intermediates that must survive until the final output group below. */
48 E TZ, TD, TW, Tw, Tt, TF, T1f, T1b; | |
49 { | |
50 E T1l, Tk, T9, Tj, Ta, TV, TI, Ts, TU, T1t, T11, Tx, T13, TC, T1a; | |
51 E T1i, Th, Tv, Ty; | |
52 { | |
53 E TQ, TS, Tr, Tm, Tn; | |
54 { | |
/* Load Cr[2], Cr[9], Cr[5], Cr[6], Cr[1] and Ci[2]; form sums/differences. */
55 E T1, T5, T6, T2, T3, T7, TY; | |
56 T1 = Cr[WS(csr, 2)]; | |
57 T5 = Cr[WS(csr, 9)]; | |
58 T6 = Cr[WS(csr, 5)]; | |
59 T2 = Cr[WS(csr, 6)]; | |
60 T3 = Cr[WS(csr, 1)]; | |
61 TQ = Ci[WS(csi, 2)]; | |
62 T7 = T5 + T6; | |
63 TY = T5 - T6; | |
64 { | |
65 E T4, TX, T8, Tp, Tq; | |
66 T4 = T2 + T3; | |
67 TX = T2 - T3; | |
68 Tp = Ci[WS(csi, 5)]; | |
69 Tq = Ci[WS(csi, 9)]; | |
70 T1l = FNMS(KP618033988, TX, TY); | |
71 TZ = FMA(KP618033988, TY, TX); | |
72 Tk = T4 - T7; | |
73 T8 = T4 + T7; | |
74 TS = Tp + Tq; | |
75 Tr = Tp - Tq; | |
76 T9 = T1 + T8; | |
77 Tj = FNMS(KP250000000, T8, T1); | |
78 Tm = Ci[WS(csi, 6)]; | |
79 Tn = Ci[WS(csi, 1)]; | |
80 } | |
81 } | |
82 { | |
/* Load Cr[7], Cr[0], Cr[4], Cr[3], Cr[8] and Ci[7], Ci[4], Ci[0], Ci[3], Ci[8]. */
83 E Tb, T19, Tg, Tc; | |
84 Ta = Cr[WS(csr, 7)]; | |
85 { | |
86 E Te, Tf, To, TR, TT; | |
87 Te = Cr[0]; | |
88 Tf = Cr[WS(csr, 4)]; | |
89 To = Tm + Tn; | |
90 TR = Tm - Tn; | |
91 Tb = Cr[WS(csr, 3)]; | |
92 T19 = Te - Tf; | |
93 Tg = Te + Tf; | |
94 TT = TR - TS; | |
95 TV = TR + TS; | |
96 TI = FNMS(KP618033988, To, Tr); | |
97 Ts = FMA(KP618033988, Tr, To); | |
98 TU = FNMS(KP250000000, TT, TQ); | |
99 T1t = TT + TQ; | |
100 Tc = Cr[WS(csr, 8)]; | |
101 } | |
102 T11 = Ci[WS(csi, 7)]; | |
103 { | |
104 E TA, TB, Td, T18; | |
105 TA = Ci[WS(csi, 4)]; | |
106 TB = Ci[0]; | |
107 Td = Tb + Tc; | |
108 T18 = Tb - Tc; | |
109 Tx = Ci[WS(csi, 3)]; | |
110 T13 = TB + TA; | |
111 TC = TA - TB; | |
112 T1a = FMA(KP618033988, T19, T18); | |
113 T1i = FNMS(KP618033988, T18, T19); | |
114 Th = Td + Tg; | |
115 Tv = Td - Tg; | |
116 Ty = Ci[WS(csi, 8)]; | |
117 } | |
118 } | |
119 } | |
120 { | |
/*
 * All twenty inputs have been read by this point. From here on the code
 * only combines temporaries and stores results: every R0 store is scaled
 * by KP2_000000000 and every R1 store by KP1_414213562 (some negated).
 */
121 E Tu, T1w, T16, TL, T15, T1u; | |
122 { | |
123 E Ti, T12, Tz, T14; | |
124 Tu = FNMS(KP250000000, Th, Ta); | |
125 Ti = Ta + Th; | |
126 T12 = Tx - Ty; | |
127 Tz = Tx + Ty; | |
128 T1w = T9 - Ti; | |
129 T14 = T12 - T13; | |
130 T16 = T12 + T13; | |
131 TL = FNMS(KP618033988, Tz, TC); | |
132 TD = FMA(KP618033988, TC, Tz); | |
133 T15 = FNMS(KP250000000, T14, T11); | |
134 T1u = T14 + T11; | |
/* T9 + Ti is the plain sum of all ten Cr inputs. */
135 R0[0] = KP2_000000000 * (T9 + Ti); | |
136 } | |
137 { | |
138 E Tl, TJ, TN, T1q, T1m, TK, T1h, T17, TH, T1k, T1v; | |
139 Tl = FMA(KP559016994, Tk, Tj); | |
140 TH = FNMS(KP559016994, Tk, Tj); | |
141 T1k = FNMS(KP559016994, TV, TU); | |
142 TW = FMA(KP559016994, TV, TU); | |
143 R0[WS(rs, 5)] = KP2_000000000 * (T1u - T1t); | |
144 T1v = T1t + T1u; | |
145 TJ = FNMS(KP951056516, TI, TH); | |
146 TN = FMA(KP951056516, TI, TH); | |
147 T1q = FMA(KP951056516, T1l, T1k); | |
148 T1m = FNMS(KP951056516, T1l, T1k); | |
149 R1[WS(rs, 7)] = KP1_414213562 * (T1w + T1v); | |
150 R1[WS(rs, 2)] = KP1_414213562 * (T1v - T1w); | |
151 Tw = FMA(KP559016994, Tv, Tu); | |
152 TK = FNMS(KP559016994, Tv, Tu); | |
153 T1h = FNMS(KP559016994, T16, T15); | |
154 T17 = FMA(KP559016994, T16, T15); | |
155 { | |
156 E TM, TO, T1j, T1r; | |
157 TM = FMA(KP951056516, TL, TK); | |
158 TO = FNMS(KP951056516, TL, TK); | |
159 T1j = FMA(KP951056516, T1i, T1h); | |
160 T1r = FNMS(KP951056516, T1i, T1h); | |
161 Tt = FNMS(KP951056516, Ts, Tl); | |
162 TF = FMA(KP951056516, Ts, Tl); | |
163 { | |
/* Stores R0[6], R0[4], R0[9], R0[1] and R1[6], R1[1], R1[3], R1[8]. */
164 E T1n, T1p, T1s, T1o; | |
165 T1n = TN - TO; | |
166 R0[WS(rs, 6)] = -(KP2_000000000 * (TN + TO)); | |
167 T1p = TM - TJ; | |
168 R0[WS(rs, 4)] = KP2_000000000 * (TJ + TM); | |
169 T1s = T1q + T1r; | |
170 R0[WS(rs, 9)] = KP2_000000000 * (T1r - T1q); | |
171 T1o = T1m + T1j; | |
172 R0[WS(rs, 1)] = KP2_000000000 * (T1j - T1m); | |
173 R1[WS(rs, 6)] = KP1_414213562 * (T1p + T1s); | |
174 R1[WS(rs, 1)] = KP1_414213562 * (T1p - T1s); | |
175 R1[WS(rs, 3)] = KP1_414213562 * (T1n + T1o); | |
176 R1[WS(rs, 8)] = KP1_414213562 * (T1n - T1o); | |
177 T1f = FMA(KP951056516, T1a, T17); | |
178 T1b = FNMS(KP951056516, T1a, T17); | |
179 } | |
180 } | |
181 } | |
182 } | |
183 } | |
184 { | |
/* Final group: stores R0[2], R0[8], R0[7], R0[3] and R1[9], R1[4], R1[5], R1[0]. */
185 E TE, TG, T10, T1e; | |
186 TE = FMA(KP951056516, TD, Tw); | |
187 TG = FNMS(KP951056516, TD, Tw); | |
188 T10 = FMA(KP951056516, TZ, TW); | |
189 T1e = FNMS(KP951056516, TZ, TW); | |
190 { | |
191 E T1d, TP, T1g, T1c; | |
192 T1d = TF - TG; | |
193 R0[WS(rs, 2)] = -(KP2_000000000 * (TF + TG)); | |
194 TP = Tt - TE; | |
195 R0[WS(rs, 8)] = KP2_000000000 * (Tt + TE); | |
196 T1g = T1e + T1f; | |
197 R0[WS(rs, 7)] = KP2_000000000 * (T1e - T1f); | |
198 T1c = T10 + T1b; | |
199 R0[WS(rs, 3)] = KP2_000000000 * (T10 - T1b); | |
200 R1[WS(rs, 9)] = -(KP1_414213562 * (T1d + T1g)); | |
201 R1[WS(rs, 4)] = KP1_414213562 * (T1d - T1g); | |
202 R1[WS(rs, 5)] = -(KP1_414213562 * (TP + T1c)); | |
203 R1[0] = KP1_414213562 * (TP - T1c); | |
204 } | |
205 } | |
206 } | |
207 } | |
208 } | |
209 | |
/*
 * Descriptor handed to the planner for the HAVE_FMA kernel: size 20, the
 * kernel name, and the counts {58, 20, 36, 0}, which match the additions /
 * multiplications / fused multiply-add figures quoted in the generated
 * comment above the kernel. The meaning of the fourth count and of GENUS is
 * defined outside this file (presumably in r2cbIII.h) -- confirm there.
 */
210 static const kr2c_desc desc = { 20, "r2cbIII_20", {58, 20, 36, 0}, &GENUS }; | |
211 | |
/*
 * Registration entry point (HAVE_FMA build): passes the planner p, the
 * r2cbIII_20 kernel and its descriptor to kr2c_register. X() is a project
 * macro applied to both names; presumably it adds the library's symbol
 * prefix -- confirm in the FFTW headers.
 */
212 void X(codelet_r2cbIII_20) (planner *p) { | |
213 X(kr2c_register) (p, r2cbIII_20, &desc); | |
214 } | |
215 | |
216 #else /* HAVE_FMA */ | |
217 | |
218 /* Generated by: ../../../genfft/gen_r2cb.native -compact -variables 4 -pipeline-latency 4 -sign 1 -n 20 -name r2cbIII_20 -dft-III -include r2cbIII.h */ | |
219 | |
220 /* | |
221 * This function contains 94 FP additions, 44 FP multiplications, | |
222 * (or, 82 additions, 32 multiplications, 12 fused multiply/add), | |
223 * 43 stack variables, 6 constants, and 40 memory accesses | |
224 */ | |
225 #include "r2cbIII.h" | |
226 | |
/*
 * r2cbIII_20, non-FMA variant: generated size-20 kernel (see the genfft
 * command line in the comment above: -n 20, -dft-III, -sign 1). It has the
 * same signature and touches the same array elements as the HAVE_FMA
 * variant, but is scheduled differently.
 *
 * For each of the v iterations it reads ten values from Cr and ten from Ci
 * (indices 0..9, addressed through WS(csr, k) / WS(csi, k)) and writes ten
 * values to R0 and ten to R1 (indices 0..9, addressed through WS(rs, k)).
 *
 *   R0, R1       output arrays; advanced by ovs after every iteration
 *   Cr, Ci       input arrays; advanced by ivs after every iteration
 *   rs           stride used for both R0 and R1 indexing
 *   csr, csi     strides used for Cr and Ci indexing respectively
 *   v            number of iterations (the loop body is skipped if v <= 0)
 *   ivs, ovs     per-iteration pointer increments for inputs / outputs
 *
 * NOTE(review): presumably Cr/Ci hold the real and imaginary parts of a
 * half-complex spectrum and R0/R1 the even/odd real outputs, as in other
 * FFTW r2cb codelets -- confirm against codelet-rdft.h.
 */
227 static void r2cbIII_20(R *R0, R *R1, R *Cr, R *Ci, stride rs, stride csr, stride csi, INT v, INT ivs, INT ovs) | |
228 { | |
/*
 * Named constants. Numerically: KP1_414213562 = sqrt(2),
 * KP951056516 = sin(2*pi/5), KP587785252 = sin(pi/5),
 * KP559016994 = sqrt(5)/4. DK is a project macro; presumably it declares
 * the constant -- see the codelet headers.
 */
229 DK(KP1_414213562, +1.414213562373095048801688724209698078569671875); | |
230 DK(KP2_000000000, +2.000000000000000000000000000000000000000000000); | |
231 DK(KP250000000, +0.250000000000000000000000000000000000000000000); | |
232 DK(KP951056516, +0.951056516295153572116439333379382143405698634); | |
233 DK(KP587785252, +0.587785252292473129168705954639072768597652438); | |
234 DK(KP559016994, +0.559016994374947424102293417182819058860154590); | |
235 { | |
236 INT i; | |
/* Count v down to zero; every pass steps the four data pointers. */
237 for (i = v; i > 0; i = i - 1, R0 = R0 + ovs, R1 = R1 + ovs, Cr = Cr + ivs, Ci = Ci + ivs, MAKE_VOLATILE_STRIDE(80, rs), MAKE_VOLATILE_STRIDE(80, csr), MAKE_VOLATILE_STRIDE(80, csi)) { | |
238 E T1, Tj, T1k, T13, T8, Tk, T17, Ts, T16, TI, T18, T19, Ta, Tu, T1i; | |
239 E TS, Th, Tv, TX, TD, TV, TL, TW, TY; | |
240 { | |
/* Input group 1: Cr[2], Cr[9], Cr[5], Cr[6], Cr[1]. */
241 E T7, T12, T4, T11; | |
242 T1 = Cr[WS(csr, 2)]; | |
243 { | |
244 E T5, T6, T2, T3; | |
245 T5 = Cr[WS(csr, 9)]; | |
246 T6 = Cr[WS(csr, 5)]; | |
247 T7 = T5 + T6; | |
248 T12 = T5 - T6; | |
249 T2 = Cr[WS(csr, 6)]; | |
250 T3 = Cr[WS(csr, 1)]; | |
251 T4 = T2 + T3; | |
252 T11 = T2 - T3; | |
253 } | |
254 Tj = KP559016994 * (T4 - T7); | |
255 T1k = FNMS(KP951056516, T12, KP587785252 * T11); | |
256 T13 = FMA(KP951056516, T11, KP587785252 * T12); | |
257 T8 = T4 + T7; | |
258 Tk = FNMS(KP250000000, T8, T1); | |
259 } | |
260 { | |
/* Input group 2: Ci[2], Ci[5], Ci[9], Ci[6], Ci[1]. */
261 E Tr, T15, To, T14; | |
262 T17 = Ci[WS(csi, 2)]; | |
263 { | |
264 E Tp, Tq, Tm, Tn; | |
265 Tp = Ci[WS(csi, 5)]; | |
266 Tq = Ci[WS(csi, 9)]; | |
267 Tr = Tp - Tq; | |
268 T15 = Tp + Tq; | |
269 Tm = Ci[WS(csi, 6)]; | |
270 Tn = Ci[WS(csi, 1)]; | |
271 To = Tm + Tn; | |
272 T14 = Tm - Tn; | |
273 } | |
274 Ts = FMA(KP951056516, To, KP587785252 * Tr); | |
275 T16 = KP559016994 * (T14 + T15); | |
276 TI = FNMS(KP951056516, Tr, KP587785252 * To); | |
277 T18 = T14 - T15; | |
278 T19 = FNMS(KP250000000, T18, T17); | |
279 } | |
280 { | |
/* Input group 3: Cr[7], Cr[0], Cr[4], Cr[3], Cr[8]. */
281 E Tg, TR, Td, TQ; | |
282 Ta = Cr[WS(csr, 7)]; | |
283 { | |
284 E Te, Tf, Tb, Tc; | |
285 Te = Cr[0]; | |
286 Tf = Cr[WS(csr, 4)]; | |
287 Tg = Te + Tf; | |
288 TR = Te - Tf; | |
289 Tb = Cr[WS(csr, 3)]; | |
290 Tc = Cr[WS(csr, 8)]; | |
291 Td = Tb + Tc; | |
292 TQ = Tb - Tc; | |
293 } | |
294 Tu = KP559016994 * (Td - Tg); | |
295 T1i = FNMS(KP951056516, TR, KP587785252 * TQ); | |
296 TS = FMA(KP951056516, TQ, KP587785252 * TR); | |
297 Th = Td + Tg; | |
298 Tv = FNMS(KP250000000, Th, Ta); | |
299 } | |
300 { | |
/* Input group 4: Ci[7], Ci[4], Ci[0], Ci[3], Ci[8]. */
301 E TC, TU, Tz, TT; | |
302 TX = Ci[WS(csi, 7)]; | |
303 { | |
304 E TA, TB, Tx, Ty; | |
305 TA = Ci[WS(csi, 4)]; | |
306 TB = Ci[0]; | |
307 TC = TA - TB; | |
308 TU = TB + TA; | |
309 Tx = Ci[WS(csi, 3)]; | |
310 Ty = Ci[WS(csi, 8)]; | |
311 Tz = Tx + Ty; | |
312 TT = Ty - Tx; | |
313 } | |
314 TD = FMA(KP951056516, Tz, KP587785252 * TC); | |
315 TV = KP559016994 * (TT - TU); | |
316 TL = FNMS(KP587785252, Tz, KP951056516 * TC); | |
317 TW = TT + TU; | |
318 TY = FMA(KP250000000, TW, TX); | |
319 } | |
320 { | |
/*
 * Output group 1: R0[0], R0[5], R1[2], R1[7]. T9 + Ti is the plain sum of
 * all ten Cr inputs. Throughout, R0 stores are scaled by KP2_000000000 and
 * R1 stores by KP1_414213562.
 */
321 E T9, Ti, T1w, T1t, T1u, T1v; | |
322 T9 = T1 + T8; | |
323 Ti = Ta + Th; | |
324 T1w = T9 - Ti; | |
325 T1t = T18 + T17; | |
326 T1u = TX - TW; | |
327 T1v = T1t + T1u; | |
328 R0[0] = KP2_000000000 * (T9 + Ti); | |
329 R0[WS(rs, 5)] = KP2_000000000 * (T1u - T1t); | |
330 R1[WS(rs, 2)] = KP1_414213562 * (T1v - T1w); | |
331 R1[WS(rs, 7)] = KP1_414213562 * (T1w + T1v); | |
332 } | |
333 { | |
/* Output group 2: R0[4], R0[6], R0[9], R0[1] and R1[1], R1[6], R1[8], R1[3]. */
334 E TJ, TO, T1m, T1q, TM, TN, T1j, T1r; | |
335 { | |
336 E TH, T1l, TK, T1h; | |
337 TH = Tk - Tj; | |
338 TJ = TH + TI; | |
339 TO = TH - TI; | |
340 T1l = T19 - T16; | |
341 T1m = T1k + T1l; | |
342 T1q = T1l - T1k; | |
343 TK = Tv - Tu; | |
344 TM = TK + TL; | |
345 TN = TL - TK; | |
346 T1h = TV + TY; | |
347 T1j = T1h - T1i; | |
348 T1r = T1i + T1h; | |
349 } | |
350 R0[WS(rs, 4)] = KP2_000000000 * (TJ + TM); | |
351 R0[WS(rs, 6)] = KP2_000000000 * (TN - TO); | |
352 R0[WS(rs, 9)] = KP2_000000000 * (T1r - T1q); | |
353 R0[WS(rs, 1)] = KP2_000000000 * (T1j - T1m); | |
354 { | |
355 E T1p, T1s, T1n, T1o; | |
356 T1p = TM - TJ; | |
357 T1s = T1q + T1r; | |
358 R1[WS(rs, 1)] = KP1_414213562 * (T1p - T1s); | |
359 R1[WS(rs, 6)] = KP1_414213562 * (T1p + T1s); | |
360 T1n = TO + TN; | |
361 T1o = T1m + T1j; | |
362 R1[WS(rs, 8)] = KP1_414213562 * (T1n - T1o); | |
363 R1[WS(rs, 3)] = KP1_414213562 * (T1n + T1o); | |
364 } | |
365 } | |
366 { | |
/* Output group 3: R0[8], R0[2], R0[7], R0[3] and R1[4], R1[9], R1[0], R1[5]. */
367 E Tt, TG, T1b, T1f, TE, TF, T10, T1e; | |
368 { | |
369 E Tl, T1a, Tw, TZ; | |
370 Tl = Tj + Tk; | |
371 Tt = Tl - Ts; | |
372 TG = Tl + Ts; | |
373 T1a = T16 + T19; | |
374 T1b = T13 + T1a; | |
375 T1f = T1a - T13; | |
376 Tw = Tu + Tv; | |
377 TE = Tw + TD; | |
378 TF = TD - Tw; | |
379 TZ = TV - TY; | |
380 T10 = TS + TZ; | |
381 T1e = TZ - TS; | |
382 } | |
383 R0[WS(rs, 8)] = KP2_000000000 * (Tt + TE); | |
384 R0[WS(rs, 2)] = KP2_000000000 * (TF - TG); | |
385 R0[WS(rs, 7)] = KP2_000000000 * (T1f + T1e); | |
386 R0[WS(rs, 3)] = KP2_000000000 * (T1b + T10); | |
387 { | |
388 E T1d, T1g, TP, T1c; | |
389 T1d = TG + TF; | |
390 T1g = T1e - T1f; | |
391 R1[WS(rs, 4)] = KP1_414213562 * (T1d + T1g); | |
392 R1[WS(rs, 9)] = KP1_414213562 * (T1g - T1d); | |
393 TP = Tt - TE; | |
394 T1c = T10 - T1b; | |
395 R1[0] = KP1_414213562 * (TP + T1c); | |
396 R1[WS(rs, 5)] = KP1_414213562 * (T1c - TP); | |
397 } | |
398 } | |
399 } | |
400 } | |
401 } | |
402 | |
/*
 * Descriptor handed to the planner for the non-FMA kernel: size 20, the
 * kernel name, and the counts {82, 32, 12, 0}, which match the additions /
 * multiplications / fused multiply-add figures quoted in the generated
 * comment above the kernel. The meaning of the fourth count and of GENUS is
 * defined outside this file (presumably in r2cbIII.h) -- confirm there.
 */
403 static const kr2c_desc desc = { 20, "r2cbIII_20", {82, 32, 12, 0}, &GENUS }; | |
404 | |
/*
 * Registration entry point (non-FMA build): passes the planner p, the
 * r2cbIII_20 kernel and its descriptor to kr2c_register. X() is a project
 * macro applied to both names; presumably it adds the library's symbol
 * prefix -- confirm in the FFTW headers.
 */
405 void X(codelet_r2cbIII_20) (planner *p) { | |
406 X(kr2c_register) (p, r2cbIII_20, &desc); | |
407 } | |
408 | |
409 #endif /* HAVE_FMA */ |