comparison src/fftw-3.3.5/rdft/scalar/r2cf/r2cf_20.c @ 42:2cd0e3b3e1fd

Current fftw source
author Chris Cannam
date Tue, 18 Oct 2016 13:40:26 +0100
parents
children
comparison
equal deleted inserted replaced
41:481f5f8c5634 42:2cd0e3b3e1fd
1 /*
2 * Copyright (c) 2003, 2007-14 Matteo Frigo
3 * Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
4 *
5 * This program is free software; you can redistribute it and/or modify
6 * it under the terms of the GNU General Public License as published by
7 * the Free Software Foundation; either version 2 of the License, or
8 * (at your option) any later version.
9 *
10 * This program is distributed in the hope that it will be useful,
11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 * GNU General Public License for more details.
14 *
15 * You should have received a copy of the GNU General Public License
16 * along with this program; if not, write to the Free Software
17 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
18 *
19 */
20
21 /* This file was automatically generated --- DO NOT EDIT */
22 /* Generated on Sat Jul 30 16:46:10 EDT 2016 */
23
24 #include "codelet-rdft.h"
25
26 #ifdef HAVE_FMA
27
28 /* Generated by: ../../../genfft/gen_r2cf.native -fma -reorder-insns -schedule-for-pipeline -compact -variables 4 -pipeline-latency 4 -n 20 -name r2cf_20 -include r2cf.h */
29
30 /*
31 * This function contains 86 FP additions, 32 FP multiplications,
32 * (or, 58 additions, 4 multiplications, 28 fused multiply/add),
33 * 70 stack variables, 4 constants, and 40 memory accesses
34 */
35 #include "r2cf.h"
36
/*
 * r2cf_20 -- variant compiled when HAVE_FMA is defined.
 *
 * Per iteration this routine reads 20 real inputs, R0[WS(rs, k)] and
 * R1[WS(rs, k)] for k = 0..9, and stores 20 real outputs:
 * Cr[WS(csr, k)] for k = 0..10 and Ci[WS(csi, k)] for k = 1..9.
 * Ci[0] and Ci[WS(csi, 10)] are never written here.
 *
 * NOTE(review): per the generator command line (gen_r2cf, -n 20) this is
 * presumably a size-20 real-input forward DFT, with Cr/Ci receiving the
 * real/imaginary output parts and R0/R1 the even/odd input samples --
 * confirm against the project headers (codelet-rdft.h / r2cf.h).
 *
 * Parameters:
 *   R0, R1  - input arrays (only read), indexed through WS(rs, k).
 *   Cr, Ci  - output arrays (only written), indexed through WS(csr, k)
 *             and WS(csi, k) respectively.
 *   rs      - stride argument used for every input index.
 *   csr/csi - stride arguments used for Cr / Ci indices.
 *   v       - number of loop iterations; nothing happens when v <= 0.
 *   ivs     - amount added to R0 and R1 after each iteration.
 *   ovs     - amount added to Cr and Ci after each iteration.
 *
 * DK, E, R, INT, WS, FMA, FNMS, FMS and MAKE_VOLATILE_STRIDE are defined
 * outside this file. FMA/FNMS/FMS are presumably the fused
 * multiply-add/subtract helpers -- TODO confirm their operand order
 * before touching any expression below.
 */
37 static void r2cf_20(R *R0, R *R1, R *Cr, R *Ci, stride rs, stride csr, stride csi, INT v, INT ivs, INT ovs)
38 {
/* Named constants; numerically KP559016994 ~ sqrt(5)/4, KP250000000 = 1/4,
 * KP618033988 ~ (sqrt(5) - 1)/2 and KP951056516 ~ sin(2*pi/5). */
39 DK(KP559016994, +0.559016994374947424102293417182819058860154590);
40 DK(KP250000000, +0.250000000000000000000000000000000000000000000);
41 DK(KP618033988, +0.618033988749894848204586834365638117720309180);
42 DK(KP951056516, +0.951056516295153572116439333379382143405698634);
43 {
44 INT i;
/* One pass per transform: counts i down from v to 1, stepping the input
 * pointers by ivs and the output pointers by ovs after each pass. */
45 for (i = v; i > 0; i = i - 1, R0 = R0 + ivs, R1 = R1 + ivs, Cr = Cr + ovs, Ci = Ci + ovs, MAKE_VOLATILE_STRIDE(80, rs), MAKE_VOLATILE_STRIDE(80, csr), MAKE_VOLATILE_STRIDE(80, csi)) {
/* Temporaries that must survive until the final (odd-index) stores. */
46 E T1i, T1c, T1a, T1o, T1m, T1h, T1b, T13, T1j, T1n;
47 {
48 E T3, T1d, TJ, TV, T1k, T16, T19, T1l, Ty, Ti, T12, TD, T1g, TR, TX;
49 E TK, Tt, TU, TW, TL, TE;
50 {
51 E T1, T2, TG, TH;
/* Load all 20 inputs, interleaved with the first layer of pairwise
 * sums and differences. */
52 T1 = R0[0];
53 T2 = R0[WS(rs, 5)];
54 TG = R1[WS(rs, 2)];
55 TH = R1[WS(rs, 7)];
56 {
57 E T6, To, T17, Tx, T18, TC, Tj, T9, Tp, Tu, Td, T15, Tm, Tq, Te;
58 E Tf;
59 {
60 E TA, TB, T7, T8;
61 {
62 E T4, TF, TI, T5, Tv, Tw;
63 T4 = R0[WS(rs, 2)];
64 T3 = T1 - T2;
65 TF = T1 + T2;
66 T1d = TG - TH;
67 TI = TG + TH;
68 T5 = R0[WS(rs, 7)];
69 Tv = R1[WS(rs, 6)];
70 Tw = R1[WS(rs, 1)];
71 TJ = TF - TI;
72 TV = TF + TI;
73 T6 = T4 - T5;
74 To = T4 + T5;
75 T17 = Tw - Tv;
76 Tx = Tv + Tw;
77 }
78 TA = R1[WS(rs, 8)];
79 TB = R1[WS(rs, 3)];
80 T7 = R0[WS(rs, 8)];
81 T8 = R0[WS(rs, 3)];
82 {
83 E Tb, Tc, Tk, Tl;
84 Tb = R0[WS(rs, 4)];
85 T18 = TB - TA;
86 TC = TA + TB;
87 Tj = T7 + T8;
88 T9 = T7 - T8;
89 Tc = R0[WS(rs, 9)];
90 Tk = R1[0];
91 Tl = R1[WS(rs, 5)];
92 Tp = R1[WS(rs, 4)];
93 Tu = Tb + Tc;
94 Td = Tb - Tc;
95 T15 = Tl - Tk;
96 Tm = Tk + Tl;
97 Tq = R1[WS(rs, 9)];
98 Te = R0[WS(rs, 6)];
99 Tf = R0[WS(rs, 1)];
100 }
101 }
/* Second layer: combine the pairwise sums/differences; no further
 * memory reads happen after this point. */
102 {
103 E Ta, Tr, Tz, T1e, T1f, Th, T14, Tg, TP, TQ;
104 Ta = T6 + T9;
105 T1k = T6 - T9;
106 T14 = Tq - Tp;
107 Tr = Tp + Tq;
108 Tz = Te + Tf;
109 Tg = Te - Tf;
110 T16 = T14 - T15;
111 T1e = T14 + T15;
112 T1f = T17 + T18;
113 T19 = T17 - T18;
114 Th = Td + Tg;
115 T1l = Td - Tg;
116 Ty = Tu - Tx;
117 TP = Tu + Tx;
118 Ti = Ta + Th;
119 T12 = Ta - Th;
120 TD = Tz - TC;
121 TQ = Tz + TC;
122 T1g = T1e + T1f;
123 T1i = T1e - T1f;
124 {
125 E TT, Tn, Ts, TS;
126 TT = Tj + Tm;
127 Tn = Tj - Tm;
128 Ts = To - Tr;
129 TS = To + Tr;
130 TR = TP - TQ;
131 TX = TP + TQ;
132 TK = Ts + Tn;
133 Tt = Tn - Ts;
134 TU = TS - TT;
135 TW = TS + TT;
136 }
137 }
138 }
139 }
/* Index-5 outputs need no multiplication: a plain sum and a plain
 * difference of already-computed terms. */
140 Cr[WS(csr, 5)] = T3 + Ti;
141 Ci[WS(csi, 5)] = T1g - T1d;
142 TL = Ty + TD;
143 TE = Ty - TD;
/* Even-index outputs (Cr 0,2,4,6,8,10 and Ci 2,4,6,8) are stored in this
 * block, which also prepares the temporaries for the odd-index stores. */
144 {
145 E TY, T10, TM, TO, T11, TZ, TN;
146 TY = TW + TX;
147 T10 = TW - TX;
148 Ci[WS(csi, 2)] = KP951056516 * (FMA(KP618033988, Tt, TE));
149 Ci[WS(csi, 6)] = KP951056516 * (FNMS(KP618033988, TE, Tt));
150 Ci[WS(csi, 4)] = KP951056516 * (FMA(KP618033988, TR, TU));
151 Ci[WS(csi, 8)] = -(KP951056516 * (FNMS(KP618033988, TU, TR)));
152 TM = TK + TL;
153 TO = TK - TL;
154 T1c = FNMS(KP618033988, T16, T19);
155 T1a = FMA(KP618033988, T19, T16);
156 Cr[0] = TV + TY;
157 TZ = FNMS(KP250000000, TY, TV);
158 Cr[WS(csr, 10)] = TJ + TM;
159 TN = FNMS(KP250000000, TM, TJ);
160 Cr[WS(csr, 8)] = FNMS(KP559016994, T10, TZ);
161 Cr[WS(csr, 4)] = FMA(KP559016994, T10, TZ);
162 Cr[WS(csr, 6)] = FMA(KP559016994, TO, TN);
163 Cr[WS(csr, 2)] = FNMS(KP559016994, TO, TN);
164 T11 = FNMS(KP250000000, Ti, T3);
165 T1o = FNMS(KP618033988, T1k, T1l);
166 T1m = FMA(KP618033988, T1l, T1k);
167 T1h = FMA(KP250000000, T1g, T1d);
168 T1b = FNMS(KP559016994, T12, T11);
169 T13 = FMA(KP559016994, T12, T11);
170 }
171 }
/* Odd-index outputs (1, 3, 7, 9) of Cr and Ci are stored last, using the
 * temporaries declared at the top of the loop body. */
172 Cr[WS(csr, 3)] = FNMS(KP951056516, T1c, T1b);
173 Cr[WS(csr, 7)] = FMA(KP951056516, T1c, T1b);
174 Cr[WS(csr, 1)] = FMA(KP951056516, T1a, T13);
175 Cr[WS(csr, 9)] = FNMS(KP951056516, T1a, T13);
176 T1j = FNMS(KP559016994, T1i, T1h);
177 T1n = FMA(KP559016994, T1i, T1h);
178 Ci[WS(csi, 3)] = FNMS(KP951056516, T1o, T1n);
179 Ci[WS(csi, 7)] = FMA(KP951056516, T1o, T1n);
180 Ci[WS(csi, 9)] = FMS(KP951056516, T1m, T1j);
181 Ci[WS(csi, 1)] = -(FMA(KP951056516, T1m, T1j));
182 }
183 }
184 }
185
/*
 * Descriptor handed to X(kr2c_register) together with r2cf_20 (HAVE_FMA
 * variant). It carries the size 20, the name string "r2cf_20", and the
 * tuple {58, 4, 28, 0}, whose first three entries equal the operation
 * counts quoted in this variant's header comment (58 additions,
 * 4 multiplications, 28 fused multiply/add). The meaning of the fourth
 * entry and of GENUS is defined outside this file -- TODO confirm in the
 * kr2c_desc declaration.
 */
186 static const kr2c_desc desc = { 20, "r2cf_20", {58, 4, 28, 0}, &GENUS };
187
/*
 * Registration entry point (HAVE_FMA variant): passes the planner p, the
 * r2cf_20 function and its descriptor desc to X(kr2c_register).
 * X(...) is a project macro defined elsewhere (presumably a symbol-name
 * prefixing macro -- confirm in the project headers).
 */
188 void X(codelet_r2cf_20) (planner *p) {
189 X(kr2c_register) (p, r2cf_20, &desc);
190 }
191
192 #else /* HAVE_FMA */
193
194 /* Generated by: ../../../genfft/gen_r2cf.native -compact -variables 4 -pipeline-latency 4 -n 20 -name r2cf_20 -include r2cf.h */
195
196 /*
197 * This function contains 86 FP additions, 24 FP multiplications,
198 * (or, 74 additions, 12 multiplications, 12 fused multiply/add),
199 * 51 stack variables, 4 constants, and 40 memory accesses
200 */
201 #include "r2cf.h"
202
/*
 * r2cf_20 -- variant compiled when HAVE_FMA is not defined.
 *
 * Per iteration this routine reads 20 real inputs, R0[WS(rs, k)] and
 * R1[WS(rs, k)] for k = 0..9, and stores 20 real outputs:
 * Cr[WS(csr, k)] for k = 0..10 and Ci[WS(csi, k)] for k = 1..9.
 * Ci[0] and Ci[WS(csi, 10)] are never written here.
 *
 * NOTE(review): per the generator command line (gen_r2cf, -n 20) this is
 * presumably a size-20 real-input forward DFT, with Cr/Ci receiving the
 * real/imaginary output parts and R0/R1 the even/odd input samples --
 * confirm against the project headers (codelet-rdft.h / r2cf.h).
 *
 * Parameters:
 *   R0, R1  - input arrays (only read), indexed through WS(rs, k).
 *   Cr, Ci  - output arrays (only written), indexed through WS(csr, k)
 *             and WS(csi, k) respectively.
 *   rs      - stride argument used for every input index.
 *   csr/csi - stride arguments used for Cr / Ci indices.
 *   v       - number of loop iterations; nothing happens when v <= 0.
 *   ivs     - amount added to R0 and R1 after each iteration.
 *   ovs     - amount added to Cr and Ci after each iteration.
 *
 * DK, E, R, INT, WS, FMA, FNMS and MAKE_VOLATILE_STRIDE are defined
 * outside this file. FMA/FNMS are presumably multiply-add/subtract
 * helpers -- TODO confirm their operand order before touching any
 * expression below.
 */
203 static void r2cf_20(R *R0, R *R1, R *Cr, R *Ci, stride rs, stride csr, stride csi, INT v, INT ivs, INT ovs)
204 {
/* Named constants; numerically KP250000000 = 1/4, KP559016994 ~ sqrt(5)/4,
 * KP587785252 ~ sin(pi/5) and KP951056516 ~ sin(2*pi/5). */
205 DK(KP250000000, +0.250000000000000000000000000000000000000000000);
206 DK(KP559016994, +0.559016994374947424102293417182819058860154590);
207 DK(KP587785252, +0.587785252292473129168705954639072768597652438);
208 DK(KP951056516, +0.951056516295153572116439333379382143405698634);
209 {
210 INT i;
/* One pass per transform: counts i down from v to 1, stepping the input
 * pointers by ivs and the output pointers by ovs after each pass. */
211 for (i = v; i > 0; i = i - 1, R0 = R0 + ivs, R1 = R1 + ivs, Cr = Cr + ovs, Ci = Ci + ovs, MAKE_VOLATILE_STRIDE(80, rs), MAKE_VOLATILE_STRIDE(80, csr), MAKE_VOLATILE_STRIDE(80, csi)) {
212 E T3, T1m, TF, T17, Ts, TM, TN, Tz, Ta, Th, Ti, T1g, T1h, T1k, T10;
213 E T13, T19, TG, TH, TI, T1d, T1e, T1j, TT, TW, T18;
/* Combine R0[0], R0[5], R1[2] and R1[7] into sums and differences. */
214 {
215 E T1, T2, T15, TD, TE, T16;
216 T1 = R0[0];
217 T2 = R0[WS(rs, 5)];
218 T15 = T1 + T2;
219 TD = R1[WS(rs, 7)];
220 TE = R1[WS(rs, 2)];
221 T16 = TE + TD;
222 T3 = T1 - T2;
223 T1m = T15 + T16;
224 TF = TD - TE;
225 T17 = T15 - T16;
226 }
/* Load the remaining 16 inputs in pairs, forming each pair's sum and
 * difference, then combine those into the terms used by the stores. */
227 {
228 E T6, TU, Tv, T12, Ty, TZ, T9, TR, Td, TY, To, TS, Tr, TV, Tg;
229 E T11;
230 {
231 E T4, T5, Tt, Tu;
232 T4 = R0[WS(rs, 2)];
233 T5 = R0[WS(rs, 7)];
234 T6 = T4 - T5;
235 TU = T4 + T5;
236 Tt = R1[WS(rs, 8)];
237 Tu = R1[WS(rs, 3)];
238 Tv = Tt - Tu;
239 T12 = Tt + Tu;
240 }
241 {
242 E Tw, Tx, T7, T8;
243 Tw = R1[WS(rs, 6)];
244 Tx = R1[WS(rs, 1)];
245 Ty = Tw - Tx;
246 TZ = Tw + Tx;
247 T7 = R0[WS(rs, 8)];
248 T8 = R0[WS(rs, 3)];
249 T9 = T7 - T8;
250 TR = T7 + T8;
251 }
252 {
253 E Tb, Tc, Tm, Tn;
254 Tb = R0[WS(rs, 4)];
255 Tc = R0[WS(rs, 9)];
256 Td = Tb - Tc;
257 TY = Tb + Tc;
258 Tm = R1[0];
259 Tn = R1[WS(rs, 5)];
260 To = Tm - Tn;
261 TS = Tm + Tn;
262 }
263 {
264 E Tp, Tq, Te, Tf;
265 Tp = R1[WS(rs, 4)];
266 Tq = R1[WS(rs, 9)];
267 Tr = Tp - Tq;
268 TV = Tp + Tq;
269 Te = R0[WS(rs, 6)];
270 Tf = R0[WS(rs, 1)];
271 Tg = Te - Tf;
272 T11 = Te + Tf;
273 }
274 Ts = To - Tr;
275 TM = T6 - T9;
276 TN = Td - Tg;
277 Tz = Tv - Ty;
278 Ta = T6 + T9;
279 Th = Td + Tg;
280 Ti = Ta + Th;
281 T1g = TY + TZ;
282 T1h = T11 + T12;
283 T1k = T1g + T1h;
284 T10 = TY - TZ;
285 T13 = T11 - T12;
286 T19 = T10 + T13;
287 TG = Tr + To;
288 TH = Ty + Tv;
289 TI = TG + TH;
290 T1d = TU + TV;
291 T1e = TR + TS;
292 T1j = T1d + T1e;
293 TT = TR - TS;
294 TW = TU - TV;
295 T18 = TW + TT;
296 }
/* Index-5 outputs need no multiplication: a plain sum and a plain
 * difference of already-computed terms. */
297 Cr[WS(csr, 5)] = T3 + Ti;
298 Ci[WS(csi, 5)] = TF - TI;
/* Ci outputs at even indices 2, 4, 6 and 8. */
299 {
300 E TX, T14, T1f, T1i;
301 TX = TT - TW;
302 T14 = T10 - T13;
303 Ci[WS(csi, 6)] = FNMS(KP587785252, T14, KP951056516 * TX);
304 Ci[WS(csi, 2)] = FMA(KP587785252, TX, KP951056516 * T14);
305 T1f = T1d - T1e;
306 T1i = T1g - T1h;
307 Ci[WS(csi, 8)] = FNMS(KP951056516, T1i, KP587785252 * T1f);
308 Ci[WS(csi, 4)] = FMA(KP951056516, T1f, KP587785252 * T1i);
309 }
/* Cr outputs at even indices 0, 2, 4, 6, 8 and 10. */
310 {
311 E T1l, T1n, T1o, T1c, T1a, T1b;
312 T1l = KP559016994 * (T1j - T1k);
313 T1n = T1j + T1k;
314 T1o = FNMS(KP250000000, T1n, T1m);
315 Cr[WS(csr, 4)] = T1l + T1o;
316 Cr[0] = T1m + T1n;
317 Cr[WS(csr, 8)] = T1o - T1l;
318 T1c = KP559016994 * (T18 - T19);
319 T1a = T18 + T19;
320 T1b = FNMS(KP250000000, T1a, T17);
321 Cr[WS(csr, 2)] = T1b - T1c;
322 Cr[WS(csr, 10)] = T17 + T1a;
323 Cr[WS(csr, 6)] = T1c + T1b;
324 }
/* Cr outputs at odd indices 1, 3, 7 and 9. */
325 {
326 E TA, TC, Tl, TB, Tj, Tk;
327 TA = FMA(KP951056516, Ts, KP587785252 * Tz);
328 TC = FNMS(KP587785252, Ts, KP951056516 * Tz);
329 Tj = KP559016994 * (Ta - Th);
330 Tk = FNMS(KP250000000, Ti, T3);
331 Tl = Tj + Tk;
332 TB = Tk - Tj;
333 Cr[WS(csr, 9)] = Tl - TA;
334 Cr[WS(csr, 7)] = TB + TC;
335 Cr[WS(csr, 1)] = Tl + TA;
336 Cr[WS(csr, 3)] = TB - TC;
337 }
/* Ci outputs at odd indices 1, 3, 7 and 9. */
338 {
339 E TO, TQ, TL, TP, TJ, TK;
340 TO = FMA(KP951056516, TM, KP587785252 * TN);
341 TQ = FNMS(KP587785252, TM, KP951056516 * TN);
342 TJ = FMA(KP250000000, TI, TF);
343 TK = KP559016994 * (TH - TG);
344 TL = TJ + TK;
345 TP = TK - TJ;
346 Ci[WS(csi, 1)] = TL - TO;
347 Ci[WS(csi, 7)] = TQ + TP;
348 Ci[WS(csi, 9)] = TO + TL;
349 Ci[WS(csi, 3)] = TP - TQ;
350 }
351 }
352 }
353 }
354
/*
 * Descriptor handed to X(kr2c_register) together with r2cf_20 (non-FMA
 * variant). It carries the size 20, the name string "r2cf_20", and the
 * tuple {74, 12, 12, 0}, whose first three entries equal the operation
 * counts quoted in this variant's header comment (74 additions,
 * 12 multiplications, 12 fused multiply/add). The meaning of the fourth
 * entry and of GENUS is defined outside this file -- TODO confirm in the
 * kr2c_desc declaration.
 */
355 static const kr2c_desc desc = { 20, "r2cf_20", {74, 12, 12, 0}, &GENUS };
356
/*
 * Registration entry point (non-FMA variant): passes the planner p, the
 * r2cf_20 function and its descriptor desc to X(kr2c_register).
 * X(...) is a project macro defined elsewhere (presumably a symbol-name
 * prefixing macro -- confirm in the project headers).
 */
357 void X(codelet_r2cf_20) (planner *p) {
358 X(kr2c_register) (p, r2cf_20, &desc);
359 }
360
361 #endif /* HAVE_FMA */