comparison src/fftw-3.3.3/rdft/scalar/r2cb/hc2cbdft_16.c @ 10:37bf6b4a2645

Add FFTW3
author Chris Cannam
date Wed, 20 Mar 2013 15:35:50 +0000
parents
children
comparison
equal deleted inserted replaced
9:c0fb53affa76 10:37bf6b4a2645
1 /*
2 * Copyright (c) 2003, 2007-11 Matteo Frigo
3 * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
4 *
5 * This program is free software; you can redistribute it and/or modify
6 * it under the terms of the GNU General Public License as published by
7 * the Free Software Foundation; either version 2 of the License, or
8 * (at your option) any later version.
9 *
10 * This program is distributed in the hope that it will be useful,
11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 * GNU General Public License for more details.
14 *
15 * You should have received a copy of the GNU General Public License
16 * along with this program; if not, write to the Free Software
17 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
18 *
19 */
20
21 /* This file was automatically generated --- DO NOT EDIT */
22 /* Generated on Sun Nov 25 07:42:05 EST 2012 */
23
24 #include "codelet-rdft.h"
25
26 #ifdef HAVE_FMA
27
28 /* Generated by: ../../../genfft/gen_hc2cdft.native -fma -reorder-insns -schedule-for-pipeline -compact -variables 4 -pipeline-latency 4 -sign 1 -n 16 -dif -name hc2cbdft_16 -include hc2cb.h */
29
30 /*
31 * This function contains 206 FP additions, 100 FP multiplications,
32 * (or, 136 additions, 30 multiplications, 70 fused multiply/add),
33 * 97 stack variables, 3 constants, and 64 memory accesses
34 */
35 #include "hc2cb.h"
36
/*
 * hc2cbdft_16 -- variant compiled when HAVE_FMA is defined (see the
 * enclosing #ifdef).
 *
 * For every m in [mb, me) the loop body loads eight elements from each of
 * Rp, Ip, Rm and Im (offset 0 and offsets WS(rs, 1) .. WS(rs, 7)), combines
 * them with the 30 coefficients W[0] .. W[29], and stores 32 results back to
 * the same locations.  All 32 loads are issued before the first store.
 *
 * Rp, Ip  - data pointers, advanced by +ms after each iteration.
 * Rm, Im  - data pointers, advanced by -ms after each iteration.
 * W       - coefficient table; moved to W + (mb - 1) * 30 before the first
 *           iteration and advanced by 30 after each iteration.
 * rs      - stride handed to WS() to build the element offsets.
 * mb, me  - half-open range of the loop counter m.
 * ms      - per-iteration step applied to the four data pointers.
 *
 * NOTE(review): the generator command line recorded in this file
 * (gen_hc2cdft ... -sign 1 -n 16 -dif) suggests this is a size-16
 * backward-sign codelet; the exact data layout expected by callers is not
 * visible here -- confirm against hc2cb.h / codelet-rdft.h.
 */
37 static void hc2cbdft_16(R *Rp, R *Ip, R *Rm, R *Im, const R *W, stride rs, INT mb, INT me, INT ms)
38 {
/* Named constants; numerically cos(pi/8), tan(pi/8) = sqrt(2) - 1 and
 * sqrt(1/2).  DK is declared outside this file. */
39 DK(KP923879532, +0.923879532511286756128183189396788286822416626);
40 DK(KP414213562, +0.414213562373095048801688724209698078569671875);
41 DK(KP707106781, +0.707106781186547524400844362104849039284835938);
42 {
43 INT m;
/* One pass per m.  MAKE_VOLATILE_STRIDE is an external macro evaluated in
 * the loop's increment expression -- its effect is not visible here. */
44 for (m = mb, W = W + ((mb - 1) * 30); m < me; m = m + 1, Rp = Rp + ms, Ip = Ip + ms, Rm = Rm - ms, Im = Im - ms, W = W + 30, MAKE_VOLATILE_STRIDE(64, rs)) {
/* Temporaries declared at loop-body scope: they are assigned inside the
 * nested scopes below and consumed by the final stores at the end of the
 * loop body (outputs at WS(rs, 1) and WS(rs, 5)). */
45 E T3w, T3z, T2Y, T3D, T3x, T3m, T3u, T3C, T3y, T3o, T3k, T3E, T3A;
46 {
47 E T20, Tf, T3Q, T32, T3V, T3f, T2a, TN, T2f, T1m, T3G, T2G, T3L, T2T, T26;
48 E T1F, T3M, T2N, T3H, T2W, T25, Tu, T1n, T1o, T3R, T3i, T2g, T1a, T21, T1y;
49 E T3W, T39;
50 {
51 E T2R, T1B, T2S, T1E;
52 {
53 E T1e, T3, T1C, TA, Tx, T6, T1D, T1h, Td, T1A, TL, T1k, Ta, TC, TF;
54 E T1z;
55 {
56 E T4, T5, T1f, T1g;
57 {
58 E T1, T2, Ty, Tz;
/* First group of loads: Rp/Ip at offsets 0, 2, 4, 6 and Rm/Im at offsets
 * 1, 3, 5, 7, each pair immediately reduced to a sum and a difference. */
59 T1 = Rp[0];
60 T2 = Rm[WS(rs, 7)];
61 Ty = Ip[0];
62 Tz = Im[WS(rs, 7)];
63 T4 = Rp[WS(rs, 4)];
64 T1e = T1 - T2;
65 T3 = T1 + T2;
66 T1C = Ty - Tz;
67 TA = Ty + Tz;
68 T5 = Rm[WS(rs, 3)];
69 }
70 T1f = Ip[WS(rs, 4)];
71 T1g = Im[WS(rs, 3)];
72 {
73 E Tb, Tc, TI, TJ;
74 Tb = Rm[WS(rs, 1)];
75 Tx = T4 - T5;
76 T6 = T4 + T5;
77 T1D = T1f - T1g;
78 T1h = T1f + T1g;
79 Tc = Rp[WS(rs, 6)];
80 TI = Im[WS(rs, 1)];
81 TJ = Ip[WS(rs, 6)];
82 {
83 E T8, TH, TK, T9, TD, TE;
84 T8 = Rp[WS(rs, 2)];
85 Td = Tb + Tc;
86 TH = Tb - Tc;
87 T1A = TJ - TI;
88 TK = TI + TJ;
89 T9 = Rm[WS(rs, 5)];
90 TD = Ip[WS(rs, 2)];
91 TE = Im[WS(rs, 5)];
92 TL = TH + TK;
93 T1k = TH - TK;
94 Ta = T8 + T9;
95 TC = T8 - T9;
96 TF = TD + TE;
97 T1z = TD - TE;
98 }
99 }
100 }
101 {
102 E T2E, TB, T1l, T1i, T3d, T3e, TM, T2F;
103 {
104 E T7, TG, Te, T30, T31, T1j;
105 T2E = T3 - T6;
106 T7 = T3 + T6;
107 T1j = TC - TF;
108 TG = TC + TF;
109 Te = Ta + Td;
110 T2R = Ta - Td;
111 TB = Tx + TA;
112 T30 = TA - Tx;
113 T31 = T1j - T1k;
114 T1l = T1j + T1k;
115 T1i = T1e - T1h;
116 T3d = T1e + T1h;
117 T20 = T7 - Te;
118 Tf = T7 + Te;
119 T3Q = FNMS(KP707106781, T31, T30);
120 T32 = FMA(KP707106781, T31, T30);
121 T3e = TG + TL;
122 TM = TG - TL;
123 }
124 T3V = FMA(KP707106781, T3e, T3d);
125 T3f = FNMS(KP707106781, T3e, T3d);
126 T2a = FNMS(KP707106781, TM, TB);
127 TN = FMA(KP707106781, TM, TB);
128 T2F = T1A - T1z;
129 T1B = T1z + T1A;
130 T2f = FNMS(KP707106781, T1l, T1i);
131 T1m = FMA(KP707106781, T1l, T1i);
132 T3G = T2E - T2F;
133 T2G = T2E + T2F;
134 T2S = T1C - T1D;
135 T1E = T1C + T1D;
136 }
137 }
138 {
139 E T34, TS, T2H, Tm, T1u, T2I, T33, TX, Tq, T14, Tp, T1v, T12, Tr, T15;
140 E T16;
141 {
142 E Tj, TT, Ti, T1s, TR, Tk, TU, TV;
143 {
144 E Tg, Th, TP, TQ;
/* Second group of loads: Rp/Ip at offsets 1, 3, 5, 7 and Rm/Im at offsets
 * 0, 2, 4, 6 (interleaved with arithmetic on the first group's results). */
145 Tg = Rp[WS(rs, 1)];
146 T3L = T2S - T2R;
147 T2T = T2R + T2S;
148 T26 = T1E - T1B;
149 T1F = T1B + T1E;
150 Th = Rm[WS(rs, 6)];
151 TP = Ip[WS(rs, 1)];
152 TQ = Im[WS(rs, 6)];
153 Tj = Rp[WS(rs, 5)];
154 TT = Tg - Th;
155 Ti = Tg + Th;
156 T1s = TP - TQ;
157 TR = TP + TQ;
158 Tk = Rm[WS(rs, 2)];
159 TU = Ip[WS(rs, 5)];
160 TV = Im[WS(rs, 2)];
161 }
162 {
163 E Tn, To, T10, T11;
164 Tn = Rm[0];
165 {
166 E TO, Tl, T1t, TW;
167 TO = Tj - Tk;
168 Tl = Tj + Tk;
169 T1t = TU - TV;
170 TW = TU + TV;
171 T34 = TR - TO;
172 TS = TO + TR;
173 T2H = Ti - Tl;
174 Tm = Ti + Tl;
175 T1u = T1s + T1t;
176 T2I = T1s - T1t;
177 T33 = TT + TW;
178 TX = TT - TW;
179 To = Rp[WS(rs, 7)];
180 }
181 T10 = Im[0];
182 T11 = Ip[WS(rs, 7)];
183 Tq = Rp[WS(rs, 3)];
184 T14 = Tn - To;
185 Tp = Tn + To;
186 T1v = T11 - T10;
187 T12 = T10 + T11;
188 Tr = Rm[WS(rs, 4)];
189 T15 = Ip[WS(rs, 3)];
190 T16 = Im[WS(rs, 4)];
191 }
192 }
193 {
194 E T13, T1x, T18, T35, T3g, T3h, T38, TY, T19;
195 {
196 E T2U, T2J, T37, Tt, T36, T2V, T2M, T2K, T2L;
197 T2U = T2H + T2I;
198 T2J = T2H - T2I;
199 {
200 E TZ, Ts, T1w, T17;
201 TZ = Tq - Tr;
202 Ts = Tq + Tr;
203 T1w = T15 - T16;
204 T17 = T15 + T16;
205 T37 = TZ + T12;
206 T13 = TZ - T12;
207 T2K = Tp - Ts;
208 Tt = Tp + Ts;
209 T1x = T1v + T1w;
210 T2L = T1v - T1w;
211 T36 = T14 + T17;
212 T18 = T14 - T17;
213 }
214 T2V = T2L - T2K;
215 T2M = T2K + T2L;
216 T3M = T2J - T2M;
217 T2N = T2J + T2M;
218 T3H = T2V - T2U;
219 T2W = T2U + T2V;
220 T35 = FMA(KP414213562, T34, T33);
221 T3g = FNMS(KP414213562, T33, T34);
222 T25 = Tm - Tt;
223 Tu = Tm + Tt;
224 T3h = FNMS(KP414213562, T36, T37);
225 T38 = FMA(KP414213562, T37, T36);
226 }
227 T1n = FNMS(KP414213562, TS, TX);
228 TY = FMA(KP414213562, TX, TS);
229 T19 = FNMS(KP414213562, T18, T13);
230 T1o = FMA(KP414213562, T13, T18);
231 T3R = T3h - T3g;
232 T3i = T3g + T3h;
233 T2g = T19 - TY;
234 T1a = TY + T19;
235 T21 = T1x - T1u;
236 T1y = T1u + T1x;
237 T3W = T35 + T38;
238 T39 = T35 - T38;
239 }
240 }
241 }
242 {
243 E T27, T22, T2c, T2u, T2x, T2h, T2s, T2A, T2w, T2B, T2v;
244 {
245 E T1K, Tv, T1G, T1N, T1Q, T1b, T2b, T1p, Tw, T1d;
246 T1K = Tf - Tu;
247 Tv = Tf + Tu;
248 T1G = T1y + T1F;
249 T1N = T1F - T1y;
250 T1Q = FNMS(KP923879532, T1a, TN);
251 T1b = FMA(KP923879532, T1a, TN);
252 T2b = T1n - T1o;
253 T1p = T1n + T1o;
/* From here on the W coefficients are applied and results are stored.
 * Stores appear in the order of offsets 0, 4, 6, 2, 3, 7 inside the nested
 * scopes, then 1 and 5 at the end of the loop body.  Offset 0 uses
 * W[0..1]; offset k > 0 uses W[4k - 2 .. 4k + 1]. */
254 Tw = W[0];
255 T1d = W[1];
256 {
257 E T1T, T1O, T1W, T1S, T1X, T1R;
258 {
259 E T1J, T1M, T1L, T1V, T1P, T1q;
260 T1T = FNMS(KP923879532, T1p, T1m);
261 T1q = FMA(KP923879532, T1p, T1m);
262 {
263 E T1c, T1I, T1H, T1r;
264 T1c = Tw * T1b;
265 T1J = W[14];
266 T1H = Tw * T1q;
267 T1r = FMA(T1d, T1q, T1c);
268 T1M = W[15];
269 T1L = T1J * T1K;
270 T1I = FNMS(T1d, T1b, T1H);
271 Rm[0] = Tv + T1r;
272 Rp[0] = Tv - T1r;
273 T1V = T1M * T1K;
274 Im[0] = T1I - T1G;
275 Ip[0] = T1G + T1I;
276 T1P = W[16];
277 }
278 T1O = FNMS(T1M, T1N, T1L);
279 T1W = FMA(T1J, T1N, T1V);
280 T1S = W[17];
281 T1X = T1P * T1T;
282 T1R = T1P * T1Q;
283 }
284 {
285 E T2r, T2n, T2q, T2p, T2z, T2t, T2o, T1Y, T1U;
286 T27 = T25 + T26;
287 T2r = T26 - T25;
288 T2o = T20 - T21;
289 T22 = T20 + T21;
290 T1Y = FNMS(T1S, T1Q, T1X);
291 T1U = FMA(T1S, T1T, T1R);
292 T2n = W[22];
293 T2q = W[23];
294 Im[WS(rs, 4)] = T1Y - T1W;
295 Ip[WS(rs, 4)] = T1W + T1Y;
296 Rm[WS(rs, 4)] = T1O + T1U;
297 Rp[WS(rs, 4)] = T1O - T1U;
298 T2p = T2n * T2o;
299 T2z = T2q * T2o;
300 T2c = FMA(KP923879532, T2b, T2a);
301 T2u = FNMS(KP923879532, T2b, T2a);
302 T2x = FNMS(KP923879532, T2g, T2f);
303 T2h = FMA(KP923879532, T2g, T2f);
304 T2t = W[24];
305 T2s = FNMS(T2q, T2r, T2p);
306 T2A = FMA(T2n, T2r, T2z);
307 T2w = W[25];
308 T2B = T2t * T2x;
309 T2v = T2t * T2u;
310 }
311 }
312 }
313 {
314 E T28, T2k, T2e, T2l, T2d;
315 {
316 E T1Z, T24, T23, T2j, T29, T2C, T2y;
317 T2C = FNMS(T2w, T2u, T2B);
318 T2y = FMA(T2w, T2x, T2v);
319 T1Z = W[6];
320 T24 = W[7];
321 Im[WS(rs, 6)] = T2C - T2A;
322 Ip[WS(rs, 6)] = T2A + T2C;
323 Rm[WS(rs, 6)] = T2s + T2y;
324 Rp[WS(rs, 6)] = T2s - T2y;
325 T23 = T1Z * T22;
326 T2j = T24 * T22;
327 T29 = W[8];
328 T28 = FNMS(T24, T27, T23);
329 T2k = FMA(T1Z, T27, T2j);
330 T2e = W[9];
331 T2l = T29 * T2h;
332 T2d = T29 * T2c;
333 }
334 {
335 E T4a, T4d, T3O, T4h, T4b, T40, T48, T4g, T4c, T42, T3Y;
336 {
337 E T3N, T47, T43, T46, T3F, T45, T4f, T3K, T3J, T3S, T3X, T3Z, T49, T41, T3T;
338 E T3U;
339 {
340 E T44, T3I, T2m, T2i, T3P;
341 T44 = FNMS(KP707106781, T3H, T3G);
342 T3I = FMA(KP707106781, T3H, T3G);
343 T2m = FNMS(T2e, T2c, T2l);
344 T2i = FMA(T2e, T2h, T2d);
345 T3N = FMA(KP707106781, T3M, T3L);
346 T47 = FNMS(KP707106781, T3M, T3L);
347 Im[WS(rs, 2)] = T2m - T2k;
348 Ip[WS(rs, 2)] = T2k + T2m;
349 Rm[WS(rs, 2)] = T28 + T2i;
350 Rp[WS(rs, 2)] = T28 - T2i;
351 T43 = W[26];
352 T46 = W[27];
353 T3F = W[10];
354 T45 = T43 * T44;
355 T4f = T46 * T44;
356 T3K = W[11];
357 T3J = T3F * T3I;
358 T4a = FNMS(KP923879532, T3R, T3Q);
359 T3S = FMA(KP923879532, T3R, T3Q);
360 T3X = FNMS(KP923879532, T3W, T3V);
361 T4d = FMA(KP923879532, T3W, T3V);
362 T3Z = T3K * T3I;
363 T3P = W[12];
364 T49 = W[28];
365 T41 = T3P * T3X;
366 T3T = T3P * T3S;
367 }
368 T3O = FNMS(T3K, T3N, T3J);
369 T4h = T49 * T4d;
370 T4b = T49 * T4a;
371 T40 = FMA(T3F, T3N, T3Z);
372 T3U = W[13];
373 T48 = FNMS(T46, T47, T45);
374 T4g = FMA(T43, T47, T4f);
375 T4c = W[29];
376 T42 = FNMS(T3U, T3S, T41);
377 T3Y = FMA(T3U, T3X, T3T);
378 }
379 {
380 E T3t, T2X, T3p, T3s, T2D, T3r, T3B, T2Q, T2P, T3a, T3j, T3l, T3v, T3n, T3b;
381 E T3c;
382 {
383 E T2O, T3q, T4i, T4e, T2Z;
384 T4i = FNMS(T4c, T4a, T4h);
385 T4e = FMA(T4c, T4d, T4b);
386 Im[WS(rs, 3)] = T42 - T40;
387 Ip[WS(rs, 3)] = T40 + T42;
388 Rm[WS(rs, 3)] = T3O + T3Y;
389 Rp[WS(rs, 3)] = T3O - T3Y;
390 Im[WS(rs, 7)] = T4i - T4g;
391 Ip[WS(rs, 7)] = T4g + T4i;
392 Rm[WS(rs, 7)] = T48 + T4e;
393 Rp[WS(rs, 7)] = T48 - T4e;
394 T3t = FNMS(KP707106781, T2W, T2T);
395 T2X = FMA(KP707106781, T2W, T2T);
396 T2O = FMA(KP707106781, T2N, T2G);
397 T3q = FNMS(KP707106781, T2N, T2G);
398 T3p = W[18];
399 T3s = W[19];
400 T2D = W[2];
401 T3r = T3p * T3q;
402 T3B = T3s * T3q;
403 T2Q = W[3];
404 T2P = T2D * T2O;
405 T3a = FMA(KP923879532, T39, T32);
406 T3w = FNMS(KP923879532, T39, T32);
407 T3z = FMA(KP923879532, T3i, T3f);
408 T3j = FNMS(KP923879532, T3i, T3f);
409 T3l = T2Q * T2O;
410 T2Z = W[4];
411 T3v = W[20];
412 T3n = T2Z * T3j;
413 T3b = T2Z * T3a;
414 }
415 T2Y = FNMS(T2Q, T2X, T2P);
416 T3D = T3v * T3z;
417 T3x = T3v * T3w;
418 T3m = FMA(T2D, T2X, T3l);
419 T3c = W[5];
420 T3u = FNMS(T3s, T3t, T3r);
421 T3C = FMA(T3p, T3t, T3B);
422 T3y = W[21];
423 T3o = FNMS(T3c, T3a, T3n);
424 T3k = FMA(T3c, T3j, T3b);
425 }
426 }
427 }
428 }
429 }
/* Last stores of the iteration (offsets WS(rs, 1) and WS(rs, 5)), built
 * from the loop-body-scope temporaries assigned in the scopes above. */
430 T3E = FNMS(T3y, T3w, T3D);
431 T3A = FMA(T3y, T3z, T3x);
432 Im[WS(rs, 1)] = T3o - T3m;
433 Ip[WS(rs, 1)] = T3m + T3o;
434 Rm[WS(rs, 1)] = T2Y + T3k;
435 Rp[WS(rs, 1)] = T2Y - T3k;
436 Im[WS(rs, 5)] = T3E - T3C;
437 Ip[WS(rs, 5)] = T3C + T3E;
438 Rm[WS(rs, 5)] = T3u + T3A;
439 Rp[WS(rs, 5)] = T3u - T3A;
440 }
441 }
442 }
443
/*
 * Two-entry tw_instr table referenced by the descriptor `desc` in this
 * file: a TW_FULL entry with arguments (1, 16) followed by a TW_NEXT entry
 * with arguments (1, 0).
 * NOTE(review): the meaning of the tw_instr fields is defined outside this
 * file; presumably this describes the layout of the W table the kernel
 * reads -- confirm against the tw_instr declaration.
 */
444 static const tw_instr twinstr[] = {
445 {TW_FULL, 1, 16},
446 {TW_NEXT, 1, 0}
447 };
448
/*
 * Descriptor handed to khc2c_register for this variant: the value 16, the
 * name string "hc2cbdft_16", the twinstr table, the address of GENUS, and
 * the tuple {136, 30, 70, 0}.  The first three tuple values equal the
 * additions / multiplications / fused multiply-add counts quoted in the
 * generated comment at the top of this HAVE_FMA section.
 * NOTE(review): field names of hc2c_desc are not visible here -- confirm.
 */
449 static const hc2c_desc desc = { 16, "hc2cbdft_16", twinstr, &GENUS, {136, 30, 70, 0} };
450
/*
 * Registration entry point (HAVE_FMA variant): passes the planner p, the
 * hc2cbdft_16 kernel, its descriptor and the HC2C_VIA_DFT constant to
 * khc2c_register.  Both this function's name and the callee's name are
 * wrapped in the X() macro, which is defined outside this file.
 */
451 void X(codelet_hc2cbdft_16) (planner *p) {
452 X(khc2c_register) (p, hc2cbdft_16, &desc, HC2C_VIA_DFT);
453 }
454 #else /* HAVE_FMA */
455
456 /* Generated by: ../../../genfft/gen_hc2cdft.native -compact -variables 4 -pipeline-latency 4 -sign 1 -n 16 -dif -name hc2cbdft_16 -include hc2cb.h */
457
458 /*
459 * This function contains 206 FP additions, 84 FP multiplications,
460 * (or, 168 additions, 46 multiplications, 38 fused multiply/add),
461 * 60 stack variables, 3 constants, and 64 memory accesses
462 */
463 #include "hc2cb.h"
464
/*
 * hc2cbdft_16 -- variant compiled when HAVE_FMA is not defined (the #else
 * branch of the enclosing #ifdef).
 *
 * For every m in [mb, me) the loop body loads eight elements from each of
 * Rp, Ip, Rm and Im (offset 0 and offsets WS(rs, 1) .. WS(rs, 7)), combines
 * them with the 30 coefficients W[0] .. W[29], and stores 32 results back to
 * the same locations.  All 32 loads are issued before the first store.
 *
 * Rp, Ip  - data pointers, advanced by +ms after each iteration.
 * Rm, Im  - data pointers, advanced by -ms after each iteration.
 * W       - coefficient table; moved to W + (mb - 1) * 30 before the first
 *           iteration and advanced by 30 after each iteration.
 * rs      - stride handed to WS() to build the element offsets.
 * mb, me  - half-open range of the loop counter m.
 * ms      - per-iteration step applied to the four data pointers.
 *
 * NOTE(review): the generator command line recorded in this file
 * (gen_hc2cdft ... -sign 1 -n 16 -dif) suggests this is a size-16
 * backward-sign codelet; the exact data layout expected by callers is not
 * visible here -- confirm against hc2cb.h / codelet-rdft.h.
 */
465 static void hc2cbdft_16(R *Rp, R *Ip, R *Rm, R *Im, const R *W, stride rs, INT mb, INT me, INT ms)
466 {
/* Named constants; numerically cos(pi/8), sin(pi/8) and sqrt(1/2).
 * DK is declared outside this file. */
467 DK(KP923879532, +0.923879532511286756128183189396788286822416626);
468 DK(KP382683432, +0.382683432365089771728459984030398866761344562);
469 DK(KP707106781, +0.707106781186547524400844362104849039284835938);
470 {
471 INT m;
/* One pass per m.  MAKE_VOLATILE_STRIDE is an external macro evaluated in
 * the loop's increment expression -- its effect is not visible here. */
472 for (m = mb, W = W + ((mb - 1) * 30); m < me; m = m + 1, Rp = Rp + ms, Ip = Ip + ms, Rm = Rm - ms, Im = Im - ms, W = W + 30, MAKE_VOLATILE_STRIDE(64, rs)) {
/* Intermediate values shared between the two load stages and the four
 * output stages below. */
473 E TB, T2L, T30, T1n, Tf, T1U, T2H, T3p, T1E, T1Z, TM, T31, T2s, T3k, T1i;
474 E T2M, Tu, T1Y, T2Q, T2X, T2T, T2Y, TY, T1d, T19, T1e, T2v, T2C, T2y, T2D;
475 E T1x, T1V;
/* Load stage 1: Rp/Ip at offsets 0, 4, 2, 6 paired with Rm/Im at offsets
 * 7, 3, 5, 1; each pair is reduced to a sum and a difference, which are
 * then combined further. */
476 {
477 E T3, T1j, TA, T1B, T6, Tx, T1m, T1C, Ta, TC, TF, T1y, Td, TH, TK;
478 E T1z;
479 {
480 E T1, T2, Ty, Tz;
481 T1 = Rp[0];
482 T2 = Rm[WS(rs, 7)];
483 T3 = T1 + T2;
484 T1j = T1 - T2;
485 Ty = Ip[0];
486 Tz = Im[WS(rs, 7)];
487 TA = Ty + Tz;
488 T1B = Ty - Tz;
489 }
490 {
491 E T4, T5, T1k, T1l;
492 T4 = Rp[WS(rs, 4)];
493 T5 = Rm[WS(rs, 3)];
494 T6 = T4 + T5;
495 Tx = T4 - T5;
496 T1k = Ip[WS(rs, 4)];
497 T1l = Im[WS(rs, 3)];
498 T1m = T1k + T1l;
499 T1C = T1k - T1l;
500 }
501 {
502 E T8, T9, TD, TE;
503 T8 = Rp[WS(rs, 2)];
504 T9 = Rm[WS(rs, 5)];
505 Ta = T8 + T9;
506 TC = T8 - T9;
507 TD = Ip[WS(rs, 2)];
508 TE = Im[WS(rs, 5)];
509 TF = TD + TE;
510 T1y = TD - TE;
511 }
512 {
513 E Tb, Tc, TI, TJ;
514 Tb = Rm[WS(rs, 1)];
515 Tc = Rp[WS(rs, 6)];
516 Td = Tb + Tc;
517 TH = Tb - Tc;
518 TI = Im[WS(rs, 1)];
519 TJ = Ip[WS(rs, 6)];
520 TK = TI + TJ;
521 T1z = TJ - TI;
522 }
523 {
524 E T7, Te, TG, TL;
525 TB = Tx + TA;
526 T2L = TA - Tx;
527 T30 = T1j + T1m;
528 T1n = T1j - T1m;
529 T7 = T3 + T6;
530 Te = Ta + Td;
531 Tf = T7 + Te;
532 T1U = T7 - Te;
533 {
534 E T2F, T2G, T1A, T1D;
535 T2F = Ta - Td;
536 T2G = T1B - T1C;
537 T2H = T2F + T2G;
538 T3p = T2G - T2F;
539 T1A = T1y + T1z;
540 T1D = T1B + T1C;
541 T1E = T1A + T1D;
542 T1Z = T1D - T1A;
543 }
544 TG = TC + TF;
545 TL = TH + TK;
546 TM = KP707106781 * (TG - TL);
547 T31 = KP707106781 * (TG + TL);
548 {
549 E T2q, T2r, T1g, T1h;
550 T2q = T3 - T6;
551 T2r = T1z - T1y;
552 T2s = T2q + T2r;
553 T3k = T2q - T2r;
554 T1g = TC - TF;
555 T1h = TH - TK;
556 T1i = KP707106781 * (T1g + T1h);
557 T2M = KP707106781 * (T1g - T1h);
558 }
559 }
560 }
/* Load stage 2: Rp/Ip at offsets 1, 5, 7, 3 paired with Rm/Im at offsets
 * 6, 2, 0, 4; the sums and differences are then combined using the
 * KP923879532 / KP382683432 constants. */
561 {
562 E Ti, TT, TR, T1r, Tl, TO, TW, T1s, Tp, T14, T12, T1u, Ts, TZ, T17;
563 E T1v;
564 {
565 E Tg, Th, TP, TQ;
566 Tg = Rp[WS(rs, 1)];
567 Th = Rm[WS(rs, 6)];
568 Ti = Tg + Th;
569 TT = Tg - Th;
570 TP = Ip[WS(rs, 1)];
571 TQ = Im[WS(rs, 6)];
572 TR = TP + TQ;
573 T1r = TP - TQ;
574 }
575 {
576 E Tj, Tk, TU, TV;
577 Tj = Rp[WS(rs, 5)];
578 Tk = Rm[WS(rs, 2)];
579 Tl = Tj + Tk;
580 TO = Tj - Tk;
581 TU = Ip[WS(rs, 5)];
582 TV = Im[WS(rs, 2)];
583 TW = TU + TV;
584 T1s = TU - TV;
585 }
586 {
587 E Tn, To, T10, T11;
588 Tn = Rm[0];
589 To = Rp[WS(rs, 7)];
590 Tp = Tn + To;
591 T14 = Tn - To;
592 T10 = Im[0];
593 T11 = Ip[WS(rs, 7)];
594 T12 = T10 + T11;
595 T1u = T11 - T10;
596 }
597 {
598 E Tq, Tr, T15, T16;
599 Tq = Rp[WS(rs, 3)];
600 Tr = Rm[WS(rs, 4)];
601 Ts = Tq + Tr;
602 TZ = Tq - Tr;
603 T15 = Ip[WS(rs, 3)];
604 T16 = Im[WS(rs, 4)];
605 T17 = T15 + T16;
606 T1v = T15 - T16;
607 }
608 {
609 E Tm, Tt, T2O, T2P;
610 Tm = Ti + Tl;
611 Tt = Tp + Ts;
612 Tu = Tm + Tt;
613 T1Y = Tm - Tt;
614 T2O = TR - TO;
615 T2P = TT + TW;
616 T2Q = FMA(KP382683432, T2O, KP923879532 * T2P);
617 T2X = FNMS(KP923879532, T2O, KP382683432 * T2P);
618 }
619 {
620 E T2R, T2S, TS, TX;
621 T2R = TZ + T12;
622 T2S = T14 + T17;
623 T2T = FMA(KP382683432, T2R, KP923879532 * T2S);
624 T2Y = FNMS(KP923879532, T2R, KP382683432 * T2S);
625 TS = TO + TR;
626 TX = TT - TW;
627 TY = FMA(KP923879532, TS, KP382683432 * TX);
628 T1d = FNMS(KP382683432, TS, KP923879532 * TX);
629 }
630 {
631 E T13, T18, T2t, T2u;
632 T13 = TZ - T12;
633 T18 = T14 - T17;
634 T19 = FNMS(KP382683432, T18, KP923879532 * T13);
635 T1e = FMA(KP382683432, T13, KP923879532 * T18);
636 T2t = Ti - Tl;
637 T2u = T1r - T1s;
638 T2v = T2t - T2u;
639 T2C = T2t + T2u;
640 }
641 {
642 E T2w, T2x, T1t, T1w;
643 T2w = Tp - Ts;
644 T2x = T1u - T1v;
645 T2y = T2w + T2x;
646 T2D = T2x - T2w;
647 T1t = T1r + T1s;
648 T1w = T1u + T1v;
649 T1x = T1t + T1w;
650 T1V = T1w - T1t;
651 }
652 }
/* Output stage A: stores at offset 0 (using W[0..1]) and at offset
 * WS(rs, 4) (using W[14..17]).  Every output stage stores in the pattern
 * Rp = a - b, Rm = a + b, Ip = c + d, Im = d - c. */
653 {
654 E Tv, T1F, T1b, T1N, T1p, T1P, T1L, T1R;
655 Tv = Tf + Tu;
656 T1F = T1x + T1E;
657 {
658 E TN, T1a, T1f, T1o;
659 TN = TB + TM;
660 T1a = TY + T19;
661 T1b = TN + T1a;
662 T1N = TN - T1a;
663 T1f = T1d + T1e;
664 T1o = T1i + T1n;
665 T1p = T1f + T1o;
666 T1P = T1o - T1f;
667 {
668 E T1I, T1K, T1H, T1J;
669 T1I = Tf - Tu;
670 T1K = T1E - T1x;
671 T1H = W[14];
672 T1J = W[15];
673 T1L = FNMS(T1J, T1K, T1H * T1I);
674 T1R = FMA(T1J, T1I, T1H * T1K);
675 }
676 }
677 {
678 E T1q, T1G, Tw, T1c;
679 Tw = W[0];
680 T1c = W[1];
681 T1q = FMA(Tw, T1b, T1c * T1p);
682 T1G = FNMS(T1c, T1b, Tw * T1p);
683 Rp[0] = Tv - T1q;
684 Ip[0] = T1F + T1G;
685 Rm[0] = Tv + T1q;
686 Im[0] = T1G - T1F;
687 }
688 {
689 E T1Q, T1S, T1M, T1O;
690 T1M = W[16];
691 T1O = W[17];
692 T1Q = FMA(T1M, T1N, T1O * T1P);
693 T1S = FNMS(T1O, T1N, T1M * T1P);
694 Rp[WS(rs, 4)] = T1L - T1Q;
695 Ip[WS(rs, 4)] = T1R + T1S;
696 Rm[WS(rs, 4)] = T1L + T1Q;
697 Im[WS(rs, 4)] = T1S - T1R;
698 }
699 }
/* Output stage B: stores at offsets WS(rs, 2) (using W[6..9]) and
 * WS(rs, 6) (using W[22..25]). */
700 {
701 E T25, T2j, T29, T2l, T21, T2b, T2h, T2n;
702 {
703 E T23, T24, T27, T28;
704 T23 = TB - TM;
705 T24 = T1d - T1e;
706 T25 = T23 + T24;
707 T2j = T23 - T24;
708 T27 = T19 - TY;
709 T28 = T1n - T1i;
710 T29 = T27 + T28;
711 T2l = T28 - T27;
712 }
713 {
714 E T1W, T20, T1T, T1X;
715 T1W = T1U + T1V;
716 T20 = T1Y + T1Z;
717 T1T = W[6];
718 T1X = W[7];
719 T21 = FNMS(T1X, T20, T1T * T1W);
720 T2b = FMA(T1X, T1W, T1T * T20);
721 }
722 {
723 E T2e, T2g, T2d, T2f;
724 T2e = T1U - T1V;
725 T2g = T1Z - T1Y;
726 T2d = W[22];
727 T2f = W[23];
728 T2h = FNMS(T2f, T2g, T2d * T2e);
729 T2n = FMA(T2f, T2e, T2d * T2g);
730 }
731 {
732 E T2a, T2c, T22, T26;
733 T22 = W[8];
734 T26 = W[9];
735 T2a = FMA(T22, T25, T26 * T29);
736 T2c = FNMS(T26, T25, T22 * T29);
737 Rp[WS(rs, 2)] = T21 - T2a;
738 Ip[WS(rs, 2)] = T2b + T2c;
739 Rm[WS(rs, 2)] = T21 + T2a;
740 Im[WS(rs, 2)] = T2c - T2b;
741 }
742 {
743 E T2m, T2o, T2i, T2k;
744 T2i = W[24];
745 T2k = W[25];
746 T2m = FMA(T2i, T2j, T2k * T2l);
747 T2o = FNMS(T2k, T2j, T2i * T2l);
748 Rp[WS(rs, 6)] = T2h - T2m;
749 Ip[WS(rs, 6)] = T2n + T2o;
750 Rm[WS(rs, 6)] = T2h + T2m;
751 Im[WS(rs, 6)] = T2o - T2n;
752 }
753 }
/* Output stage C: stores at offsets WS(rs, 1) (using W[2..5]) and
 * WS(rs, 5) (using W[18..21]). */
754 {
755 E T2A, T38, T2I, T3a, T2V, T3d, T33, T3f, T2z, T2E;
756 T2z = KP707106781 * (T2v + T2y);
757 T2A = T2s + T2z;
758 T38 = T2s - T2z;
759 T2E = KP707106781 * (T2C + T2D);
760 T2I = T2E + T2H;
761 T3a = T2H - T2E;
762 {
763 E T2N, T2U, T2Z, T32;
764 T2N = T2L + T2M;
765 T2U = T2Q - T2T;
766 T2V = T2N + T2U;
767 T3d = T2N - T2U;
768 T2Z = T2X + T2Y;
769 T32 = T30 - T31;
770 T33 = T2Z + T32;
771 T3f = T32 - T2Z;
772 }
773 {
774 E T2J, T35, T34, T36;
775 {
776 E T2p, T2B, T2K, T2W;
777 T2p = W[2];
778 T2B = W[3];
779 T2J = FNMS(T2B, T2I, T2p * T2A);
780 T35 = FMA(T2B, T2A, T2p * T2I);
781 T2K = W[4];
782 T2W = W[5];
783 T34 = FMA(T2K, T2V, T2W * T33);
784 T36 = FNMS(T2W, T2V, T2K * T33);
785 }
786 Rp[WS(rs, 1)] = T2J - T34;
787 Ip[WS(rs, 1)] = T35 + T36;
788 Rm[WS(rs, 1)] = T2J + T34;
789 Im[WS(rs, 1)] = T36 - T35;
790 }
791 {
792 E T3b, T3h, T3g, T3i;
793 {
794 E T37, T39, T3c, T3e;
795 T37 = W[18];
796 T39 = W[19];
797 T3b = FNMS(T39, T3a, T37 * T38);
798 T3h = FMA(T39, T38, T37 * T3a);
799 T3c = W[20];
800 T3e = W[21];
801 T3g = FMA(T3c, T3d, T3e * T3f);
802 T3i = FNMS(T3e, T3d, T3c * T3f);
803 }
804 Rp[WS(rs, 5)] = T3b - T3g;
805 Ip[WS(rs, 5)] = T3h + T3i;
806 Rm[WS(rs, 5)] = T3b + T3g;
807 Im[WS(rs, 5)] = T3i - T3h;
808 }
809 }
/* Output stage D: stores at offsets WS(rs, 3) (using W[10..13]) and
 * WS(rs, 7) (using W[26..29]). */
810 {
811 E T3m, T3E, T3q, T3G, T3v, T3J, T3z, T3L, T3l, T3o;
812 T3l = KP707106781 * (T2D - T2C);
813 T3m = T3k + T3l;
814 T3E = T3k - T3l;
815 T3o = KP707106781 * (T2v - T2y);
816 T3q = T3o + T3p;
817 T3G = T3p - T3o;
818 {
819 E T3t, T3u, T3x, T3y;
820 T3t = T2L - T2M;
821 T3u = T2X - T2Y;
822 T3v = T3t + T3u;
823 T3J = T3t - T3u;
824 T3x = T31 + T30;
825 T3y = T2Q + T2T;
826 T3z = T3x - T3y;
827 T3L = T3y + T3x;
828 }
829 {
830 E T3r, T3B, T3A, T3C;
831 {
832 E T3j, T3n, T3s, T3w;
833 T3j = W[10];
834 T3n = W[11];
835 T3r = FNMS(T3n, T3q, T3j * T3m);
836 T3B = FMA(T3n, T3m, T3j * T3q);
837 T3s = W[12];
838 T3w = W[13];
839 T3A = FMA(T3s, T3v, T3w * T3z);
840 T3C = FNMS(T3w, T3v, T3s * T3z);
841 }
842 Rp[WS(rs, 3)] = T3r - T3A;
843 Ip[WS(rs, 3)] = T3B + T3C;
844 Rm[WS(rs, 3)] = T3r + T3A;
845 Im[WS(rs, 3)] = T3C - T3B;
846 }
847 {
848 E T3H, T3N, T3M, T3O;
849 {
850 E T3D, T3F, T3I, T3K;
851 T3D = W[26];
852 T3F = W[27];
853 T3H = FNMS(T3F, T3G, T3D * T3E);
854 T3N = FMA(T3F, T3E, T3D * T3G);
855 T3I = W[28];
856 T3K = W[29];
857 T3M = FMA(T3I, T3J, T3K * T3L);
858 T3O = FNMS(T3K, T3J, T3I * T3L);
859 }
860 Rp[WS(rs, 7)] = T3H - T3M;
861 Ip[WS(rs, 7)] = T3N + T3O;
862 Rm[WS(rs, 7)] = T3H + T3M;
863 Im[WS(rs, 7)] = T3O - T3N;
864 }
865 }
866 }
867 }
868 }
869
/*
 * Two-entry tw_instr table referenced by the descriptor `desc` in this
 * file: a TW_FULL entry with arguments (1, 16) followed by a TW_NEXT entry
 * with arguments (1, 0).
 * NOTE(review): the meaning of the tw_instr fields is defined outside this
 * file; presumably this describes the layout of the W table the kernel
 * reads -- confirm against the tw_instr declaration.
 */
870 static const tw_instr twinstr[] = {
871 {TW_FULL, 1, 16},
872 {TW_NEXT, 1, 0}
873 };
874
/*
 * Descriptor handed to khc2c_register for this variant: the value 16, the
 * name string "hc2cbdft_16", the twinstr table, the address of GENUS, and
 * the tuple {168, 46, 38, 0}.  The first three tuple values equal the
 * additions / multiplications / fused multiply-add counts quoted in the
 * generated comment at the top of this non-FMA section.
 * NOTE(review): field names of hc2c_desc are not visible here -- confirm.
 */
875 static const hc2c_desc desc = { 16, "hc2cbdft_16", twinstr, &GENUS, {168, 46, 38, 0} };
876
/*
 * Registration entry point (non-FMA variant): passes the planner p, the
 * hc2cbdft_16 kernel, its descriptor and the HC2C_VIA_DFT constant to
 * khc2c_register.  Both this function's name and the callee's name are
 * wrapped in the X() macro, which is defined outside this file.
 */
877 void X(codelet_hc2cbdft_16) (planner *p) {
878 X(khc2c_register) (p, hc2cbdft_16, &desc, HC2C_VIA_DFT);
879 }
880 #endif /* HAVE_FMA */