To check out this repository please hg clone the following URL, or open the URL using EasyMercurial or your preferred Mercurial client.

The primary repository for this project is hosted at https://github.com/sonic-visualiser/sv-dependency-builds .
This repository is a read-only copy which is updated automatically every hour.

Statistics Download as Zip
| Branch: | Tag: | Revision:

root / src / fftw-3.3.8 / dft / scalar / codelets / t2_16.c @ 167:bd3cc4d1df30

History | View | Annotate | Download (21.9 KB)

1
/*
2
 * Copyright (c) 2003, 2007-14 Matteo Frigo
3
 * Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
4
 *
5
 * This program is free software; you can redistribute it and/or modify
6
 * it under the terms of the GNU General Public License as published by
7
 * the Free Software Foundation; either version 2 of the License, or
8
 * (at your option) any later version.
9
 *
10
 * This program is distributed in the hope that it will be useful,
11
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
12
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
13
 * GNU General Public License for more details.
14
 *
15
 * You should have received a copy of the GNU General Public License
16
 * along with this program; if not, write to the Free Software
17
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
18
 *
19
 */
20

    
21
/* This file was automatically generated --- DO NOT EDIT */
22
/* Generated on Thu May 24 08:04:19 EDT 2018 */
23

    
24
#include "dft/codelet-dft.h"
25

    
26
#if defined(ARCH_PREFERS_FMA) || defined(ISA_EXTENSION_PREFERS_FMA)
27

    
28
/* Generated by: ../../../genfft/gen_twiddle.native -fma -compact -variables 4 -pipeline-latency 4 -twiddle-log3 -precompute-twiddles -n 16 -name t2_16 -include dft/scalar/t.h */
29

    
30
/*
31
 * This function contains 196 FP additions, 134 FP multiplications,
32
 * (or, 104 additions, 42 multiplications, 92 fused multiply/add),
33
 * 90 stack variables, 3 constants, and 64 memory accesses
34
 */
35
#include "dft/scalar/t.h"
36

    
37
/*
 * t2_16 -- size-16 twiddle codelet, FMA flavour (machine generated by
 * genfft; the arithmetic below is reproduced verbatim, only comments were
 * added and the web-viewer line-number residue was removed).
 *
 * Parameters:
 *   ri, ii  real and imaginary arrays, transformed in place; the sixteen
 *           points of one transform are addressed as ri[WS(rs, k)] and
 *           ii[WS(rs, k)] for k = 0..15.
 *   W       twiddle table; it is first advanced by mb * 8 and then by 8
 *           per iteration, and only W[0]..W[7] are read each iteration.
 *   rs      stride handed to WS() to locate point k.
 *   mb, me  half-open iteration range: the loop runs for m = mb .. me - 1.
 *   ms      amount added to ri and ii after every iteration.
 *
 * The pairs (W[0],W[1]), (W[2],W[3]), (W[4],W[5]) and (W[6],W[7]) are
 * applied directly to inputs 1, 3, 9 and 15; the factors for the other
 * inputs are computed from products of those four pairs.
 *
 * FMA(), FNMS(), DK(), WS() and MAKE_VOLATILE_STRIDE() come from the
 * project headers and are not defined in this file.
 */
static void t2_16(R *ri, R *ii, const R *W, stride rs, INT mb, INT me, INT ms)
{
     /* Numerically: cos(pi/8), tan(pi/8) = sqrt(2) - 1, and sqrt(2)/2. */
     DK(KP923879532, +0.923879532511286756128183189396788286822416626);
     DK(KP414213562, +0.414213562373095048801688724209698078569671875);
     DK(KP707106781, +0.707106781186547524400844362104849039284835938);
     {
          INT m;
          for (m = mb, W = W + (mb * 8); m < me; m = m + 1, ri = ri + ms, ii = ii + ms, W = W + 8, MAKE_VOLATILE_STRIDE(32, rs)) {
               E T2, Tf, TM, TO, T3, T6, T5, Th, Tz, Ti, T7, TZ, TT, Tq, TW;
               E Tb, Tu, TP, TI, TF, TC, T1z, T1O, T1D, T1L, Tm, T1f, T1p, T1j, T1m;
               /* Load the eight stored twiddle values and derive every
                  other factor used below from their products. */
               {
                    E TN, TS, T4, Tp, Ta, Tt, Tl, Tg;
                    T2 = W[0];
                    Tf = W[2];
                    Tg = T2 * Tf;
                    TM = W[6];
                    TN = T2 * TM;
                    TO = W[7];
                    TS = T2 * TO;
                    T3 = W[4];
                    T4 = T2 * T3;
                    Tp = Tf * T3;
                    T6 = W[5];
                    Ta = T2 * T6;
                    Tt = Tf * T6;
                    T5 = W[1];
                    Th = W[3];
                    Tl = T2 * Th;
                    Tz = FMA(T5, Th, Tg);
                    Ti = FNMS(T5, Th, Tg);
                    T7 = FMA(T5, T6, T4);
                    TZ = FNMS(Th, T3, Tt);
                    TT = FNMS(T5, TM, TS);
                    Tq = FNMS(Th, T6, Tp);
                    TW = FMA(Th, T6, Tp);
                    Tb = FNMS(T5, T3, Ta);
                    Tu = FMA(Th, T3, Tt);
                    TP = FMA(T5, TO, TN);
                    TI = FMA(T5, T3, Ta);
                    TF = FNMS(T5, T6, T4);
                    {
                         E T1y, T1C, T1e, T1i;
                         T1y = Tz * T3;
                         T1C = Tz * T6;
                         TC = FNMS(T5, Tf, Tl);
                         T1z = FMA(TC, T6, T1y);
                         T1O = FMA(TC, T3, T1C);
                         T1D = FNMS(TC, T3, T1C);
                         T1L = FNMS(TC, T6, T1y);
                         T1e = Ti * T3;
                         T1i = Ti * T6;
                         Tm = FMA(T5, Tf, Tl);
                         T1f = FMA(Tm, T6, T1e);
                         T1p = FMA(Tm, T3, T1i);
                         T1j = FNMS(Tm, T3, T1i);
                         T1m = FNMS(Tm, T6, T1e);
                    }
               }
               {
                    E Te, T1U, T3A, T3L, T1G, T2D, T2A, T3h, T1R, T2B, T2I, T3i, Tx, T3M, T1Z;
                    E T3w, TL, T26, T25, T37, T1d, T2o, T2l, T3c, T1s, T2m, T2t, T3d, T12, T28;
                    E T2d, T38;
                    /* Each of the next eight blocks loads two input points
                       (indices k and k + 8, modulo 16), multiplies them by
                       their twiddle factors and keeps the sum and difference
                       of the real and of the imaginary parts. */
                    /* Inputs 0 and 8 (input 0 is used untwiddled). */
                    {
                         E T1, T3z, T8, T9, Tc, T3x, Td, T3y;
                         T1 = ri[0];
                         T3z = ii[0];
                         T8 = ri[WS(rs, 8)];
                         T9 = T7 * T8;
                         Tc = ii[WS(rs, 8)];
                         T3x = T7 * Tc;
                         Td = FMA(Tb, Tc, T9);
                         Te = T1 + Td;
                         T1U = T1 - Td;
                         T3y = FNMS(Tb, T8, T3x);
                         T3A = T3y + T3z;
                         T3L = T3z - T3y;
                    }
                    /* Inputs 15 and 7. */
                    {
                         E T1u, T1v, T1w, T2w, T1A, T1B, T1E, T2y;
                         T1u = ri[WS(rs, 15)];
                         T1v = TM * T1u;
                         T1w = ii[WS(rs, 15)];
                         T2w = TM * T1w;
                         T1A = ri[WS(rs, 7)];
                         T1B = T1z * T1A;
                         T1E = ii[WS(rs, 7)];
                         T2y = T1z * T1E;
                         {
                              E T1x, T1F, T2x, T2z;
                              T1x = FMA(TO, T1w, T1v);
                              T1F = FMA(T1D, T1E, T1B);
                              T1G = T1x + T1F;
                              T2D = T1x - T1F;
                              T2x = FNMS(TO, T1u, T2w);
                              T2z = FNMS(T1D, T1A, T2y);
                              T2A = T2x - T2z;
                              T3h = T2x + T2z;
                         }
                    }
                    /* Inputs 3 and 11. */
                    {
                         E T1H, T1I, T1J, T2E, T1M, T1N, T1P, T2G;
                         T1H = ri[WS(rs, 3)];
                         T1I = Tf * T1H;
                         T1J = ii[WS(rs, 3)];
                         T2E = Tf * T1J;
                         T1M = ri[WS(rs, 11)];
                         T1N = T1L * T1M;
                         T1P = ii[WS(rs, 11)];
                         T2G = T1L * T1P;
                         {
                              E T1K, T1Q, T2F, T2H;
                              T1K = FMA(Th, T1J, T1I);
                              T1Q = FMA(T1O, T1P, T1N);
                              T1R = T1K + T1Q;
                              T2B = T1K - T1Q;
                              T2F = FNMS(Th, T1H, T2E);
                              T2H = FNMS(T1O, T1M, T2G);
                              T2I = T2F - T2H;
                              T3i = T2F + T2H;
                         }
                    }
                    /* Inputs 4 and 12. */
                    {
                         E Tj, Tk, Tn, T1V, Tr, Ts, Tv, T1X;
                         Tj = ri[WS(rs, 4)];
                         Tk = Ti * Tj;
                         Tn = ii[WS(rs, 4)];
                         T1V = Ti * Tn;
                         Tr = ri[WS(rs, 12)];
                         Ts = Tq * Tr;
                         Tv = ii[WS(rs, 12)];
                         T1X = Tq * Tv;
                         {
                              E To, Tw, T1W, T1Y;
                              To = FMA(Tm, Tn, Tk);
                              Tw = FMA(Tu, Tv, Ts);
                              Tx = To + Tw;
                              T3M = To - Tw;
                              T1W = FNMS(Tm, Tj, T1V);
                              T1Y = FNMS(Tu, Tr, T1X);
                              T1Z = T1W - T1Y;
                              T3w = T1W + T1Y;
                         }
                    }
                    /* Inputs 2 and 10. */
                    {
                         E TA, TB, TD, T21, TG, TH, TJ, T23;
                         TA = ri[WS(rs, 2)];
                         TB = Tz * TA;
                         TD = ii[WS(rs, 2)];
                         T21 = Tz * TD;
                         TG = ri[WS(rs, 10)];
                         TH = TF * TG;
                         TJ = ii[WS(rs, 10)];
                         T23 = TF * TJ;
                         {
                              E TE, TK, T22, T24;
                              TE = FMA(TC, TD, TB);
                              TK = FMA(TI, TJ, TH);
                              TL = TE + TK;
                              T26 = TE - TK;
                              T22 = FNMS(TC, TA, T21);
                              T24 = FNMS(TI, TG, T23);
                              T25 = T22 - T24;
                              T37 = T22 + T24;
                         }
                    }
                    /* Inputs 1 and 9. */
                    {
                         E T15, T16, T17, T2h, T19, T1a, T1b, T2j;
                         T15 = ri[WS(rs, 1)];
                         T16 = T2 * T15;
                         T17 = ii[WS(rs, 1)];
                         T2h = T2 * T17;
                         T19 = ri[WS(rs, 9)];
                         T1a = T3 * T19;
                         T1b = ii[WS(rs, 9)];
                         T2j = T3 * T1b;
                         {
                              E T18, T1c, T2i, T2k;
                              T18 = FMA(T5, T17, T16);
                              T1c = FMA(T6, T1b, T1a);
                              T1d = T18 + T1c;
                              T2o = T18 - T1c;
                              T2i = FNMS(T5, T15, T2h);
                              T2k = FNMS(T6, T19, T2j);
                              T2l = T2i - T2k;
                              T3c = T2i + T2k;
                         }
                    }
                    /* Inputs 5 and 13. */
                    {
                         E T1g, T1h, T1k, T2p, T1n, T1o, T1q, T2r;
                         T1g = ri[WS(rs, 5)];
                         T1h = T1f * T1g;
                         T1k = ii[WS(rs, 5)];
                         T2p = T1f * T1k;
                         T1n = ri[WS(rs, 13)];
                         T1o = T1m * T1n;
                         T1q = ii[WS(rs, 13)];
                         T2r = T1m * T1q;
                         {
                              E T1l, T1r, T2q, T2s;
                              T1l = FMA(T1j, T1k, T1h);
                              T1r = FMA(T1p, T1q, T1o);
                              T1s = T1l + T1r;
                              T2m = T1l - T1r;
                              T2q = FNMS(T1j, T1g, T2p);
                              T2s = FNMS(T1p, T1n, T2r);
                              T2t = T2q - T2s;
                              T3d = T2q + T2s;
                         }
                    }
                    /* Inputs 14 and 6. */
                    {
                         E TQ, TR, TU, T29, TX, TY, T10, T2b;
                         TQ = ri[WS(rs, 14)];
                         TR = TP * TQ;
                         TU = ii[WS(rs, 14)];
                         T29 = TP * TU;
                         TX = ri[WS(rs, 6)];
                         TY = TW * TX;
                         T10 = ii[WS(rs, 6)];
                         T2b = TW * T10;
                         {
                              E TV, T11, T2a, T2c;
                              TV = FMA(TT, TU, TR);
                              T11 = FMA(TZ, T10, TY);
                              T12 = TV + T11;
                              T28 = TV - T11;
                              T2a = FNMS(TT, TQ, T29);
                              T2c = FNMS(TZ, TX, T2b);
                              T2d = T2a - T2c;
                              T38 = T2a + T2c;
                         }
                    }
                    /* Combine the pairwise sums and store outputs 0, 4, 8
                       and 12. */
                    {
                         E T14, T3q, T3C, T3E, T1T, T3D, T3t, T3u;
                         {
                              E Ty, T13, T3v, T3B;
                              Ty = Te + Tx;
                              T13 = TL + T12;
                              T14 = Ty + T13;
                              T3q = Ty - T13;
                              T3v = T37 + T38;
                              T3B = T3w + T3A;
                              T3C = T3v + T3B;
                              T3E = T3B - T3v;
                         }
                         {
                              E T1t, T1S, T3r, T3s;
                              T1t = T1d + T1s;
                              T1S = T1G + T1R;
                              T1T = T1t + T1S;
                              T3D = T1S - T1t;
                              T3r = T3c + T3d;
                              T3s = T3h + T3i;
                              T3t = T3r - T3s;
                              T3u = T3r + T3s;
                         }
                         ri[WS(rs, 8)] = T14 - T1T;
                         ii[WS(rs, 8)] = T3C - T3u;
                         ri[0] = T14 + T1T;
                         ii[0] = T3u + T3C;
                         ri[WS(rs, 12)] = T3q - T3t;
                         ii[WS(rs, 12)] = T3E - T3D;
                         ri[WS(rs, 4)] = T3q + T3t;
                         ii[WS(rs, 4)] = T3D + T3E;
                    }
                    /* Store outputs 2, 6, 10 and 14; these need only the
                       KP707106781 constant. */
                    {
                         E T3a, T3m, T3H, T3J, T3f, T3n, T3k, T3o;
                         {
                              E T36, T39, T3F, T3G;
                              T36 = Te - Tx;
                              T39 = T37 - T38;
                              T3a = T36 + T39;
                              T3m = T36 - T39;
                              T3F = T12 - TL;
                              T3G = T3A - T3w;
                              T3H = T3F + T3G;
                              T3J = T3G - T3F;
                         }
                         {
                              E T3b, T3e, T3g, T3j;
                              T3b = T1d - T1s;
                              T3e = T3c - T3d;
                              T3f = T3b + T3e;
                              T3n = T3e - T3b;
                              T3g = T1G - T1R;
                              T3j = T3h - T3i;
                              T3k = T3g - T3j;
                              T3o = T3g + T3j;
                         }
                         {
                              E T3l, T3I, T3p, T3K;
                              T3l = T3f + T3k;
                              ri[WS(rs, 10)] = FNMS(KP707106781, T3l, T3a);
                              ri[WS(rs, 2)] = FMA(KP707106781, T3l, T3a);
                              T3I = T3n + T3o;
                              ii[WS(rs, 2)] = FMA(KP707106781, T3I, T3H);
                              ii[WS(rs, 10)] = FNMS(KP707106781, T3I, T3H);
                              T3p = T3n - T3o;
                              ri[WS(rs, 14)] = FNMS(KP707106781, T3p, T3m);
                              ri[WS(rs, 6)] = FMA(KP707106781, T3p, T3m);
                              T3K = T3k - T3f;
                              ii[WS(rs, 6)] = FMA(KP707106781, T3K, T3J);
                              ii[WS(rs, 14)] = FNMS(KP707106781, T3K, T3J);
                         }
                    }
                    /* Store the odd-indexed outputs 1, 3, 5, ..., 15 from
                       the pairwise differences; all three constants are
                       used here. */
                    {
                         E T20, T3N, T3T, T2Q, T2f, T3O, T30, T34, T2T, T3U, T2v, T2N, T2X, T33, T2K;
                         E T2O;
                         {
                              E T27, T2e, T2n, T2u;
                              T20 = T1U - T1Z;
                              T3N = T3L - T3M;
                              T3T = T3M + T3L;
                              T2Q = T1U + T1Z;
                              T27 = T25 - T26;
                              T2e = T28 + T2d;
                              T2f = T27 - T2e;
                              T3O = T27 + T2e;
                              {
                                   E T2Y, T2Z, T2R, T2S;
                                   T2Y = T2D + T2I;
                                   T2Z = T2A - T2B;
                                   T30 = FNMS(KP414213562, T2Z, T2Y);
                                   T34 = FMA(KP414213562, T2Y, T2Z);
                                   T2R = T26 + T25;
                                   T2S = T28 - T2d;
                                   T2T = T2R + T2S;
                                   T3U = T2S - T2R;
                              }
                              T2n = T2l + T2m;
                              T2u = T2o - T2t;
                              T2v = FMA(KP414213562, T2u, T2n);
                              T2N = FNMS(KP414213562, T2n, T2u);
                              {
                                   E T2V, T2W, T2C, T2J;
                                   T2V = T2o + T2t;
                                   T2W = T2l - T2m;
                                   T2X = FMA(KP414213562, T2W, T2V);
                                   T33 = FNMS(KP414213562, T2V, T2W);
                                   T2C = T2A + T2B;
                                   T2J = T2D - T2I;
                                   T2K = FNMS(KP414213562, T2J, T2C);
                                   T2O = FMA(KP414213562, T2C, T2J);
                              }
                         }
                         /* Outputs 3 and 11. */
                         {
                              E T2g, T2L, T3V, T3W;
                              T2g = FMA(KP707106781, T2f, T20);
                              T2L = T2v - T2K;
                              ri[WS(rs, 11)] = FNMS(KP923879532, T2L, T2g);
                              ri[WS(rs, 3)] = FMA(KP923879532, T2L, T2g);
                              T3V = FMA(KP707106781, T3U, T3T);
                              T3W = T2O - T2N;
                              ii[WS(rs, 3)] = FMA(KP923879532, T3W, T3V);
                              ii[WS(rs, 11)] = FNMS(KP923879532, T3W, T3V);
                         }
                         /* Outputs 7 and 15. */
                         {
                              E T2M, T2P, T3X, T3Y;
                              T2M = FNMS(KP707106781, T2f, T20);
                              T2P = T2N + T2O;
                              ri[WS(rs, 7)] = FNMS(KP923879532, T2P, T2M);
                              ri[WS(rs, 15)] = FMA(KP923879532, T2P, T2M);
                              T3X = FNMS(KP707106781, T3U, T3T);
                              T3Y = T2v + T2K;
                              ii[WS(rs, 7)] = FNMS(KP923879532, T3Y, T3X);
                              ii[WS(rs, 15)] = FMA(KP923879532, T3Y, T3X);
                         }
                         /* Outputs 1 and 9. */
                         {
                              E T2U, T31, T3P, T3Q;
                              T2U = FMA(KP707106781, T2T, T2Q);
                              T31 = T2X + T30;
                              ri[WS(rs, 9)] = FNMS(KP923879532, T31, T2U);
                              ri[WS(rs, 1)] = FMA(KP923879532, T31, T2U);
                              T3P = FMA(KP707106781, T3O, T3N);
                              T3Q = T33 + T34;
                              ii[WS(rs, 1)] = FMA(KP923879532, T3Q, T3P);
                              ii[WS(rs, 9)] = FNMS(KP923879532, T3Q, T3P);
                         }
                         /* Outputs 5 and 13. */
                         {
                              E T32, T35, T3R, T3S;
                              T32 = FNMS(KP707106781, T2T, T2Q);
                              T35 = T33 - T34;
                              ri[WS(rs, 13)] = FNMS(KP923879532, T35, T32);
                              ri[WS(rs, 5)] = FMA(KP923879532, T35, T32);
                              T3R = FNMS(KP707106781, T3O, T3N);
                              T3S = T30 - T2X;
                              ii[WS(rs, 5)] = FMA(KP923879532, T3S, T3R);
                              ii[WS(rs, 13)] = FNMS(KP923879532, T3S, T3R);
                         }
                    }
               }
          }
     }
}
430

    
431
static const tw_instr twinstr[] = {
432
     {TW_CEXP, 0, 1},
433
     {TW_CEXP, 0, 3},
434
     {TW_CEXP, 0, 9},
435
     {TW_CEXP, 0, 15},
436
     {TW_NEXT, 1, 0}
437
};
438

    
439
/*
 * Descriptor handed to the planner together with t2_16.
 * NOTE(review): field meanings are taken from the values, not from the
 * ct_desc declaration (which lives in the project headers) -- confirm there.
 */
static const ct_desc desc = {
     16,                  /* matches the "-n 16" generator option */
     "t2_16",             /* codelet name */
     twinstr,             /* twiddle instruction table defined above */
     &GENUS,              /* supplied by the included project header */
     {104, 42, 92, 0},    /* same add / mul / fma counts as the generator's header comment */
     0,
     0,
     0
};
440

    
441
/*
 * Registration hook: hands the t2_16 kernel and its descriptor to the
 * planner `p` through X(kdft_dit_register).  X() is the project's
 * name-mangling macro and is defined outside this file.
 */
void X(codelet_t2_16) (planner *p) {
     X(kdft_dit_register) (p, t2_16, &desc);
}
444
#else
445

    
446
/* Generated by: ../../../genfft/gen_twiddle.native -compact -variables 4 -pipeline-latency 4 -twiddle-log3 -precompute-twiddles -n 16 -name t2_16 -include dft/scalar/t.h */
447

    
448
/*
449
 * This function contains 196 FP additions, 108 FP multiplications,
450
 * (or, 156 additions, 68 multiplications, 40 fused multiply/add),
451
 * 82 stack variables, 3 constants, and 64 memory accesses
452
 */
453
#include "dft/scalar/t.h"
454

    
455
static void t2_16(R *ri, R *ii, const R *W, stride rs, INT mb, INT me, INT ms)
456
{
457
     DK(KP382683432, +0.382683432365089771728459984030398866761344562);
458
     DK(KP923879532, +0.923879532511286756128183189396788286822416626);
459
     DK(KP707106781, +0.707106781186547524400844362104849039284835938);
460
     {
461
          INT m;
462
          for (m = mb, W = W + (mb * 8); m < me; m = m + 1, ri = ri + ms, ii = ii + ms, W = W + 8, MAKE_VOLATILE_STRIDE(32, rs)) {
463
               E T2, T5, Tg, Ti, Tk, To, TE, TC, T6, T3, T8, TW, TJ, Tt, TU;
464
               E Tc, Tx, TH, TN, TO, TP, TR, T1f, T1k, T1b, T1i, T1y, T1H, T1u, T1F;
465
               {
466
                    E T7, Tv, Ta, Ts, T4, Tw, Tb, Tr;
467
                    {
468
                         E Th, Tn, Tj, Tm;
469
                         T2 = W[0];
470
                         T5 = W[1];
471
                         Tg = W[2];
472
                         Ti = W[3];
473
                         Th = T2 * Tg;
474
                         Tn = T5 * Tg;
475
                         Tj = T5 * Ti;
476
                         Tm = T2 * Ti;
477
                         Tk = Th - Tj;
478
                         To = Tm + Tn;
479
                         TE = Tm - Tn;
480
                         TC = Th + Tj;
481
                         T6 = W[5];
482
                         T7 = T5 * T6;
483
                         Tv = Tg * T6;
484
                         Ta = T2 * T6;
485
                         Ts = Ti * T6;
486
                         T3 = W[4];
487
                         T4 = T2 * T3;
488
                         Tw = Ti * T3;
489
                         Tb = T5 * T3;
490
                         Tr = Tg * T3;
491
                    }
492
                    T8 = T4 + T7;
493
                    TW = Tv - Tw;
494
                    TJ = Ta + Tb;
495
                    Tt = Tr - Ts;
496
                    TU = Tr + Ts;
497
                    Tc = Ta - Tb;
498
                    Tx = Tv + Tw;
499
                    TH = T4 - T7;
500
                    TN = W[6];
501
                    TO = W[7];
502
                    TP = FMA(T2, TN, T5 * TO);
503
                    TR = FNMS(T5, TN, T2 * TO);
504
                    {
505
                         E T1d, T1e, T19, T1a;
506
                         T1d = Tk * T6;
507
                         T1e = To * T3;
508
                         T1f = T1d - T1e;
509
                         T1k = T1d + T1e;
510
                         T19 = Tk * T3;
511
                         T1a = To * T6;
512
                         T1b = T19 + T1a;
513
                         T1i = T19 - T1a;
514
                    }
515
                    {
516
                         E T1w, T1x, T1s, T1t;
517
                         T1w = TC * T6;
518
                         T1x = TE * T3;
519
                         T1y = T1w - T1x;
520
                         T1H = T1w + T1x;
521
                         T1s = TC * T3;
522
                         T1t = TE * T6;
523
                         T1u = T1s + T1t;
524
                         T1F = T1s - T1t;
525
                    }
526
               }
527
               {
528
                    E Tf, T3r, T1N, T3e, TA, T3s, T1Q, T3b, TM, T2M, T1W, T2w, TZ, T2N, T21;
529
                    E T2x, T1B, T1K, T2V, T2W, T2X, T2Y, T2j, T2D, T2o, T2E, T18, T1n, T2Q, T2R;
530
                    E T2S, T2T, T28, T2A, T2d, T2B;
531
                    {
532
                         E T1, T3d, Te, T3c, T9, Td;
533
                         T1 = ri[0];
534
                         T3d = ii[0];
535
                         T9 = ri[WS(rs, 8)];
536
                         Td = ii[WS(rs, 8)];
537
                         Te = FMA(T8, T9, Tc * Td);
538
                         T3c = FNMS(Tc, T9, T8 * Td);
539
                         Tf = T1 + Te;
540
                         T3r = T3d - T3c;
541
                         T1N = T1 - Te;
542
                         T3e = T3c + T3d;
543
                    }
544
                    {
545
                         E Tq, T1O, Tz, T1P;
546
                         {
547
                              E Tl, Tp, Tu, Ty;
548
                              Tl = ri[WS(rs, 4)];
549
                              Tp = ii[WS(rs, 4)];
550
                              Tq = FMA(Tk, Tl, To * Tp);
551
                              T1O = FNMS(To, Tl, Tk * Tp);
552
                              Tu = ri[WS(rs, 12)];
553
                              Ty = ii[WS(rs, 12)];
554
                              Tz = FMA(Tt, Tu, Tx * Ty);
555
                              T1P = FNMS(Tx, Tu, Tt * Ty);
556
                         }
557
                         TA = Tq + Tz;
558
                         T3s = Tq - Tz;
559
                         T1Q = T1O - T1P;
560
                         T3b = T1O + T1P;
561
                    }
562
                    {
563
                         E TG, T1S, TL, T1T, T1U, T1V;
564
                         {
565
                              E TD, TF, TI, TK;
566
                              TD = ri[WS(rs, 2)];
567
                              TF = ii[WS(rs, 2)];
568
                              TG = FMA(TC, TD, TE * TF);
569
                              T1S = FNMS(TE, TD, TC * TF);
570
                              TI = ri[WS(rs, 10)];
571
                              TK = ii[WS(rs, 10)];
572
                              TL = FMA(TH, TI, TJ * TK);
573
                              T1T = FNMS(TJ, TI, TH * TK);
574
                         }
575
                         TM = TG + TL;
576
                         T2M = T1S + T1T;
577
                         T1U = T1S - T1T;
578
                         T1V = TG - TL;
579
                         T1W = T1U - T1V;
580
                         T2w = T1V + T1U;
581
                    }
582
                    {
583
                         E TT, T1Y, TY, T1Z, T1X, T20;
584
                         {
585
                              E TQ, TS, TV, TX;
586
                              TQ = ri[WS(rs, 14)];
587
                              TS = ii[WS(rs, 14)];
588
                              TT = FMA(TP, TQ, TR * TS);
589
                              T1Y = FNMS(TR, TQ, TP * TS);
590
                              TV = ri[WS(rs, 6)];
591
                              TX = ii[WS(rs, 6)];
592
                              TY = FMA(TU, TV, TW * TX);
593
                              T1Z = FNMS(TW, TV, TU * TX);
594
                         }
595
                         TZ = TT + TY;
596
                         T2N = T1Y + T1Z;
597
                         T1X = TT - TY;
598
                         T20 = T1Y - T1Z;
599
                         T21 = T1X + T20;
600
                         T2x = T1X - T20;
601
                    }
602
                    {
603
                         E T1r, T2k, T1J, T2h, T1A, T2l, T1E, T2g;
604
                         {
605
                              E T1p, T1q, T1G, T1I;
606
                              T1p = ri[WS(rs, 15)];
607
                              T1q = ii[WS(rs, 15)];
608
                              T1r = FMA(TN, T1p, TO * T1q);
609
                              T2k = FNMS(TO, T1p, TN * T1q);
610
                              T1G = ri[WS(rs, 11)];
611
                              T1I = ii[WS(rs, 11)];
612
                              T1J = FMA(T1F, T1G, T1H * T1I);
613
                              T2h = FNMS(T1H, T1G, T1F * T1I);
614
                         }
615
                         {
616
                              E T1v, T1z, T1C, T1D;
617
                              T1v = ri[WS(rs, 7)];
618
                              T1z = ii[WS(rs, 7)];
619
                              T1A = FMA(T1u, T1v, T1y * T1z);
620
                              T2l = FNMS(T1y, T1v, T1u * T1z);
621
                              T1C = ri[WS(rs, 3)];
622
                              T1D = ii[WS(rs, 3)];
623
                              T1E = FMA(Tg, T1C, Ti * T1D);
624
                              T2g = FNMS(Ti, T1C, Tg * T1D);
625
                         }
626
                         T1B = T1r + T1A;
627
                         T1K = T1E + T1J;
628
                         T2V = T1B - T1K;
629
                         T2W = T2k + T2l;
630
                         T2X = T2g + T2h;
631
                         T2Y = T2W - T2X;
632
                         {
633
                              E T2f, T2i, T2m, T2n;
634
                              T2f = T1r - T1A;
635
                              T2i = T2g - T2h;
636
                              T2j = T2f - T2i;
637
                              T2D = T2f + T2i;
638
                              T2m = T2k - T2l;
639
                              T2n = T1E - T1J;
640
                              T2o = T2m + T2n;
641
                              T2E = T2m - T2n;
642
                         }
643
                    }
644
                    {
645
                         E T14, T24, T1m, T2b, T17, T25, T1h, T2a;
646
                         {
647
                              E T12, T13, T1j, T1l;
648
                              T12 = ri[WS(rs, 1)];
649
                              T13 = ii[WS(rs, 1)];
650
                              T14 = FMA(T2, T12, T5 * T13);
651
                              T24 = FNMS(T5, T12, T2 * T13);
652
                              T1j = ri[WS(rs, 13)];
653
                              T1l = ii[WS(rs, 13)];
654
                              T1m = FMA(T1i, T1j, T1k * T1l);
655
                              T2b = FNMS(T1k, T1j, T1i * T1l);
656
                         }
657
                         {
658
                              E T15, T16, T1c, T1g;
659
                              T15 = ri[WS(rs, 9)];
660
                              T16 = ii[WS(rs, 9)];
661
                              T17 = FMA(T3, T15, T6 * T16);
662
                              T25 = FNMS(T6, T15, T3 * T16);
663
                              T1c = ri[WS(rs, 5)];
664
                              T1g = ii[WS(rs, 5)];
665
                              T1h = FMA(T1b, T1c, T1f * T1g);
666
                              T2a = FNMS(T1f, T1c, T1b * T1g);
667
                         }
668
                         T18 = T14 + T17;
669
                         T1n = T1h + T1m;
670
                         T2Q = T18 - T1n;
671
                         T2R = T24 + T25;
672
                         T2S = T2a + T2b;
673
                         T2T = T2R - T2S;
674
                         {
675
                              E T26, T27, T29, T2c;
676
                              T26 = T24 - T25;
677
                              T27 = T1h - T1m;
678
                              T28 = T26 + T27;
679
                              T2A = T26 - T27;
680
                              T29 = T14 - T17;
681
                              T2c = T2a - T2b;
682
                              T2d = T29 - T2c;
683
                              T2B = T29 + T2c;
684
                         }
685
                    }
686
                    {
687
                         E T23, T2r, T3A, T3C, T2q, T3B, T2u, T3x;
688
                         {
689
                              E T1R, T22, T3y, T3z;
690
                              T1R = T1N - T1Q;
691
                              T22 = KP707106781 * (T1W - T21);
692
                              T23 = T1R + T22;
693
                              T2r = T1R - T22;
694
                              T3y = KP707106781 * (T2x - T2w);
695
                              T3z = T3s + T3r;
696
                              T3A = T3y + T3z;
697
                              T3C = T3z - T3y;
698
                         }
699
                         {
700
                              E T2e, T2p, T2s, T2t;
701
                              T2e = FMA(KP923879532, T28, KP382683432 * T2d);
702
                              T2p = FNMS(KP923879532, T2o, KP382683432 * T2j);
703
                              T2q = T2e + T2p;
704
                              T3B = T2p - T2e;
705
                              T2s = FNMS(KP923879532, T2d, KP382683432 * T28);
706
                              T2t = FMA(KP382683432, T2o, KP923879532 * T2j);
707
                              T2u = T2s - T2t;
708
                              T3x = T2s + T2t;
709
                         }
710
                         ri[WS(rs, 11)] = T23 - T2q;
711
                         ii[WS(rs, 11)] = T3A - T3x;
712
                         ri[WS(rs, 3)] = T23 + T2q;
713
                         ii[WS(rs, 3)] = T3x + T3A;
714
                         ri[WS(rs, 15)] = T2r - T2u;
715
                         ii[WS(rs, 15)] = T3C - T3B;
716
                         ri[WS(rs, 7)] = T2r + T2u;
717
                         ii[WS(rs, 7)] = T3B + T3C;
718
                    }
719
                    {
720
                         E T2P, T31, T3m, T3o, T30, T3n, T34, T3j;
721
                         {
722
                              E T2L, T2O, T3k, T3l;
723
                              T2L = Tf - TA;
724
                              T2O = T2M - T2N;
725
                              T2P = T2L + T2O;
726
                              T31 = T2L - T2O;
727
                              T3k = TZ - TM;
728
                              T3l = T3e - T3b;
729
                              T3m = T3k + T3l;
730
                              T3o = T3l - T3k;
731
                         }
732
                         {
733
                              E T2U, T2Z, T32, T33;
734
                              T2U = T2Q + T2T;
735
                              T2Z = T2V - T2Y;
736
                              T30 = KP707106781 * (T2U + T2Z);
737
                              T3n = KP707106781 * (T2Z - T2U);
738
                              T32 = T2T - T2Q;
739
                              T33 = T2V + T2Y;
740
                              T34 = KP707106781 * (T32 - T33);
741
                              T3j = KP707106781 * (T32 + T33);
742
                         }
743
                         ri[WS(rs, 10)] = T2P - T30;
744
                         ii[WS(rs, 10)] = T3m - T3j;
745
                         ri[WS(rs, 2)] = T2P + T30;
746
                         ii[WS(rs, 2)] = T3j + T3m;
747
                         ri[WS(rs, 14)] = T31 - T34;
748
                         ii[WS(rs, 14)] = T3o - T3n;
749
                         ri[WS(rs, 6)] = T31 + T34;
750
                         ii[WS(rs, 6)] = T3n + T3o;
751
                    }
752
                    {
753
                         E T2z, T2H, T3u, T3w, T2G, T3v, T2K, T3p;
754
                         {
755
                              E T2v, T2y, T3q, T3t;
756
                              T2v = T1N + T1Q;
757
                              T2y = KP707106781 * (T2w + T2x);
758
                              T2z = T2v + T2y;
759
                              T2H = T2v - T2y;
760
                              T3q = KP707106781 * (T1W + T21);
761
                              T3t = T3r - T3s;
762
                              T3u = T3q + T3t;
763
                              T3w = T3t - T3q;
764
                         }
765
                         {
766
                              E T2C, T2F, T2I, T2J;
767
                              T2C = FMA(KP382683432, T2A, KP923879532 * T2B);
768
                              T2F = FNMS(KP382683432, T2E, KP923879532 * T2D);
769
                              T2G = T2C + T2F;
770
                              T3v = T2F - T2C;
771
                              T2I = FNMS(KP382683432, T2B, KP923879532 * T2A);
772
                              T2J = FMA(KP923879532, T2E, KP382683432 * T2D);
773
                              T2K = T2I - T2J;
774
                              T3p = T2I + T2J;
775
                         }
776
                         ri[WS(rs, 9)] = T2z - T2G;
777
                         ii[WS(rs, 9)] = T3u - T3p;
778
                         ri[WS(rs, 1)] = T2z + T2G;
779
                         ii[WS(rs, 1)] = T3p + T3u;
780
                         ri[WS(rs, 13)] = T2H - T2K;
781
                         ii[WS(rs, 13)] = T3w - T3v;
782
                         ri[WS(rs, 5)] = T2H + T2K;
783
                         ii[WS(rs, 5)] = T3v + T3w;
784
                    }
785
                    {
786
                         E T11, T35, T3g, T3i, T1M, T3h, T38, T39;
787
                         {
788
                              E TB, T10, T3a, T3f;
789
                              TB = Tf + TA;
790
                              T10 = TM + TZ;
791
                              T11 = TB + T10;
792
                              T35 = TB - T10;
793
                              T3a = T2M + T2N;
794
                              T3f = T3b + T3e;
795
                              T3g = T3a + T3f;
796
                              T3i = T3f - T3a;
797
                         }
798
                         {
799
                              E T1o, T1L, T36, T37;
800
                              T1o = T18 + T1n;
801
                              T1L = T1B + T1K;
802
                              T1M = T1o + T1L;
803
                              T3h = T1L - T1o;
804
                              T36 = T2R + T2S;
805
                              T37 = T2W + T2X;
806
                              T38 = T36 - T37;
807
                              T39 = T36 + T37;
808
                         }
809
                         ri[WS(rs, 8)] = T11 - T1M;
810
                         ii[WS(rs, 8)] = T3g - T39;
811
                         ri[0] = T11 + T1M;
812
                         ii[0] = T39 + T3g;
813
                         ri[WS(rs, 12)] = T35 - T38;
814
                         ii[WS(rs, 12)] = T3i - T3h;
815
                         ri[WS(rs, 4)] = T35 + T38;
816
                         ii[WS(rs, 4)] = T3h + T3i;
817
                    }
818
               }
819
          }
820
     }
821
}
822

    
823
/*
 * Twiddle-instruction table for the t2_16 codelet; it is referenced from
 * the codelet descriptor `desc`.
 *
 * The table holds four TW_CEXP rows whose last field is 1, 3, 9 and 15,
 * followed by one TW_NEXT row.
 * NOTE(review): presumably each TW_CEXP row requests one complex twiddle
 * factor and the remaining factors are derived inside t2_16 (the generator
 * command line in the file header mentions -twiddle-log3) -- confirm
 * against the tw_instr definition.
 */
static const tw_instr twinstr[] = {
824
     {TW_CEXP, 0, 1},
825
     {TW_CEXP, 0, 3},
826
     {TW_CEXP, 0, 9},
827
     {TW_CEXP, 0, 15},
828
     /* Last row uses a different opcode; presumably it ends the list -- confirm. */
     {TW_NEXT, 1, 0}
829
};
830

    
831
/*
 * Descriptor handed to the registration call for this codelet.
 * Fields, in order: the value 16, the name string "t2_16", the twinstr
 * table, the address of GENUS, a four-number tuple, and three zeros.
 * NOTE(review): the tuple {156, 68, 40, 0} looks like operation counts
 * (additions, multiplications, fused multiply/adds, other); 156 + 40
 * agrees with the 196 FP additions quoted in the file header -- confirm
 * against the ct_desc definition. The meaning of the trailing zeros is
 * not visible here.
 */
static const ct_desc desc = { 16, "t2_16", twinstr, &GENUS, {156, 68, 40, 0}, 0, 0, 0 };
832

    
833
/*
 * Registration entry point for the t2_16 codelet.
 *
 * p: planner that the codelet is registered with.
 *
 * Passes the t2_16 kernel function and its descriptor `desc` to
 * kdft_dit_register. Both this function's name and the callee's name are
 * wrapped in the project's X() macro.
 */
void X(codelet_t2_16) (planner *p) {
834
     /* Hand the planner, the kernel and the descriptor to the registrar. */
     X(kdft_dit_register) (p, t2_16, &desc);
835
}
836
#endif