To check out this repository, please hg clone the following URL, or open the URL using EasyMercurial or your preferred Mercurial client.

The primary repository for this project is hosted at https://github.com/sonic-visualiser/sv-dependency-builds .
This repository is a read-only copy which is updated automatically every hour.

Statistics Download as Zip
| Branch: | Tag: | Revision:

root / src / fftw-3.3.8 / dft / scalar / codelets / t1_20.c @ 167:bd3cc4d1df30

History | View | Annotate | Download (26.6 KB)

1
/*
2
 * Copyright (c) 2003, 2007-14 Matteo Frigo
3
 * Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
4
 *
5
 * This program is free software; you can redistribute it and/or modify
6
 * it under the terms of the GNU General Public License as published by
7
 * the Free Software Foundation; either version 2 of the License, or
8
 * (at your option) any later version.
9
 *
10
 * This program is distributed in the hope that it will be useful,
11
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
12
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
13
 * GNU General Public License for more details.
14
 *
15
 * You should have received a copy of the GNU General Public License
16
 * along with this program; if not, write to the Free Software
17
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
18
 *
19
 */
20

    
21
/* This file was automatically generated --- DO NOT EDIT */
22
/* Generated on Thu May 24 08:04:15 EDT 2018 */
23

    
24
#include "dft/codelet-dft.h"
25

    
26
#if defined(ARCH_PREFERS_FMA) || defined(ISA_EXTENSION_PREFERS_FMA)
27

    
28
/* Generated by: ../../../genfft/gen_twiddle.native -fma -compact -variables 4 -pipeline-latency 4 -n 20 -name t1_20 -include dft/scalar/t.h */
29

    
30
/*
31
 * This function contains 246 FP additions, 148 FP multiplications,
32
 * (or, 136 additions, 38 multiplications, 110 fused multiply/add),
33
 * 61 stack variables, 4 constants, and 80 memory accesses
34
 */
35
#include "dft/scalar/t.h"
36

    
37
/*
 * t1_20 (FMA variant): in-place 20-point twiddle codelet emitted by
 * genfft's gen_twiddle (-n 20).
 *
 * ri, ii : real and imaginary arrays, both read and overwritten; the 20
 *          points handled per iteration sit at indices WS(rs, 0) .. WS(rs, 19).
 * W      : twiddle table; 38 values are consumed per iteration, read as
 *          pairs (W[2k-2], W[2k-1]) for input k = 1 .. 19.
 * rs     : stride argument handed to WS() to locate each point.
 * mb, me : half-open range [mb, me) of iterations to perform.
 * ms     : element offset added to ri and ii after each iteration.
 *
 * NOTE(review): FMA(a, b, c) and FNMS(a, b, c) are assumed to expand to
 * a * b + c and c - a * b as in FFTW's headers -- confirm there.
 */
static void t1_20(R *ri, R *ii, const R *W, stride rs, INT mb, INT me, INT ms)
38
{
39
     /* The literals below match sin(2*pi/5), sqrt(5)/4, 1/4 and (sqrt(5)-1)/2. */
     DK(KP951056516, +0.951056516295153572116439333379382143405698634);
40
     DK(KP559016994, +0.559016994374947424102293417182819058860154590);
41
     DK(KP250000000, +0.250000000000000000000000000000000000000000000);
42
     DK(KP618033988, +0.618033988749894848204586834365638117720309180);
43
     {
44
          INT m;
45
          /* W first skips the 38 twiddles of each of the mb iterations not run here;
             every pass then steps ri and ii by ms and W by 38. */
          for (m = mb, W = W + (mb * 38); m < me; m = m + 1, ri = ri + ms, ii = ii + ms, W = W + 38, MAKE_VOLATILE_STRIDE(40, rs)) {
46
               /* Intermediates that carry values from the input stages to the output stages. */
               E T8, T4N, T2i, T4r, Tl, T4O, T2n, T4n, TN, T2b, T40, T4b, T2v, T3v, T3i;
47
               E T3F, T27, T2f, T3W, T4f, T2R, T3z, T3a, T3J, T1G, T2e, T3T, T4e, T2K, T3y;
48
               E T33, T3I, T1e, T2c, T43, T4c, T2C, T3w, T3p, T3G;
49
               /* Inputs 0 and 10: input 0 is used as loaded; input 10 is combined with
                  W[18], W[19], i.e. multiplied by W[18] - i*W[19] under the FMA/FNMS
                  meanings assumed in the header comment. Every later twiddled input
                  follows the same pattern with its own pair of W slots. */
               {
50
                    E T1, T4q, T3, T6, T4, T4o, T2, T7, T4p, T5;
51
                    T1 = ri[0];
52
                    T4q = ii[0];
53
                    T3 = ri[WS(rs, 10)];
54
                    T6 = ii[WS(rs, 10)];
55
                    T2 = W[18];
56
                    T4 = T2 * T3;
57
                    T4o = T2 * T6;
58
                    T5 = W[19];
59
                    T7 = FMA(T5, T6, T4);
60
                    T4p = FNMS(T5, T3, T4o);
61
                    T8 = T1 + T7;
62
                    T4N = T4q - T4p;
63
                    T2i = T1 - T7;
64
                    T4r = T4p + T4q;
65
               }
66
               /* Inputs 5 and 15 (twiddles W[8..9] and W[28..29]), then their sum and difference. */
               {
67
                    E Ta, Td, Tb, T2j, Tg, Tj, Th, T2l, T9, Tf;
68
                    Ta = ri[WS(rs, 5)];
69
                    Td = ii[WS(rs, 5)];
70
                    T9 = W[8];
71
                    Tb = T9 * Ta;
72
                    T2j = T9 * Td;
73
                    Tg = ri[WS(rs, 15)];
74
                    Tj = ii[WS(rs, 15)];
75
                    Tf = W[28];
76
                    Th = Tf * Tg;
77
                    T2l = Tf * Tj;
78
                    {
79
                         E Te, T2k, Tk, T2m, Tc, Ti;
80
                         Tc = W[9];
81
                         Te = FMA(Tc, Td, Tb);
82
                         T2k = FNMS(Tc, Ta, T2j);
83
                         Ti = W[29];
84
                         Tk = FMA(Ti, Tj, Th);
85
                         T2m = FNMS(Ti, Tg, T2l);
86
                         Tl = Te + Tk;
87
                         T4O = Te - Tk;
88
                         T2n = T2k - T2m;
89
                         T4n = T2k + T2m;
90
                    }
91
               }
92
               /* Inputs 4, 19, 14 and 9, followed by their pairwise sums and differences. */
               {
93
                    E Ts, T3d, TL, T2t, Ty, T3f, TF, T2r;
94
                    {
95
                         E To, Tr, Tp, T3c, Tn, Tq;
96
                         To = ri[WS(rs, 4)];
97
                         Tr = ii[WS(rs, 4)];
98
                         Tn = W[6];
99
                         Tp = Tn * To;
100
                         T3c = Tn * Tr;
101
                         Tq = W[7];
102
                         Ts = FMA(Tq, Tr, Tp);
103
                         T3d = FNMS(Tq, To, T3c);
104
                    }
105
                    {
106
                         E TH, TK, TI, T2s, TG, TJ;
107
                         TH = ri[WS(rs, 19)];
108
                         TK = ii[WS(rs, 19)];
109
                         TG = W[36];
110
                         TI = TG * TH;
111
                         T2s = TG * TK;
112
                         TJ = W[37];
113
                         TL = FMA(TJ, TK, TI);
114
                         T2t = FNMS(TJ, TH, T2s);
115
                    }
116
                    {
117
                         E Tu, Tx, Tv, T3e, Tt, Tw;
118
                         Tu = ri[WS(rs, 14)];
119
                         Tx = ii[WS(rs, 14)];
120
                         Tt = W[26];
121
                         Tv = Tt * Tu;
122
                         T3e = Tt * Tx;
123
                         Tw = W[27];
124
                         Ty = FMA(Tw, Tx, Tv);
125
                         T3f = FNMS(Tw, Tu, T3e);
126
                    }
127
                    {
128
                         E TB, TE, TC, T2q, TA, TD;
129
                         TB = ri[WS(rs, 9)];
130
                         TE = ii[WS(rs, 9)];
131
                         TA = W[16];
132
                         TC = TA * TB;
133
                         T2q = TA * TE;
134
                         TD = W[17];
135
                         TF = FMA(TD, TE, TC);
136
                         T2r = FNMS(TD, TB, T2q);
137
                    }
138
                    {
139
                         E Tz, TM, T3Y, T3Z;
140
                         Tz = Ts + Ty;
141
                         TM = TF + TL;
142
                         TN = Tz - TM;
143
                         T2b = Tz + TM;
144
                         T3Y = T3d + T3f;
145
                         T3Z = T2r + T2t;
146
                         T40 = T3Y - T3Z;
147
                         T4b = T3Y + T3Z;
148
                    }
149
                    {
150
                         E T2p, T2u, T3g, T3h;
151
                         T2p = Ts - Ty;
152
                         T2u = T2r - T2t;
153
                         T2v = T2p - T2u;
154
                         T3v = T2p + T2u;
155
                         T3g = T3d - T3f;
156
                         T3h = TF - TL;
157
                         T3i = T3g + T3h;
158
                         T3F = T3g - T3h;
159
                    }
160
               }
161
               /* Inputs 12, 7, 2 and 17, followed by their pairwise sums and differences. */
               {
162
                    E T1M, T35, T25, T2P, T1S, T37, T1Z, T2N;
163
                    {
164
                         E T1I, T1L, T1J, T34, T1H, T1K;
165
                         T1I = ri[WS(rs, 12)];
166
                         T1L = ii[WS(rs, 12)];
167
                         T1H = W[22];
168
                         T1J = T1H * T1I;
169
                         T34 = T1H * T1L;
170
                         T1K = W[23];
171
                         T1M = FMA(T1K, T1L, T1J);
172
                         T35 = FNMS(T1K, T1I, T34);
173
                    }
174
                    {
175
                         E T21, T24, T22, T2O, T20, T23;
176
                         T21 = ri[WS(rs, 7)];
177
                         T24 = ii[WS(rs, 7)];
178
                         T20 = W[12];
179
                         T22 = T20 * T21;
180
                         T2O = T20 * T24;
181
                         T23 = W[13];
182
                         T25 = FMA(T23, T24, T22);
183
                         T2P = FNMS(T23, T21, T2O);
184
                    }
185
                    {
186
                         E T1O, T1R, T1P, T36, T1N, T1Q;
187
                         T1O = ri[WS(rs, 2)];
188
                         T1R = ii[WS(rs, 2)];
189
                         T1N = W[2];
190
                         T1P = T1N * T1O;
191
                         T36 = T1N * T1R;
192
                         T1Q = W[3];
193
                         T1S = FMA(T1Q, T1R, T1P);
194
                         T37 = FNMS(T1Q, T1O, T36);
195
                    }
196
                    {
197
                         E T1V, T1Y, T1W, T2M, T1U, T1X;
198
                         T1V = ri[WS(rs, 17)];
199
                         T1Y = ii[WS(rs, 17)];
200
                         T1U = W[32];
201
                         T1W = T1U * T1V;
202
                         T2M = T1U * T1Y;
203
                         T1X = W[33];
204
                         T1Z = FMA(T1X, T1Y, T1W);
205
                         T2N = FNMS(T1X, T1V, T2M);
206
                    }
207
                    {
208
                         E T1T, T26, T3U, T3V;
209
                         T1T = T1M + T1S;
210
                         T26 = T1Z + T25;
211
                         T27 = T1T - T26;
212
                         T2f = T1T + T26;
213
                         T3U = T35 + T37;
214
                         T3V = T2N + T2P;
215
                         T3W = T3U - T3V;
216
                         T4f = T3U + T3V;
217
                    }
218
                    {
219
                         E T2L, T2Q, T38, T39;
220
                         T2L = T1M - T1S;
221
                         T2Q = T2N - T2P;
222
                         T2R = T2L - T2Q;
223
                         T3z = T2L + T2Q;
224
                         T38 = T35 - T37;
225
                         T39 = T1Z - T25;
226
                         T3a = T38 + T39;
227
                         T3J = T38 - T39;
228
                    }
229
               }
230
               /* Inputs 8, 3, 18 and 13, followed by their pairwise sums and differences. */
               {
231
                    E T1l, T2Y, T1E, T2I, T1r, T30, T1y, T2G;
232
                    {
233
                         E T1h, T1k, T1i, T2X, T1g, T1j;
234
                         T1h = ri[WS(rs, 8)];
235
                         T1k = ii[WS(rs, 8)];
236
                         T1g = W[14];
237
                         T1i = T1g * T1h;
238
                         T2X = T1g * T1k;
239
                         T1j = W[15];
240
                         T1l = FMA(T1j, T1k, T1i);
241
                         T2Y = FNMS(T1j, T1h, T2X);
242
                    }
243
                    {
244
                         E T1A, T1D, T1B, T2H, T1z, T1C;
245
                         T1A = ri[WS(rs, 3)];
246
                         T1D = ii[WS(rs, 3)];
247
                         T1z = W[4];
248
                         T1B = T1z * T1A;
249
                         T2H = T1z * T1D;
250
                         T1C = W[5];
251
                         T1E = FMA(T1C, T1D, T1B);
252
                         T2I = FNMS(T1C, T1A, T2H);
253
                    }
254
                    {
255
                         E T1n, T1q, T1o, T2Z, T1m, T1p;
256
                         T1n = ri[WS(rs, 18)];
257
                         T1q = ii[WS(rs, 18)];
258
                         T1m = W[34];
259
                         T1o = T1m * T1n;
260
                         T2Z = T1m * T1q;
261
                         T1p = W[35];
262
                         T1r = FMA(T1p, T1q, T1o);
263
                         T30 = FNMS(T1p, T1n, T2Z);
264
                    }
265
                    {
266
                         E T1u, T1x, T1v, T2F, T1t, T1w;
267
                         T1u = ri[WS(rs, 13)];
268
                         T1x = ii[WS(rs, 13)];
269
                         T1t = W[24];
270
                         T1v = T1t * T1u;
271
                         T2F = T1t * T1x;
272
                         T1w = W[25];
273
                         T1y = FMA(T1w, T1x, T1v);
274
                         T2G = FNMS(T1w, T1u, T2F);
275
                    }
276
                    {
277
                         E T1s, T1F, T3R, T3S;
278
                         T1s = T1l + T1r;
279
                         T1F = T1y + T1E;
280
                         T1G = T1s - T1F;
281
                         T2e = T1s + T1F;
282
                         T3R = T2Y + T30;
283
                         T3S = T2G + T2I;
284
                         T3T = T3R - T3S;
285
                         T4e = T3R + T3S;
286
                    }
287
                    {
288
                         E T2E, T2J, T31, T32;
289
                         T2E = T1l - T1r;
290
                         T2J = T2G - T2I;
291
                         T2K = T2E - T2J;
292
                         T3y = T2E + T2J;
293
                         T31 = T2Y - T30;
294
                         T32 = T1y - T1E;
295
                         T33 = T31 + T32;
296
                         T3I = T31 - T32;
297
                    }
298
               }
299
               /* Inputs 16, 11, 6 and 1, followed by their pairwise sums and differences. */
               {
300
                    E TT, T3k, T1c, T2A, TZ, T3m, T16, T2y;
301
                    {
302
                         E TP, TS, TQ, T3j, TO, TR;
303
                         TP = ri[WS(rs, 16)];
304
                         TS = ii[WS(rs, 16)];
305
                         TO = W[30];
306
                         TQ = TO * TP;
307
                         T3j = TO * TS;
308
                         TR = W[31];
309
                         TT = FMA(TR, TS, TQ);
310
                         T3k = FNMS(TR, TP, T3j);
311
                    }
312
                    {
313
                         E T18, T1b, T19, T2z, T17, T1a;
314
                         T18 = ri[WS(rs, 11)];
315
                         T1b = ii[WS(rs, 11)];
316
                         T17 = W[20];
317
                         T19 = T17 * T18;
318
                         T2z = T17 * T1b;
319
                         T1a = W[21];
320
                         T1c = FMA(T1a, T1b, T19);
321
                         T2A = FNMS(T1a, T18, T2z);
322
                    }
323
                    {
324
                         E TV, TY, TW, T3l, TU, TX;
325
                         TV = ri[WS(rs, 6)];
326
                         TY = ii[WS(rs, 6)];
327
                         TU = W[10];
328
                         TW = TU * TV;
329
                         T3l = TU * TY;
330
                         TX = W[11];
331
                         TZ = FMA(TX, TY, TW);
332
                         T3m = FNMS(TX, TV, T3l);
333
                    }
334
                    {
335
                         E T12, T15, T13, T2x, T11, T14;
336
                         T12 = ri[WS(rs, 1)];
337
                         T15 = ii[WS(rs, 1)];
338
                         T11 = W[0];
339
                         T13 = T11 * T12;
340
                         T2x = T11 * T15;
341
                         T14 = W[1];
342
                         T16 = FMA(T14, T15, T13);
343
                         T2y = FNMS(T14, T12, T2x);
344
                    }
345
                    {
346
                         E T10, T1d, T41, T42;
347
                         T10 = TT + TZ;
348
                         T1d = T16 + T1c;
349
                         T1e = T10 - T1d;
350
                         T2c = T10 + T1d;
351
                         T41 = T3k + T3m;
352
                         T42 = T2y + T2A;
353
                         T43 = T41 - T42;
354
                         T4c = T41 + T42;
355
                    }
356
                    {
357
                         E T2w, T2B, T3n, T3o;
358
                         T2w = TT - TZ;
359
                         T2B = T2y - T2A;
360
                         T2C = T2w - T2B;
361
                         T3w = T2w + T2B;
362
                         T3n = T3k - T3m;
363
                         T3o = T16 - T1c;
364
                         T3p = T3n + T3o;
365
                         T3G = T3n - T3o;
366
                    }
367
               }
368
               /* All inputs have been read by this point; the remaining groups only
                  combine intermediates and store results over ri / ii. */
               /* Real outputs 10, 14, 6, 2 and 18. */
               {
369
                    E T45, T47, Tm, T29, T3O, T3P, T46, T3Q;
370
                    {
371
                         E T3X, T44, T1f, T28;
372
                         T3X = T3T - T3W;
373
                         T44 = T40 - T43;
374
                         T45 = FNMS(KP618033988, T44, T3X);
375
                         T47 = FMA(KP618033988, T3X, T44);
376
                         Tm = T8 - Tl;
377
                         T1f = TN + T1e;
378
                         T28 = T1G + T27;
379
                         T29 = T1f + T28;
380
                         T3O = FNMS(KP250000000, T29, Tm);
381
                         T3P = T1f - T28;
382
                    }
383
                    ri[WS(rs, 10)] = Tm + T29;
384
                    T46 = FMA(KP559016994, T3P, T3O);
385
                    ri[WS(rs, 14)] = FNMS(KP951056516, T47, T46);
386
                    ri[WS(rs, 6)] = FMA(KP951056516, T47, T46);
387
                    T3Q = FNMS(KP559016994, T3P, T3O);
388
                    ri[WS(rs, 2)] = FNMS(KP951056516, T45, T3Q);
389
                    ri[WS(rs, 18)] = FMA(KP951056516, T45, T3Q);
390
               }
391
               /* Imaginary outputs 10, 6, 14, 2 and 18. */
               {
392
                    E T4K, T4M, T4B, T4E, T4F, T4G, T4L, T4H;
393
                    {
394
                         E T4I, T4J, T4C, T4D;
395
                         T4I = T1G - T27;
396
                         T4J = TN - T1e;
397
                         T4K = FNMS(KP618033988, T4J, T4I);
398
                         T4M = FMA(KP618033988, T4I, T4J);
399
                         T4B = T4r - T4n;
400
                         T4C = T40 + T43;
401
                         T4D = T3T + T3W;
402
                         T4E = T4C + T4D;
403
                         T4F = FNMS(KP250000000, T4E, T4B);
404
                         T4G = T4C - T4D;
405
                    }
406
                    ii[WS(rs, 10)] = T4E + T4B;
407
                    T4L = FMA(KP559016994, T4G, T4F);
408
                    ii[WS(rs, 6)] = FNMS(KP951056516, T4M, T4L);
409
                    ii[WS(rs, 14)] = FMA(KP951056516, T4M, T4L);
410
                    T4H = FNMS(KP559016994, T4G, T4F);
411
                    ii[WS(rs, 2)] = FMA(KP951056516, T4K, T4H);
412
                    ii[WS(rs, 18)] = FNMS(KP951056516, T4K, T4H);
413
               }
414
               /* Real outputs 0, 12, 8, 4 and 16. */
               {
415
                    E T4h, T4j, T2a, T2h, T48, T49, T4i, T4a;
416
                    {
417
                         E T4d, T4g, T2d, T2g;
418
                         T4d = T4b - T4c;
419
                         T4g = T4e - T4f;
420
                         T4h = FMA(KP618033988, T4g, T4d);
421
                         T4j = FNMS(KP618033988, T4d, T4g);
422
                         T2a = T8 + Tl;
423
                         T2d = T2b + T2c;
424
                         T2g = T2e + T2f;
425
                         T2h = T2d + T2g;
426
                         T48 = FNMS(KP250000000, T2h, T2a);
427
                         T49 = T2d - T2g;
428
                    }
429
                    ri[0] = T2a + T2h;
430
                    T4i = FNMS(KP559016994, T49, T48);
431
                    ri[WS(rs, 12)] = FNMS(KP951056516, T4j, T4i);
432
                    ri[WS(rs, 8)] = FMA(KP951056516, T4j, T4i);
433
                    T4a = FMA(KP559016994, T49, T48);
434
                    ri[WS(rs, 4)] = FNMS(KP951056516, T4h, T4a);
435
                    ri[WS(rs, 16)] = FMA(KP951056516, T4h, T4a);
436
               }
437
               /* Imaginary outputs 0, 8, 12, 4 and 16. */
               {
438
                    E T4y, T4A, T4s, T4m, T4t, T4u, T4z, T4v;
439
                    {
440
                         E T4w, T4x, T4k, T4l;
441
                         T4w = T2b - T2c;
442
                         T4x = T2e - T2f;
443
                         T4y = FMA(KP618033988, T4x, T4w);
444
                         T4A = FNMS(KP618033988, T4w, T4x);
445
                         T4s = T4n + T4r;
446
                         T4k = T4b + T4c;
447
                         T4l = T4e + T4f;
448
                         T4m = T4k + T4l;
449
                         T4t = FNMS(KP250000000, T4m, T4s);
450
                         T4u = T4k - T4l;
451
                    }
452
                    ii[0] = T4m + T4s;
453
                    T4z = FNMS(KP559016994, T4u, T4t);
454
                    ii[WS(rs, 8)] = FNMS(KP951056516, T4A, T4z);
455
                    ii[WS(rs, 12)] = FMA(KP951056516, T4A, T4z);
456
                    T4v = FMA(KP559016994, T4u, T4t);
457
                    ii[WS(rs, 4)] = FMA(KP951056516, T4y, T4v);
458
                    ii[WS(rs, 16)] = FNMS(KP951056516, T4y, T4v);
459
               }
460
               /* Real outputs 15, 11, 19, 3 and 7. */
               {
461
                    E T3r, T3t, T2o, T2T, T2U, T2V, T3s, T2W;
462
                    {
463
                         E T3b, T3q, T2D, T2S;
464
                         T3b = T33 - T3a;
465
                         T3q = T3i - T3p;
466
                         T3r = FNMS(KP618033988, T3q, T3b);
467
                         T3t = FMA(KP618033988, T3b, T3q);
468
                         T2o = T2i - T2n;
469
                         T2D = T2v + T2C;
470
                         T2S = T2K + T2R;
471
                         T2T = T2D + T2S;
472
                         T2U = FNMS(KP250000000, T2T, T2o);
473
                         T2V = T2D - T2S;
474
                    }
475
                    ri[WS(rs, 15)] = T2o + T2T;
476
                    T3s = FMA(KP559016994, T2V, T2U);
477
                    ri[WS(rs, 11)] = FMA(KP951056516, T3t, T3s);
478
                    ri[WS(rs, 19)] = FNMS(KP951056516, T3t, T3s);
479
                    T2W = FNMS(KP559016994, T2V, T2U);
480
                    ri[WS(rs, 3)] = FMA(KP951056516, T3r, T2W);
481
                    ri[WS(rs, 7)] = FNMS(KP951056516, T3r, T2W);
482
               }
483
               /* Imaginary outputs 15, 11, 19, 3 and 7. */
               {
484
                    E T5a, T5c, T51, T54, T55, T56, T5b, T57;
485
                    {
486
                         E T58, T59, T52, T53;
487
                         T58 = T2K - T2R;
488
                         T59 = T2v - T2C;
489
                         T5a = FNMS(KP618033988, T59, T58);
490
                         T5c = FMA(KP618033988, T58, T59);
491
                         T51 = T4O + T4N;
492
                         T52 = T3i + T3p;
493
                         T53 = T33 + T3a;
494
                         T54 = T52 + T53;
495
                         T55 = FNMS(KP250000000, T54, T51);
496
                         T56 = T52 - T53;
497
                    }
498
                    ii[WS(rs, 15)] = T54 + T51;
499
                    T5b = FMA(KP559016994, T56, T55);
500
                    ii[WS(rs, 11)] = FNMS(KP951056516, T5c, T5b);
501
                    ii[WS(rs, 19)] = FMA(KP951056516, T5c, T5b);
502
                    T57 = FNMS(KP559016994, T56, T55);
503
                    ii[WS(rs, 3)] = FNMS(KP951056516, T5a, T57);
504
                    ii[WS(rs, 7)] = FMA(KP951056516, T5a, T57);
505
               }
506
               /* Real outputs 5, 13, 17, 1 and 9. */
               {
507
                    E T3L, T3N, T3u, T3B, T3C, T3D, T3M, T3E;
508
                    {
509
                         E T3H, T3K, T3x, T3A;
510
                         T3H = T3F - T3G;
511
                         T3K = T3I - T3J;
512
                         T3L = FMA(KP618033988, T3K, T3H);
513
                         T3N = FNMS(KP618033988, T3H, T3K);
514
                         T3u = T2i + T2n;
515
                         T3x = T3v + T3w;
516
                         T3A = T3y + T3z;
517
                         T3B = T3x + T3A;
518
                         T3C = FNMS(KP250000000, T3B, T3u);
519
                         T3D = T3x - T3A;
520
                    }
521
                    ri[WS(rs, 5)] = T3u + T3B;
522
                    T3M = FNMS(KP559016994, T3D, T3C);
523
                    ri[WS(rs, 13)] = FMA(KP951056516, T3N, T3M);
524
                    ri[WS(rs, 17)] = FNMS(KP951056516, T3N, T3M);
525
                    T3E = FMA(KP559016994, T3D, T3C);
526
                    ri[WS(rs, 1)] = FMA(KP951056516, T3L, T3E);
527
                    ri[WS(rs, 9)] = FNMS(KP951056516, T3L, T3E);
528
               }
529
               /* Imaginary outputs 5, 13, 17, 1 and 9. */
               {
530
                    E T4Y, T50, T4P, T4S, T4T, T4U, T4Z, T4V;
531
                    {
532
                         E T4W, T4X, T4Q, T4R;
533
                         T4W = T3v - T3w;
534
                         T4X = T3y - T3z;
535
                         T4Y = FMA(KP618033988, T4X, T4W);
536
                         T50 = FNMS(KP618033988, T4W, T4X);
537
                         T4P = T4N - T4O;
538
                         T4Q = T3F + T3G;
539
                         T4R = T3I + T3J;
540
                         T4S = T4Q + T4R;
541
                         T4T = FNMS(KP250000000, T4S, T4P);
542
                         T4U = T4Q - T4R;
543
                    }
544
                    ii[WS(rs, 5)] = T4S + T4P;
545
                    T4Z = FNMS(KP559016994, T4U, T4T);
546
                    ii[WS(rs, 13)] = FNMS(KP951056516, T50, T4Z);
547
                    ii[WS(rs, 17)] = FMA(KP951056516, T50, T4Z);
548
                    T4V = FMA(KP559016994, T4U, T4T);
549
                    ii[WS(rs, 1)] = FNMS(KP951056516, T4Y, T4V);
550
                    ii[WS(rs, 9)] = FMA(KP951056516, T4Y, T4V);
551
               }
552
          }
553
     }
554
}
555

    
556
/*
 * Twiddle-instruction list referenced by the t1_20 descriptor.
 * NOTE(review): presumably {TW_FULL, 0, 20} requests the full twiddle set
 * for 20 points (19 complex factors, which would match the 38 values t1_20
 * consumes per iteration) and TW_NEXT ends the list -- confirm against the
 * tw_instr definition, which is not in this file.
 */
static const tw_instr twinstr[] = {
557
     {TW_FULL, 0, 20},
558
     {TW_NEXT, 1, 0}
559
};
560

    
561
/*
 * Descriptor handed to the planner for t1_20. It bundles the size (20), the
 * codelet name, the twiddle list and &GENUS; {136, 38, 110, 0} repeats the
 * addition / multiplication / fused multiply-add counts from the generator's
 * summary comment for this variant.
 * NOTE(review): the meaning of the fourth count and of the three trailing
 * zeros comes from ct_desc, declared elsewhere -- confirm there.
 */
static const ct_desc desc = { 20, "t1_20", twinstr, &GENUS, {136, 38, 110, 0}, 0, 0, 0 };
562

    
563
/*
 * Entry point with external linkage: registers t1_20 and its descriptor
 * with planner p by calling X(kdft_dit_register).
 */
void X(codelet_t1_20) (planner *p) {
564
     X(kdft_dit_register) (p, t1_20, &desc);
565
}
566
#else
567

    
568
/* Generated by: ../../../genfft/gen_twiddle.native -compact -variables 4 -pipeline-latency 4 -n 20 -name t1_20 -include dft/scalar/t.h */
569

    
570
/*
571
 * This function contains 246 FP additions, 124 FP multiplications,
572
 * (or, 184 additions, 62 multiplications, 62 fused multiply/add),
573
 * 85 stack variables, 4 constants, and 80 memory accesses
574
 */
575
#include "dft/scalar/t.h"
576

    
577
/*
 * t1_20, non-FMA variant (generated by gen_twiddle with -n 20).
 *
 * For every m in [mb, me) the routine reads 20 complex values from
 * ri[WS(rs, k)] / ii[WS(rs, k)], k = 0..19, combines inputs 1..19 with the
 * twiddle pair W[2(k-1)], W[2(k-1)+1], and stores 20 results back into the
 * same ri/ii locations (the transform is done in place).
 *
 *   ri, ii  real and imaginary data; read and overwritten.
 *   W       twiddle table; 38 values are consumed per iteration, starting
 *           at offset mb * 38.
 *   rs      stride handed to WS() to address the 20 points.
 *   mb, me  first and one-past-last iteration index.
 *   ms      amount added to ri and ii after each iteration.
 *
 * NOTE(review): the comments below assume FMA(a, b, c) = a*b + c and
 * FNMS(a, b, c) = c - a*b, as the FFTW codelet headers are expected to
 * define them -- confirm.  Under that assumption every twiddle step forms
 *   re' = Wr*re + Wi*im,   im' = Wr*im - Wi*re.
 */
static void t1_20(R *ri, R *ii, const R *W, stride rs, INT mb, INT me, INT ms)
578
{
579
     /* Constants: sin(pi/5), sin(2*pi/5), 1/4 and sqrt(5)/4. */
     DK(KP587785252, +0.587785252292473129168705954639072768597652438);
580
     DK(KP951056516, +0.951056516295153572116439333379382143405698634);
581
     DK(KP250000000, +0.250000000000000000000000000000000000000000000);
582
     DK(KP559016994, +0.559016994374947424102293417182819058860154590);
583
     {
584
          INT m;
585
          /*
           * One pass per m: W starts at mb * 38 and advances by 38 values,
           * ri and ii advance by ms.  MAKE_VOLATILE_STRIDE is a project
           * macro whose effect is not visible in this file.
           */
          for (m = mb, W = W + (mb * 38); m < me; m = m + 1, ri = ri + ms, ii = ii + ms, W = W + 38, MAKE_VOLATILE_STRIDE(40, rs)) {
586
               /* Intermediates carried from the load stages to the store stages. */
               E Tj, T1R, T4g, T4p, T2q, T37, T3Q, T42, T1r, T1O, T1P, T3i, T3l, T44, T3D;
587
               E T3E, T3K, T1V, T1W, T1X, T23, T28, T4r, T2W, T2X, T4c, T33, T34, T35, T2G;
588
               E T2L, T2M, TG, T13, T14, T3p, T3s, T43, T3A, T3B, T3J, T1S, T1T, T1U, T2e;
589
               E T2j, T4q, T2T, T2U, T4b, T30, T31, T32, T2v, T2A, T2B;
590
               /*
                * Load stage 1: inputs 0, 10, 5 and 15.  Input 0 is used as is;
                * inputs 10, 5, 15 are twiddled with W[18..19], W[8..9], W[28..29].
                */
               {
591
                    E T1, T3O, T6, T3N, Tc, T2n, Th, T2o;
592
                    T1 = ri[0];
593
                    T3O = ii[0];
594
                    {
595
                         E T3, T5, T2, T4;
596
                         T3 = ri[WS(rs, 10)];
597
                         T5 = ii[WS(rs, 10)];
598
                         T2 = W[18];
599
                         T4 = W[19];
600
                         /* T6 / T3N: real / imaginary part of twiddled input 10. */
                         T6 = FMA(T2, T3, T4 * T5);
601
                         T3N = FNMS(T4, T3, T2 * T5);
602
                    }
603
                    {
604
                         E T9, Tb, T8, Ta;
605
                         T9 = ri[WS(rs, 5)];
606
                         Tb = ii[WS(rs, 5)];
607
                         T8 = W[8];
608
                         Ta = W[9];
609
                         Tc = FMA(T8, T9, Ta * Tb);
610
                         T2n = FNMS(Ta, T9, T8 * Tb);
611
                    }
612
                    {
613
                         E Te, Tg, Td, Tf;
614
                         Te = ri[WS(rs, 15)];
615
                         Tg = ii[WS(rs, 15)];
616
                         Td = W[28];
617
                         Tf = W[29];
618
                         Th = FMA(Td, Te, Tf * Tg);
619
                         T2o = FNMS(Tf, Te, Td * Tg);
620
                    }
621
                    /* Sums and differences of the four values loaded above. */
                    {
622
                         E T7, Ti, T4e, T4f;
623
                         T7 = T1 + T6;
624
                         Ti = Tc + Th;
625
                         Tj = T7 - Ti;
626
                         T1R = T7 + Ti;
627
                         T4e = T3O - T3N;
628
                         T4f = Tc - Th;
629
                         T4g = T4e - T4f;
630
                         T4p = T4f + T4e;
631
                    }
632
                    {
633
                         E T2m, T2p, T3M, T3P;
634
                         T2m = T1 - T6;
635
                         T2p = T2n - T2o;
636
                         T2q = T2m - T2p;
637
                         T37 = T2m + T2p;
638
                         T3M = T2n + T2o;
639
                         T3P = T3N + T3O;
640
                         T3Q = T3M + T3P;
641
                         T42 = T3P - T3M;
642
                    }
643
               }
644
               /*
                * Load stage 2: inputs 8, 18, 17, 7, 13, 3, 12 and 2, each
                * twiddled with its W pair, then combined pairwise.
                */
               {
645
                    E T1f, T3g, T21, T2C, T1N, T3k, T27, T2K, T1q, T3h, T22, T2F, T1C, T3j, T26;
646
                    E T2H;
647
                    /* Inputs 8 and 18 (W[14..15], W[34..35]). */
                    {
648
                         E T19, T1Z, T1e, T20;
649
                         {
650
                              E T16, T18, T15, T17;
651
                              T16 = ri[WS(rs, 8)];
652
                              T18 = ii[WS(rs, 8)];
653
                              T15 = W[14];
654
                              T17 = W[15];
655
                              T19 = FMA(T15, T16, T17 * T18);
656
                              T1Z = FNMS(T17, T16, T15 * T18);
657
                         }
658
                         {
659
                              E T1b, T1d, T1a, T1c;
660
                              T1b = ri[WS(rs, 18)];
661
                              T1d = ii[WS(rs, 18)];
662
                              T1a = W[34];
663
                              T1c = W[35];
664
                              T1e = FMA(T1a, T1b, T1c * T1d);
665
                              T20 = FNMS(T1c, T1b, T1a * T1d);
666
                         }
667
                         T1f = T19 + T1e;
668
                         T3g = T1Z + T20;
669
                         T21 = T1Z - T20;
670
                         T2C = T19 - T1e;
671
                    }
672
                    /* Inputs 17 and 7 (W[32..33], W[12..13]). */
                    {
673
                         E T1H, T2I, T1M, T2J;
674
                         {
675
                              E T1E, T1G, T1D, T1F;
676
                              T1E = ri[WS(rs, 17)];
677
                              T1G = ii[WS(rs, 17)];
678
                              T1D = W[32];
679
                              T1F = W[33];
680
                              T1H = FMA(T1D, T1E, T1F * T1G);
681
                              T2I = FNMS(T1F, T1E, T1D * T1G);
682
                         }
683
                         {
684
                              E T1J, T1L, T1I, T1K;
685
                              T1J = ri[WS(rs, 7)];
686
                              T1L = ii[WS(rs, 7)];
687
                              T1I = W[12];
688
                              T1K = W[13];
689
                              T1M = FMA(T1I, T1J, T1K * T1L);
690
                              T2J = FNMS(T1K, T1J, T1I * T1L);
691
                         }
692
                         T1N = T1H + T1M;
693
                         T3k = T2I + T2J;
694
                         T27 = T1H - T1M;
695
                         T2K = T2I - T2J;
696
                    }
697
                    /* Inputs 13 and 3 (W[24..25], W[4..5]). */
                    {
698
                         E T1k, T2D, T1p, T2E;
699
                         {
700
                              E T1h, T1j, T1g, T1i;
701
                              T1h = ri[WS(rs, 13)];
702
                              T1j = ii[WS(rs, 13)];
703
                              T1g = W[24];
704
                              T1i = W[25];
705
                              T1k = FMA(T1g, T1h, T1i * T1j);
706
                              T2D = FNMS(T1i, T1h, T1g * T1j);
707
                         }
708
                         {
709
                              E T1m, T1o, T1l, T1n;
710
                              T1m = ri[WS(rs, 3)];
711
                              T1o = ii[WS(rs, 3)];
712
                              T1l = W[4];
713
                              T1n = W[5];
714
                              T1p = FMA(T1l, T1m, T1n * T1o);
715
                              T2E = FNMS(T1n, T1m, T1l * T1o);
716
                         }
717
                         T1q = T1k + T1p;
718
                         T3h = T2D + T2E;
719
                         T22 = T1k - T1p;
720
                         T2F = T2D - T2E;
721
                    }
722
                    /* Inputs 12 and 2 (W[22..23], W[2..3]). */
                    {
723
                         E T1w, T24, T1B, T25;
724
                         {
725
                              E T1t, T1v, T1s, T1u;
726
                              T1t = ri[WS(rs, 12)];
727
                              T1v = ii[WS(rs, 12)];
728
                              T1s = W[22];
729
                              T1u = W[23];
730
                              T1w = FMA(T1s, T1t, T1u * T1v);
731
                              T24 = FNMS(T1u, T1t, T1s * T1v);
732
                         }
733
                         {
734
                              E T1y, T1A, T1x, T1z;
735
                              T1y = ri[WS(rs, 2)];
736
                              T1A = ii[WS(rs, 2)];
737
                              T1x = W[2];
738
                              T1z = W[3];
739
                              T1B = FMA(T1x, T1y, T1z * T1A);
740
                              T25 = FNMS(T1z, T1y, T1x * T1A);
741
                         }
742
                         T1C = T1w + T1B;
743
                         T3j = T24 + T25;
744
                         T26 = T24 - T25;
745
                         T2H = T1w - T1B;
746
                    }
747
                    /* Second-level sums and differences kept for the store stages. */
                    T1r = T1f - T1q;
748
                    T1O = T1C - T1N;
749
                    T1P = T1r + T1O;
750
                    T3i = T3g - T3h;
751
                    T3l = T3j - T3k;
752
                    T44 = T3i + T3l;
753
                    T3D = T3g + T3h;
754
                    T3E = T3j + T3k;
755
                    T3K = T3D + T3E;
756
                    T1V = T1f + T1q;
757
                    T1W = T1C + T1N;
758
                    T1X = T1V + T1W;
759
                    T23 = T21 + T22;
760
                    T28 = T26 + T27;
761
                    T4r = T23 + T28;
762
                    T2W = T21 - T22;
763
                    T2X = T26 - T27;
764
                    T4c = T2W + T2X;
765
                    T33 = T2C + T2F;
766
                    T34 = T2H + T2K;
767
                    T35 = T33 + T34;
768
                    T2G = T2C - T2F;
769
                    T2L = T2H - T2K;
770
                    T2M = T2G + T2L;
771
               }
772
               /*
                * Load stage 3: inputs 4, 14, 1, 11, 9, 19, 16 and 6, each
                * twiddled with its W pair, then combined pairwise.
                */
               {
773
                    E Tu, T3n, T2c, T2r, T12, T3r, T2i, T2z, TF, T3o, T2d, T2u, TR, T3q, T2h;
774
                    E T2w;
775
                    /* Inputs 4 and 14 (W[6..7], W[26..27]). */
                    {
776
                         E To, T2a, Tt, T2b;
777
                         {
778
                              E Tl, Tn, Tk, Tm;
779
                              Tl = ri[WS(rs, 4)];
780
                              Tn = ii[WS(rs, 4)];
781
                              Tk = W[6];
782
                              Tm = W[7];
783
                              To = FMA(Tk, Tl, Tm * Tn);
784
                              T2a = FNMS(Tm, Tl, Tk * Tn);
785
                         }
786
                         {
787
                              E Tq, Ts, Tp, Tr;
788
                              Tq = ri[WS(rs, 14)];
789
                              Ts = ii[WS(rs, 14)];
790
                              Tp = W[26];
791
                              Tr = W[27];
792
                              Tt = FMA(Tp, Tq, Tr * Ts);
793
                              T2b = FNMS(Tr, Tq, Tp * Ts);
794
                         }
795
                         Tu = To + Tt;
796
                         T3n = T2a + T2b;
797
                         T2c = T2a - T2b;
798
                         T2r = To - Tt;
799
                    }
800
                    /* Inputs 1 and 11 (W[0..1], W[20..21]). */
                    {
801
                         E TW, T2x, T11, T2y;
802
                         {
803
                              E TT, TV, TS, TU;
804
                              TT = ri[WS(rs, 1)];
805
                              TV = ii[WS(rs, 1)];
806
                              TS = W[0];
807
                              TU = W[1];
808
                              TW = FMA(TS, TT, TU * TV);
809
                              T2x = FNMS(TU, TT, TS * TV);
810
                         }
811
                         {
812
                              E TY, T10, TX, TZ;
813
                              TY = ri[WS(rs, 11)];
814
                              T10 = ii[WS(rs, 11)];
815
                              TX = W[20];
816
                              TZ = W[21];
817
                              T11 = FMA(TX, TY, TZ * T10);
818
                              T2y = FNMS(TZ, TY, TX * T10);
819
                         }
820
                         T12 = TW + T11;
821
                         T3r = T2x + T2y;
822
                         T2i = TW - T11;
823
                         T2z = T2x - T2y;
824
                    }
825
                    /* Inputs 9 and 19 (W[16..17], W[36..37]). */
                    {
826
                         E Tz, T2s, TE, T2t;
827
                         {
828
                              E Tw, Ty, Tv, Tx;
829
                              Tw = ri[WS(rs, 9)];
830
                              Ty = ii[WS(rs, 9)];
831
                              Tv = W[16];
832
                              Tx = W[17];
833
                              Tz = FMA(Tv, Tw, Tx * Ty);
834
                              T2s = FNMS(Tx, Tw, Tv * Ty);
835
                         }
836
                         {
837
                              E TB, TD, TA, TC;
838
                              TB = ri[WS(rs, 19)];
839
                              TD = ii[WS(rs, 19)];
840
                              TA = W[36];
841
                              TC = W[37];
842
                              TE = FMA(TA, TB, TC * TD);
843
                              T2t = FNMS(TC, TB, TA * TD);
844
                         }
845
                         TF = Tz + TE;
846
                         T3o = T2s + T2t;
847
                         T2d = Tz - TE;
848
                         T2u = T2s - T2t;
849
                    }
850
                    /* Inputs 16 and 6 (W[30..31], W[10..11]). */
                    {
851
                         E TL, T2f, TQ, T2g;
852
                         {
853
                              E TI, TK, TH, TJ;
854
                              TI = ri[WS(rs, 16)];
855
                              TK = ii[WS(rs, 16)];
856
                              TH = W[30];
857
                              TJ = W[31];
858
                              TL = FMA(TH, TI, TJ * TK);
859
                              T2f = FNMS(TJ, TI, TH * TK);
860
                         }
861
                         {
862
                              E TN, TP, TM, TO;
863
                              TN = ri[WS(rs, 6)];
864
                              TP = ii[WS(rs, 6)];
865
                              TM = W[10];
866
                              TO = W[11];
867
                              TQ = FMA(TM, TN, TO * TP);
868
                              T2g = FNMS(TO, TN, TM * TP);
869
                         }
870
                         TR = TL + TQ;
871
                         T3q = T2f + T2g;
872
                         T2h = T2f - T2g;
873
                         T2w = TL - TQ;
874
                    }
875
                    /* Second-level sums and differences kept for the store stages. */
                    TG = Tu - TF;
876
                    T13 = TR - T12;
877
                    T14 = TG + T13;
878
                    T3p = T3n - T3o;
879
                    T3s = T3q - T3r;
880
                    T43 = T3p + T3s;
881
                    T3A = T3n + T3o;
882
                    T3B = T3q + T3r;
883
                    T3J = T3A + T3B;
884
                    T1S = Tu + TF;
885
                    T1T = TR + T12;
886
                    T1U = T1S + T1T;
887
                    T2e = T2c + T2d;
888
                    T2j = T2h + T2i;
889
                    T4q = T2e + T2j;
890
                    T2T = T2c - T2d;
891
                    T2U = T2h - T2i;
892
                    T4b = T2T + T2U;
893
                    T30 = T2r + T2u;
894
                    T31 = T2w + T2z;
895
                    T32 = T30 + T31;
896
                    T2v = T2r - T2u;
897
                    T2A = T2w - T2z;
898
                    T2B = T2v + T2A;
899
               }
900
               /* Store stage: real outputs 10, 14, 6, 2, 18. */
               {
901
                    E T3e, T1Q, T3d, T3u, T3w, T3m, T3t, T3v, T3f;
902
                    T3e = KP559016994 * (T14 - T1P);
903
                    T1Q = T14 + T1P;
904
                    T3d = FNMS(KP250000000, T1Q, Tj);
905
                    T3m = T3i - T3l;
906
                    T3t = T3p - T3s;
907
                    T3u = FNMS(KP587785252, T3t, KP951056516 * T3m);
908
                    T3w = FMA(KP951056516, T3t, KP587785252 * T3m);
909
                    ri[WS(rs, 10)] = Tj + T1Q;
910
                    T3v = T3e + T3d;
911
                    ri[WS(rs, 14)] = T3v - T3w;
912
                    ri[WS(rs, 6)] = T3v + T3w;
913
                    T3f = T3d - T3e;
914
                    ri[WS(rs, 2)] = T3f - T3u;
915
                    ri[WS(rs, 18)] = T3f + T3u;
916
               }
917
               /* Store stage: imaginary outputs 10, 6, 14, 2, 18. */
               {
918
                    E T47, T45, T46, T41, T4a, T3Z, T40, T49, T48;
919
                    T47 = KP559016994 * (T43 - T44);
920
                    T45 = T43 + T44;
921
                    T46 = FNMS(KP250000000, T45, T42);
922
                    T3Z = T1r - T1O;
923
                    T40 = TG - T13;
924
                    T41 = FNMS(KP587785252, T40, KP951056516 * T3Z);
925
                    T4a = FMA(KP951056516, T40, KP587785252 * T3Z);
926
                    ii[WS(rs, 10)] = T45 + T42;
927
                    T49 = T47 + T46;
928
                    ii[WS(rs, 6)] = T49 - T4a;
929
                    ii[WS(rs, 14)] = T4a + T49;
930
                    T48 = T46 - T47;
931
                    ii[WS(rs, 2)] = T41 + T48;
932
                    ii[WS(rs, 18)] = T48 - T41;
933
               }
934
               /* Store stage: real outputs 0, 12, 8, 4, 16. */
               {
935
                    E T3x, T1Y, T3y, T3G, T3I, T3C, T3F, T3H, T3z;
936
                    T3x = KP559016994 * (T1U - T1X);
937
                    T1Y = T1U + T1X;
938
                    T3y = FNMS(KP250000000, T1Y, T1R);
939
                    T3C = T3A - T3B;
940
                    T3F = T3D - T3E;
941
                    T3G = FMA(KP951056516, T3C, KP587785252 * T3F);
942
                    T3I = FNMS(KP587785252, T3C, KP951056516 * T3F);
943
                    ri[0] = T1R + T1Y;
944
                    T3H = T3y - T3x;
945
                    ri[WS(rs, 12)] = T3H - T3I;
946
                    ri[WS(rs, 8)] = T3H + T3I;
947
                    T3z = T3x + T3y;
948
                    ri[WS(rs, 4)] = T3z - T3G;
949
                    ri[WS(rs, 16)] = T3z + T3G;
950
               }
951
               /* Store stage: imaginary outputs 0, 8, 12, 4, 16. */
               {
952
                    E T3U, T3L, T3V, T3T, T3Y, T3R, T3S, T3X, T3W;
953
                    T3U = KP559016994 * (T3J - T3K);
954
                    T3L = T3J + T3K;
955
                    T3V = FNMS(KP250000000, T3L, T3Q);
956
                    T3R = T1S - T1T;
957
                    T3S = T1V - T1W;
958
                    T3T = FMA(KP951056516, T3R, KP587785252 * T3S);
959
                    T3Y = FNMS(KP587785252, T3R, KP951056516 * T3S);
960
                    ii[0] = T3L + T3Q;
961
                    T3X = T3V - T3U;
962
                    ii[WS(rs, 8)] = T3X - T3Y;
963
                    ii[WS(rs, 12)] = T3Y + T3X;
964
                    T3W = T3U + T3V;
965
                    ii[WS(rs, 4)] = T3T + T3W;
966
                    ii[WS(rs, 16)] = T3W - T3T;
967
               }
968
               /* Store stage: real outputs 15, 11, 19, 3, 7. */
               {
969
                    E T2P, T2N, T2O, T2l, T2R, T29, T2k, T2S, T2Q;
970
                    T2P = KP559016994 * (T2B - T2M);
971
                    T2N = T2B + T2M;
972
                    T2O = FNMS(KP250000000, T2N, T2q);
973
                    T29 = T23 - T28;
974
                    T2k = T2e - T2j;
975
                    T2l = FNMS(KP587785252, T2k, KP951056516 * T29);
976
                    T2R = FMA(KP951056516, T2k, KP587785252 * T29);
977
                    ri[WS(rs, 15)] = T2q + T2N;
978
                    T2S = T2P + T2O;
979
                    ri[WS(rs, 11)] = T2R + T2S;
980
                    ri[WS(rs, 19)] = T2S - T2R;
981
                    T2Q = T2O - T2P;
982
                    ri[WS(rs, 3)] = T2l + T2Q;
983
                    ri[WS(rs, 7)] = T2Q - T2l;
984
               }
985
               /* Store stage: imaginary outputs 15, 11, 19, 3, 7. */
               {
986
                    E T4u, T4s, T4t, T4y, T4A, T4w, T4x, T4z, T4v;
987
                    T4u = KP559016994 * (T4q - T4r);
988
                    T4s = T4q + T4r;
989
                    T4t = FNMS(KP250000000, T4s, T4p);
990
                    T4w = T2G - T2L;
991
                    T4x = T2v - T2A;
992
                    T4y = FNMS(KP587785252, T4x, KP951056516 * T4w);
993
                    T4A = FMA(KP951056516, T4x, KP587785252 * T4w);
994
                    ii[WS(rs, 15)] = T4s + T4p;
995
                    T4z = T4u + T4t;
996
                    ii[WS(rs, 11)] = T4z - T4A;
997
                    ii[WS(rs, 19)] = T4A + T4z;
998
                    T4v = T4t - T4u;
999
                    ii[WS(rs, 3)] = T4v - T4y;
1000
                    ii[WS(rs, 7)] = T4y + T4v;
1001
               }
1002
               /* Store stage: real outputs 5, 13, 17, 1, 9. */
               {
1003
                    E T36, T38, T39, T2Z, T3b, T2V, T2Y, T3c, T3a;
1004
                    T36 = KP559016994 * (T32 - T35);
1005
                    T38 = T32 + T35;
1006
                    T39 = FNMS(KP250000000, T38, T37);
1007
                    T2V = T2T - T2U;
1008
                    T2Y = T2W - T2X;
1009
                    T2Z = FMA(KP951056516, T2V, KP587785252 * T2Y);
1010
                    T3b = FNMS(KP587785252, T2V, KP951056516 * T2Y);
1011
                    ri[WS(rs, 5)] = T37 + T38;
1012
                    T3c = T39 - T36;
1013
                    ri[WS(rs, 13)] = T3b + T3c;
1014
                    ri[WS(rs, 17)] = T3c - T3b;
1015
                    T3a = T36 + T39;
1016
                    ri[WS(rs, 1)] = T2Z + T3a;
1017
                    ri[WS(rs, 9)] = T3a - T2Z;
1018
               }
1019
               /* Store stage: imaginary outputs 5, 13, 17, 1, 9. */
               {
1020
                    E T4d, T4h, T4i, T4m, T4o, T4k, T4l, T4n, T4j;
1021
                    T4d = KP559016994 * (T4b - T4c);
1022
                    T4h = T4b + T4c;
1023
                    T4i = FNMS(KP250000000, T4h, T4g);
1024
                    T4k = T30 - T31;
1025
                    T4l = T33 - T34;
1026
                    T4m = FMA(KP951056516, T4k, KP587785252 * T4l);
1027
                    T4o = FNMS(KP587785252, T4k, KP951056516 * T4l);
1028
                    ii[WS(rs, 5)] = T4h + T4g;
1029
                    T4n = T4i - T4d;
1030
                    ii[WS(rs, 13)] = T4n - T4o;
1031
                    ii[WS(rs, 17)] = T4o + T4n;
1032
                    T4j = T4d + T4i;
1033
                    ii[WS(rs, 1)] = T4j - T4m;
1034
                    ii[WS(rs, 9)] = T4m + T4j;
1035
               }
1036
          }
1037
     }
1038
}
1039

    
1040
/*
 * Twiddle instruction table for the non-FMA variant; referenced by desc.
 * It holds two entries: {TW_FULL, 0, 20} followed by {TW_NEXT, 1, 0}.
 * The t1_20 routine in this branch reads W[0] through W[37] per iteration.
 * NOTE(review): presumably TW_FULL requests the complete twiddle set for
 * size 20 and TW_NEXT closes the list -- confirm against the tw_instr
 * definition, which is not visible in this file.
 */
static const tw_instr twinstr[] = {
1041
     {TW_FULL, 0, 20},
1042
     {TW_NEXT, 1, 0}
1043
};
1044

    
1045
/*
 * Descriptor for the non-FMA variant of t1_20: the value 20 (presumably
 * the radix -- confirm against ct_desc), the name "t1_20", the twinstr
 * table, &GENUS, and the operation counts {184, 62, 62, 0}, which match the
 * "184 additions, 62 multiplications, 62 fused multiply/add" figures in the
 * generator comment for this variant.
 * NOTE(review): the meaning of the fourth count and of the three trailing
 * zeros is not visible here.
 */
static const ct_desc desc = { 20, "t1_20", twinstr, &GENUS, {184, 62, 62, 0}, 0, 0, 0 };
1046

    
1047
/*
 * Registration entry point (non-FMA variant): hands the t1_20 routine and
 * its descriptor to planner p through X(kdft_dit_register).
 */
void X(codelet_t1_20) (planner *p) {
1048
     X(kdft_dit_register) (p, t1_20, &desc);
1049
}
1050
#endif