comparison src/fftw-3.3.8/dft/simd/common/t2fv_20.c @ 167:bd3cc4d1df30

Add FFTW 3.3.8 source, and a Linux build
author Chris Cannam <cannam@all-day-breakfast.com>
date Tue, 19 Nov 2019 14:52:55 +0000
parents
children
comparison
equal deleted inserted replaced
166:cbd6d7e562c7 167:bd3cc4d1df30
1 /*
2 * Copyright (c) 2003, 2007-14 Matteo Frigo
3 * Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
4 *
5 * This program is free software; you can redistribute it and/or modify
6 * it under the terms of the GNU General Public License as published by
7 * the Free Software Foundation; either version 2 of the License, or
8 * (at your option) any later version.
9 *
10 * This program is distributed in the hope that it will be useful,
11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 * GNU General Public License for more details.
14 *
15 * You should have received a copy of the GNU General Public License
16 * along with this program; if not, write to the Free Software
17 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
18 *
19 */
20
21 /* This file was automatically generated --- DO NOT EDIT */
22 /* Generated on Thu May 24 08:05:46 EDT 2018 */
23
24 #include "dft/codelet-dft.h"
25
26 #if defined(ARCH_PREFERS_FMA) || defined(ISA_EXTENSION_PREFERS_FMA)
27
28 /* Generated by: ../../../genfft/gen_twiddle_c.native -fma -simd -compact -variables 4 -pipeline-latency 8 -n 20 -name t2fv_20 -include dft/simd/t2f.h */
29
30 /*
31 * This function contains 123 FP additions, 88 FP multiplications,
32 * (or, 77 additions, 42 multiplications, 46 fused multiply/add),
33 * 54 stack variables, 4 constants, and 40 memory accesses
34 */
35 #include "dft/simd/t2f.h"
36
/*
 * t2fv_20, FMA variant: size-20 twiddle codelet (generated with -n 20) that
 * works in place on SIMD vectors.
 *
 * ri     - base of the data; copied into x, and every load and store below
 *          goes through x.
 * ii     - never referenced in this body.
 * W      - twiddle table; offset by mb * ((TWVL / VL) * 38) before the first
 *          iteration and advanced by TWVL * 38 after each one.
 * rs     - stride handed to WS() to address the 20 points x[WS(rs, 0..19)].
 * mb, me - loop bounds: m starts at mb and the loop runs while m < me,
 *          stepping by VL.
 * ms     - x advances by VL * ms per iteration; ms is also forwarded to
 *          every LD/ST.
 *
 * LD, ST, BYTWJ, VFMA, VFNMS, VFMAI, VFNMSI, DVK, LDK, WS and VLEAVE are
 * macros supplied by dft/simd/t2f.h and the SIMD headers; their exact
 * semantics are not visible in this file.
 */
37 static void t2fv_20(R *ri, R *ii, const R *W, stride rs, INT mb, INT me, INT ms)
38 {
/*
 * Numeric constants: KP559016994 ~ sqrt(5)/4, KP618033988 ~ (sqrt(5) - 1)/2,
 * KP951056516 ~ sin(2*pi/5), KP250000000 = 1/4.
 * NOTE(review): presumably the size-5 butterfly constants -- confirm against genfft.
 */
39 DVK(KP559016994, +0.559016994374947424102293417182819058860154590);
40 DVK(KP618033988, +0.618033988749894848204586834365638117720309180);
41 DVK(KP951056516, +0.951056516295153572116439333379382143405698634);
42 DVK(KP250000000, +0.250000000000000000000000000000000000000000000);
43 {
44 INT m;
45 R *x;
46 x = ri;
/* MAKE_VOLATILE_STRIDE(20, rs) is evaluated in the increment clause on every pass; it is defined elsewhere. */
47 for (m = mb, W = W + (mb * ((TWVL / VL) * 38)); m < me; m = m + VL, x = x + (VL * ms), W = W + (TWVL * 38), MAKE_VOLATILE_STRIDE(20, rs)) {
48 V T4, Tx, T1m, T1K, TZ, T16, T17, T10, Tf, Tq, Tr, T1O, T1P, T1Q, T1w;
49 V T1z, T1A, TI, TT, TU, T1L, T1M, T1N, T1p, T1s, T1t, Ts, TV;
/*
 * Points 0, 10, 5 and 15. x[0] is used as loaded; every other point k in
 * this function is passed through BYTWJ together with &W[TWVL * 2 * (k - 1)].
 */
50 {
51 V T1, Tw, T3, Tu, Tv, T2, Tt, T1k, T1l;
52 T1 = LD(&(x[0]), ms, &(x[0]));
53 Tv = LD(&(x[WS(rs, 15)]), ms, &(x[WS(rs, 1)]));
54 Tw = BYTWJ(&(W[TWVL * 28]), Tv);
55 T2 = LD(&(x[WS(rs, 10)]), ms, &(x[0]));
56 T3 = BYTWJ(&(W[TWVL * 18]), T2);
57 Tt = LD(&(x[WS(rs, 5)]), ms, &(x[WS(rs, 1)]));
58 Tu = BYTWJ(&(W[TWVL * 8]), Tt);
59 T4 = VSUB(T1, T3);
60 Tx = VSUB(Tu, Tw);
61 T1k = VADD(T1, T3);
62 T1l = VADD(Tu, Tw);
63 T1m = VSUB(T1k, T1l);
64 T1K = VADD(T1k, T1l);
65 }
/*
 * Remaining 16 points, handled as eight pairs whose indices differ by 10
 * (4/14, 13/3, 17/7, 16/6, 8/18, 9/19, 1/11, 12/2). Each pair produces one
 * difference (VSUB) and one sum (VADD).
 */
66 {
67 V T9, T1n, TN, T1v, TS, T1y, Te, T1q, Tk, T1u, TC, T1o, TH, T1r, Tp;
68 V T1x;
69 {
70 V T6, T8, T5, T7;
71 T5 = LD(&(x[WS(rs, 4)]), ms, &(x[0]));
72 T6 = BYTWJ(&(W[TWVL * 6]), T5);
73 T7 = LD(&(x[WS(rs, 14)]), ms, &(x[0]));
74 T8 = BYTWJ(&(W[TWVL * 26]), T7);
75 T9 = VSUB(T6, T8);
76 T1n = VADD(T6, T8);
77 }
78 {
79 V TK, TM, TJ, TL;
80 TJ = LD(&(x[WS(rs, 13)]), ms, &(x[WS(rs, 1)]));
81 TK = BYTWJ(&(W[TWVL * 24]), TJ);
82 TL = LD(&(x[WS(rs, 3)]), ms, &(x[WS(rs, 1)]));
83 TM = BYTWJ(&(W[TWVL * 4]), TL);
84 TN = VSUB(TK, TM);
85 T1v = VADD(TK, TM);
86 }
87 {
88 V TP, TR, TO, TQ;
89 TO = LD(&(x[WS(rs, 17)]), ms, &(x[WS(rs, 1)]));
90 TP = BYTWJ(&(W[TWVL * 32]), TO);
91 TQ = LD(&(x[WS(rs, 7)]), ms, &(x[WS(rs, 1)]));
92 TR = BYTWJ(&(W[TWVL * 12]), TQ);
93 TS = VSUB(TP, TR);
94 T1y = VADD(TP, TR);
95 }
96 {
97 V Tb, Td, Ta, Tc;
98 Ta = LD(&(x[WS(rs, 16)]), ms, &(x[0]));
99 Tb = BYTWJ(&(W[TWVL * 30]), Ta);
100 Tc = LD(&(x[WS(rs, 6)]), ms, &(x[0]));
101 Td = BYTWJ(&(W[TWVL * 10]), Tc);
102 Te = VSUB(Tb, Td);
103 T1q = VADD(Tb, Td);
104 }
105 {
106 V Th, Tj, Tg, Ti;
107 Tg = LD(&(x[WS(rs, 8)]), ms, &(x[0]));
108 Th = BYTWJ(&(W[TWVL * 14]), Tg);
109 Ti = LD(&(x[WS(rs, 18)]), ms, &(x[0]));
110 Tj = BYTWJ(&(W[TWVL * 34]), Ti);
111 Tk = VSUB(Th, Tj);
112 T1u = VADD(Th, Tj);
113 }
114 {
115 V Tz, TB, Ty, TA;
116 Ty = LD(&(x[WS(rs, 9)]), ms, &(x[WS(rs, 1)]));
117 Tz = BYTWJ(&(W[TWVL * 16]), Ty);
118 TA = LD(&(x[WS(rs, 19)]), ms, &(x[WS(rs, 1)]));
119 TB = BYTWJ(&(W[TWVL * 36]), TA);
120 TC = VSUB(Tz, TB);
121 T1o = VADD(Tz, TB);
122 }
123 {
124 V TE, TG, TD, TF;
125 TD = LD(&(x[WS(rs, 1)]), ms, &(x[WS(rs, 1)]));
126 TE = BYTWJ(&(W[0]), TD);
127 TF = LD(&(x[WS(rs, 11)]), ms, &(x[WS(rs, 1)]));
128 TG = BYTWJ(&(W[TWVL * 20]), TF);
129 TH = VSUB(TE, TG);
130 T1r = VADD(TE, TG);
131 }
132 {
133 V Tm, To, Tl, Tn;
134 Tl = LD(&(x[WS(rs, 12)]), ms, &(x[0]));
135 Tm = BYTWJ(&(W[TWVL * 22]), Tl);
136 Tn = LD(&(x[WS(rs, 2)]), ms, &(x[0]));
137 To = BYTWJ(&(W[TWVL * 2]), Tn);
138 Tp = VSUB(Tm, To);
139 T1x = VADD(Tm, To);
140 }
/* Combine the pair sums and differences into the terms shared by the output stages below. */
141 TZ = VSUB(TH, TC);
142 T16 = VSUB(T9, Te);
143 T17 = VSUB(Tk, Tp);
144 T10 = VSUB(TS, TN);
145 Tf = VADD(T9, Te);
146 Tq = VADD(Tk, Tp);
147 Tr = VADD(Tf, Tq);
148 T1O = VADD(T1u, T1v);
149 T1P = VADD(T1x, T1y);
150 T1Q = VADD(T1O, T1P);
151 T1w = VSUB(T1u, T1v);
152 T1z = VSUB(T1x, T1y);
153 T1A = VADD(T1w, T1z);
154 TI = VADD(TC, TH);
155 TT = VADD(TN, TS);
156 TU = VADD(TI, TT);
157 T1L = VADD(T1n, T1o);
158 T1M = VADD(T1q, T1r);
159 T1N = VADD(T1L, T1M);
160 T1p = VSUB(T1n, T1o);
161 T1s = VSUB(T1q, T1r);
162 T1t = VADD(T1p, T1s);
163 }
/* Outputs 5 and 15, both built from Ts and TV. */
164 Ts = VADD(T4, Tr);
165 TV = VADD(Tx, TU);
166 ST(&(x[WS(rs, 5)]), VFNMSI(TV, Ts), ms, &(x[WS(rs, 1)]));
167 ST(&(x[WS(rs, 15)]), VFMAI(TV, Ts), ms, &(x[WS(rs, 1)]));
/* Outputs 0, 8, 12, 4 and 16, derived from T1K and the T1L..T1Q sums. */
168 {
169 V T1T, T1R, T1S, T1X, T1Z, T1V, T1W, T1Y, T1U;
170 T1T = VSUB(T1N, T1Q);
171 T1R = VADD(T1N, T1Q);
172 T1S = VFNMS(LDK(KP250000000), T1R, T1K);
173 T1V = VSUB(T1L, T1M);
174 T1W = VSUB(T1O, T1P);
175 T1X = VMUL(LDK(KP951056516), VFMA(LDK(KP618033988), T1W, T1V));
176 T1Z = VMUL(LDK(KP951056516), VFNMS(LDK(KP618033988), T1V, T1W));
177 ST(&(x[0]), VADD(T1K, T1R), ms, &(x[0]));
178 T1Y = VFNMS(LDK(KP559016994), T1T, T1S);
179 ST(&(x[WS(rs, 8)]), VFNMSI(T1Z, T1Y), ms, &(x[0]));
180 ST(&(x[WS(rs, 12)]), VFMAI(T1Z, T1Y), ms, &(x[0]));
181 T1U = VFMA(LDK(KP559016994), T1T, T1S);
182 ST(&(x[WS(rs, 4)]), VFMAI(T1X, T1U), ms, &(x[0]));
183 ST(&(x[WS(rs, 16)]), VFNMSI(T1X, T1U), ms, &(x[0]));
184 }
/* Outputs 10, 6, 14, 2 and 18, derived from T1m and the T1p..T1A differences. */
185 {
186 V T1D, T1B, T1C, T1H, T1J, T1F, T1G, T1I, T1E;
187 T1D = VSUB(T1t, T1A);
188 T1B = VADD(T1t, T1A);
189 T1C = VFNMS(LDK(KP250000000), T1B, T1m);
190 T1F = VSUB(T1w, T1z);
191 T1G = VSUB(T1p, T1s);
192 T1H = VMUL(LDK(KP951056516), VFNMS(LDK(KP618033988), T1G, T1F));
193 T1J = VMUL(LDK(KP951056516), VFMA(LDK(KP618033988), T1F, T1G));
194 ST(&(x[WS(rs, 10)]), VADD(T1m, T1B), ms, &(x[0]));
195 T1I = VFMA(LDK(KP559016994), T1D, T1C);
196 ST(&(x[WS(rs, 6)]), VFNMSI(T1J, T1I), ms, &(x[0]));
197 ST(&(x[WS(rs, 14)]), VFMAI(T1J, T1I), ms, &(x[0]));
198 T1E = VFNMS(LDK(KP559016994), T1D, T1C);
199 ST(&(x[WS(rs, 2)]), VFMAI(T1H, T1E), ms, &(x[0]));
200 ST(&(x[WS(rs, 18)]), VFNMSI(T1H, T1E), ms, &(x[0]));
201 }
/* Outputs 1, 19, 13, 7, 9, 11, 17 and 3, derived from T4, Tx and the difference terms. */
202 {
203 V T11, T18, T1g, T1d, T15, T1f, TY, T1c;
204 T11 = VFMA(LDK(KP618033988), T10, TZ);
205 T18 = VFMA(LDK(KP618033988), T17, T16);
206 T1g = VFNMS(LDK(KP618033988), T16, T17);
207 T1d = VFNMS(LDK(KP618033988), TZ, T10);
208 {
209 V T13, T14, TW, TX;
210 T13 = VFNMS(LDK(KP250000000), TU, Tx);
211 T14 = VSUB(TT, TI);
212 T15 = VFNMS(LDK(KP559016994), T14, T13);
213 T1f = VFMA(LDK(KP559016994), T14, T13);
214 TW = VFNMS(LDK(KP250000000), Tr, T4);
215 TX = VSUB(Tf, Tq);
216 TY = VFMA(LDK(KP559016994), TX, TW);
217 T1c = VFNMS(LDK(KP559016994), TX, TW);
218 }
219 {
220 V T12, T19, T1i, T1j;
221 T12 = VFMA(LDK(KP951056516), T11, TY);
222 T19 = VFMA(LDK(KP951056516), T18, T15);
223 ST(&(x[WS(rs, 1)]), VFNMSI(T19, T12), ms, &(x[WS(rs, 1)]));
224 ST(&(x[WS(rs, 19)]), VFMAI(T19, T12), ms, &(x[WS(rs, 1)]));
225 T1i = VFMA(LDK(KP951056516), T1d, T1c);
226 T1j = VFMA(LDK(KP951056516), T1g, T1f);
227 ST(&(x[WS(rs, 13)]), VFNMSI(T1j, T1i), ms, &(x[WS(rs, 1)]));
228 ST(&(x[WS(rs, 7)]), VFMAI(T1j, T1i), ms, &(x[WS(rs, 1)]));
229 }
230 {
231 V T1a, T1b, T1e, T1h;
232 T1a = VFNMS(LDK(KP951056516), T11, TY);
233 T1b = VFNMS(LDK(KP951056516), T18, T15);
234 ST(&(x[WS(rs, 9)]), VFNMSI(T1b, T1a), ms, &(x[WS(rs, 1)]));
235 ST(&(x[WS(rs, 11)]), VFMAI(T1b, T1a), ms, &(x[WS(rs, 1)]));
236 T1e = VFNMS(LDK(KP951056516), T1d, T1c);
237 T1h = VFNMS(LDK(KP951056516), T1g, T1f);
238 ST(&(x[WS(rs, 17)]), VFNMSI(T1h, T1e), ms, &(x[WS(rs, 1)]));
239 ST(&(x[WS(rs, 3)]), VFMAI(T1h, T1e), ms, &(x[WS(rs, 1)]));
240 }
241 }
242 }
243 }
/* NOTE(review): VLEAVE() is a macro from the SIMD headers, presumably exit-time cleanup -- confirm. */
244 VLEAVE();
245 }
246
/*
 * Twiddle instruction table for the FMA build of t2fv_20: nineteen
 * VTW(0, k) entries for k = 1..19, followed by a closing {TW_NEXT, VL, 0}
 * entry. VTW and TW_NEXT are defined in project headers, not in this file.
 */
247 static const tw_instr twinstr[] = {
248 VTW(0, 1),
249 VTW(0, 2),
250 VTW(0, 3),
251 VTW(0, 4),
252 VTW(0, 5),
253 VTW(0, 6),
254 VTW(0, 7),
255 VTW(0, 8),
256 VTW(0, 9),
257 VTW(0, 10),
258 VTW(0, 11),
259 VTW(0, 12),
260 VTW(0, 13),
261 VTW(0, 14),
262 VTW(0, 15),
263 VTW(0, 16),
264 VTW(0, 17),
265 VTW(0, 18),
266 VTW(0, 19),
267 {TW_NEXT, VL, 0}
268 };
269
/*
 * Descriptor for the FMA build: 20, the codelet name string, the twiddle
 * table above and &GENUS. The {77, 42, 46, 0} tuple matches the
 * add / multiply / fused multiply-add counts quoted in the generator comment.
 * NOTE(review): ct_desc field names are declared elsewhere; the meaning of the trailing zeros is not visible here.
 */
270 static const ct_desc desc = { 20, XSIMD_STRING("t2fv_20"), twinstr, &GENUS, {77, 42, 46, 0}, 0, 0, 0 };
271
/*
 * Registration entry point for the FMA build: passes t2fv_20 and its
 * descriptor to kdft_dit_register on planner p. The XSIMD() and X() macros
 * that decorate the names are defined in project headers.
 */
272 void XSIMD(codelet_t2fv_20) (planner *p) {
273 X(kdft_dit_register) (p, t2fv_20, &desc);
274 }
275 #else
276
277 /* Generated by: ../../../genfft/gen_twiddle_c.native -simd -compact -variables 4 -pipeline-latency 8 -n 20 -name t2fv_20 -include dft/simd/t2f.h */
278
279 /*
280 * This function contains 123 FP additions, 62 FP multiplications,
281 * (or, 111 additions, 50 multiplications, 12 fused multiply/add),
282 * 54 stack variables, 4 constants, and 40 memory accesses
283 */
284 #include "dft/simd/t2f.h"
285
/*
 * t2fv_20, non-FMA variant: size-20 twiddle codelet (generated with -n 20,
 * without -fma) that works in place on SIMD vectors.
 *
 * ri     - base of the data; copied into x, and every load and store below
 *          goes through x.
 * ii     - never referenced in this body.
 * W      - twiddle table; offset by mb * ((TWVL / VL) * 38) before the first
 *          iteration and advanced by TWVL * 38 after each one.
 * rs     - stride handed to WS() to address the 20 points x[WS(rs, 0..19)].
 * mb, me - loop bounds: m starts at mb and the loop runs while m < me,
 *          stepping by VL.
 * ms     - x advances by VL * ms per iteration; ms is also forwarded to
 *          every LD/ST.
 *
 * LD, ST, BYTWJ, VBYI, VFMA, VFMS, VFNMS, DVK, LDK, WS and VLEAVE are macros
 * supplied by dft/simd/t2f.h and the SIMD headers; their exact semantics are
 * not visible in this file.
 */
286 static void t2fv_20(R *ri, R *ii, const R *W, stride rs, INT mb, INT me, INT ms)
287 {
/*
 * Numeric constants: KP587785252 ~ sin(pi/5), KP951056516 ~ sin(2*pi/5),
 * KP250000000 = 1/4, KP559016994 ~ sqrt(5)/4.
 * NOTE(review): presumably the size-5 butterfly constants -- confirm against genfft.
 */
288 DVK(KP587785252, +0.587785252292473129168705954639072768597652438);
289 DVK(KP951056516, +0.951056516295153572116439333379382143405698634);
290 DVK(KP250000000, +0.250000000000000000000000000000000000000000000);
291 DVK(KP559016994, +0.559016994374947424102293417182819058860154590);
292 {
293 INT m;
294 R *x;
295 x = ri;
/* MAKE_VOLATILE_STRIDE(20, rs) is evaluated in the increment clause on every pass; it is defined elsewhere. */
296 for (m = mb, W = W + (mb * ((TWVL / VL) * 38)); m < me; m = m + VL, x = x + (VL * ms), W = W + (TWVL * 38), MAKE_VOLATILE_STRIDE(20, rs)) {
297 V T4, Tx, T1B, T1U, TZ, T16, T17, T10, Tf, Tq, Tr, T1N, T1O, T1S, T1t;
298 V T1w, T1C, TI, TT, TU, T1K, T1L, T1R, T1m, T1p, T1D, Ts, TV;
/*
 * Points 0, 10, 5 and 15. x[0] is used as loaded; every other point k in
 * this function is passed through BYTWJ together with &W[TWVL * 2 * (k - 1)].
 */
299 {
300 V T1, Tw, T3, Tu, Tv, T2, Tt, T1z, T1A;
301 T1 = LD(&(x[0]), ms, &(x[0]));
302 Tv = LD(&(x[WS(rs, 15)]), ms, &(x[WS(rs, 1)]));
303 Tw = BYTWJ(&(W[TWVL * 28]), Tv);
304 T2 = LD(&(x[WS(rs, 10)]), ms, &(x[0]));
305 T3 = BYTWJ(&(W[TWVL * 18]), T2);
306 Tt = LD(&(x[WS(rs, 5)]), ms, &(x[WS(rs, 1)]));
307 Tu = BYTWJ(&(W[TWVL * 8]), Tt);
308 T4 = VSUB(T1, T3);
309 Tx = VSUB(Tu, Tw);
310 T1z = VADD(T1, T3);
311 T1A = VADD(Tu, Tw);
312 T1B = VSUB(T1z, T1A);
313 T1U = VADD(T1z, T1A);
314 }
/*
 * Remaining 16 points, handled as eight pairs whose indices differ by 10
 * (4/14, 13/3, 17/7, 16/6, 8/18, 9/19, 1/11, 12/2). Each pair produces one
 * difference (VSUB) and one sum (VADD).
 */
315 {
316 V T9, T1r, TN, T1l, TS, T1o, Te, T1u, Tk, T1k, TC, T1s, TH, T1v, Tp;
317 V T1n;
318 {
319 V T6, T8, T5, T7;
320 T5 = LD(&(x[WS(rs, 4)]), ms, &(x[0]));
321 T6 = BYTWJ(&(W[TWVL * 6]), T5);
322 T7 = LD(&(x[WS(rs, 14)]), ms, &(x[0]));
323 T8 = BYTWJ(&(W[TWVL * 26]), T7);
324 T9 = VSUB(T6, T8);
325 T1r = VADD(T6, T8);
326 }
327 {
328 V TK, TM, TJ, TL;
329 TJ = LD(&(x[WS(rs, 13)]), ms, &(x[WS(rs, 1)]));
330 TK = BYTWJ(&(W[TWVL * 24]), TJ);
331 TL = LD(&(x[WS(rs, 3)]), ms, &(x[WS(rs, 1)]));
332 TM = BYTWJ(&(W[TWVL * 4]), TL);
333 TN = VSUB(TK, TM);
334 T1l = VADD(TK, TM);
335 }
336 {
337 V TP, TR, TO, TQ;
338 TO = LD(&(x[WS(rs, 17)]), ms, &(x[WS(rs, 1)]));
339 TP = BYTWJ(&(W[TWVL * 32]), TO);
340 TQ = LD(&(x[WS(rs, 7)]), ms, &(x[WS(rs, 1)]));
341 TR = BYTWJ(&(W[TWVL * 12]), TQ);
342 TS = VSUB(TP, TR);
343 T1o = VADD(TP, TR);
344 }
345 {
346 V Tb, Td, Ta, Tc;
347 Ta = LD(&(x[WS(rs, 16)]), ms, &(x[0]));
348 Tb = BYTWJ(&(W[TWVL * 30]), Ta);
349 Tc = LD(&(x[WS(rs, 6)]), ms, &(x[0]));
350 Td = BYTWJ(&(W[TWVL * 10]), Tc);
351 Te = VSUB(Tb, Td);
352 T1u = VADD(Tb, Td);
353 }
354 {
355 V Th, Tj, Tg, Ti;
356 Tg = LD(&(x[WS(rs, 8)]), ms, &(x[0]));
357 Th = BYTWJ(&(W[TWVL * 14]), Tg);
358 Ti = LD(&(x[WS(rs, 18)]), ms, &(x[0]));
359 Tj = BYTWJ(&(W[TWVL * 34]), Ti);
360 Tk = VSUB(Th, Tj);
361 T1k = VADD(Th, Tj);
362 }
363 {
364 V Tz, TB, Ty, TA;
365 Ty = LD(&(x[WS(rs, 9)]), ms, &(x[WS(rs, 1)]));
366 Tz = BYTWJ(&(W[TWVL * 16]), Ty);
367 TA = LD(&(x[WS(rs, 19)]), ms, &(x[WS(rs, 1)]));
368 TB = BYTWJ(&(W[TWVL * 36]), TA);
369 TC = VSUB(Tz, TB);
370 T1s = VADD(Tz, TB);
371 }
372 {
373 V TE, TG, TD, TF;
374 TD = LD(&(x[WS(rs, 1)]), ms, &(x[WS(rs, 1)]));
375 TE = BYTWJ(&(W[0]), TD);
376 TF = LD(&(x[WS(rs, 11)]), ms, &(x[WS(rs, 1)]));
377 TG = BYTWJ(&(W[TWVL * 20]), TF);
378 TH = VSUB(TE, TG);
379 T1v = VADD(TE, TG);
380 }
381 {
382 V Tm, To, Tl, Tn;
383 Tl = LD(&(x[WS(rs, 12)]), ms, &(x[0]));
384 Tm = BYTWJ(&(W[TWVL * 22]), Tl);
385 Tn = LD(&(x[WS(rs, 2)]), ms, &(x[0]));
386 To = BYTWJ(&(W[TWVL * 2]), Tn);
387 Tp = VSUB(Tm, To);
388 T1n = VADD(Tm, To);
389 }
/* Combine the pair sums and differences into the terms shared by the output stages below. */
390 TZ = VSUB(TH, TC);
391 T16 = VSUB(T9, Te);
392 T17 = VSUB(Tk, Tp);
393 T10 = VSUB(TS, TN);
394 Tf = VADD(T9, Te);
395 Tq = VADD(Tk, Tp);
396 Tr = VADD(Tf, Tq);
397 T1N = VADD(T1k, T1l);
398 T1O = VADD(T1n, T1o);
399 T1S = VADD(T1N, T1O);
400 T1t = VSUB(T1r, T1s);
401 T1w = VSUB(T1u, T1v);
402 T1C = VADD(T1t, T1w);
403 TI = VADD(TC, TH);
404 TT = VADD(TN, TS);
405 TU = VADD(TI, TT);
406 T1K = VADD(T1r, T1s);
407 T1L = VADD(T1u, T1v);
408 T1R = VADD(T1K, T1L);
409 T1m = VSUB(T1k, T1l);
410 T1p = VSUB(T1n, T1o);
411 T1D = VADD(T1m, T1p);
412 }
/* Outputs 5 and 15: Ts minus / plus TV, where TV has been passed through VBYI. */
413 Ts = VADD(T4, Tr);
414 TV = VBYI(VADD(Tx, TU));
415 ST(&(x[WS(rs, 5)]), VSUB(Ts, TV), ms, &(x[WS(rs, 1)]));
416 ST(&(x[WS(rs, 15)]), VADD(Ts, TV), ms, &(x[WS(rs, 1)]));
/* Outputs 0, 8, 12, 4 and 16, derived from T1U and the T1K..T1S sums. */
417 {
418 V T1T, T1V, T1W, T1Q, T1Z, T1M, T1P, T1Y, T1X;
419 T1T = VMUL(LDK(KP559016994), VSUB(T1R, T1S));
420 T1V = VADD(T1R, T1S);
421 T1W = VFNMS(LDK(KP250000000), T1V, T1U);
422 T1M = VSUB(T1K, T1L);
423 T1P = VSUB(T1N, T1O);
424 T1Q = VBYI(VFMA(LDK(KP951056516), T1M, VMUL(LDK(KP587785252), T1P)));
425 T1Z = VBYI(VFNMS(LDK(KP587785252), T1M, VMUL(LDK(KP951056516), T1P)));
426 ST(&(x[0]), VADD(T1U, T1V), ms, &(x[0]));
427 T1Y = VSUB(T1W, T1T);
428 ST(&(x[WS(rs, 8)]), VSUB(T1Y, T1Z), ms, &(x[0]));
429 ST(&(x[WS(rs, 12)]), VADD(T1Z, T1Y), ms, &(x[0]));
430 T1X = VADD(T1T, T1W);
431 ST(&(x[WS(rs, 4)]), VADD(T1Q, T1X), ms, &(x[0]));
432 ST(&(x[WS(rs, 16)]), VSUB(T1X, T1Q), ms, &(x[0]));
433 }
/* Outputs 10, 6, 14, 2 and 18, derived from T1B and the T1m..T1D differences. */
434 {
435 V T1G, T1E, T1F, T1y, T1J, T1q, T1x, T1I, T1H;
436 T1G = VMUL(LDK(KP559016994), VSUB(T1C, T1D));
437 T1E = VADD(T1C, T1D);
438 T1F = VFNMS(LDK(KP250000000), T1E, T1B);
439 T1q = VSUB(T1m, T1p);
440 T1x = VSUB(T1t, T1w);
441 T1y = VBYI(VFNMS(LDK(KP587785252), T1x, VMUL(LDK(KP951056516), T1q)));
442 T1J = VBYI(VFMA(LDK(KP951056516), T1x, VMUL(LDK(KP587785252), T1q)));
443 ST(&(x[WS(rs, 10)]), VADD(T1B, T1E), ms, &(x[0]));
444 T1I = VADD(T1G, T1F);
445 ST(&(x[WS(rs, 6)]), VSUB(T1I, T1J), ms, &(x[0]));
446 ST(&(x[WS(rs, 14)]), VADD(T1J, T1I), ms, &(x[0]));
447 T1H = VSUB(T1F, T1G);
448 ST(&(x[WS(rs, 2)]), VADD(T1y, T1H), ms, &(x[0]));
449 ST(&(x[WS(rs, 18)]), VSUB(T1H, T1y), ms, &(x[0]));
450 }
/* Outputs 19, 1, 13, 7, 11, 9, 17 and 3, derived from T4, Tx and the difference terms. */
451 {
452 V T11, T18, T1g, T1d, T15, T1f, TY, T1c;
453 T11 = VFMA(LDK(KP951056516), TZ, VMUL(LDK(KP587785252), T10));
454 T18 = VFMA(LDK(KP951056516), T16, VMUL(LDK(KP587785252), T17));
455 T1g = VFNMS(LDK(KP587785252), T16, VMUL(LDK(KP951056516), T17));
456 T1d = VFNMS(LDK(KP587785252), TZ, VMUL(LDK(KP951056516), T10));
457 {
458 V T13, T14, TW, TX;
459 T13 = VFMS(LDK(KP250000000), TU, Tx);
460 T14 = VMUL(LDK(KP559016994), VSUB(TT, TI));
461 T15 = VADD(T13, T14);
462 T1f = VSUB(T14, T13);
463 TW = VMUL(LDK(KP559016994), VSUB(Tf, Tq));
464 TX = VFNMS(LDK(KP250000000), Tr, T4);
465 TY = VADD(TW, TX);
466 T1c = VSUB(TX, TW);
467 }
468 {
469 V T12, T19, T1i, T1j;
470 T12 = VADD(TY, T11);
471 T19 = VBYI(VSUB(T15, T18));
472 ST(&(x[WS(rs, 19)]), VSUB(T12, T19), ms, &(x[WS(rs, 1)]));
473 ST(&(x[WS(rs, 1)]), VADD(T12, T19), ms, &(x[WS(rs, 1)]));
474 T1i = VADD(T1c, T1d);
475 T1j = VBYI(VADD(T1g, T1f));
476 ST(&(x[WS(rs, 13)]), VSUB(T1i, T1j), ms, &(x[WS(rs, 1)]));
477 ST(&(x[WS(rs, 7)]), VADD(T1i, T1j), ms, &(x[WS(rs, 1)]));
478 }
479 {
480 V T1a, T1b, T1e, T1h;
481 T1a = VSUB(TY, T11);
482 T1b = VBYI(VADD(T18, T15));
483 ST(&(x[WS(rs, 11)]), VSUB(T1a, T1b), ms, &(x[WS(rs, 1)]));
484 ST(&(x[WS(rs, 9)]), VADD(T1a, T1b), ms, &(x[WS(rs, 1)]));
485 T1e = VSUB(T1c, T1d);
486 T1h = VBYI(VSUB(T1f, T1g));
487 ST(&(x[WS(rs, 17)]), VSUB(T1e, T1h), ms, &(x[WS(rs, 1)]));
488 ST(&(x[WS(rs, 3)]), VADD(T1e, T1h), ms, &(x[WS(rs, 1)]));
489 }
490 }
491 }
492 }
/* NOTE(review): VLEAVE() is a macro from the SIMD headers, presumably exit-time cleanup -- confirm. */
493 VLEAVE();
494 }
495
/*
 * Twiddle instruction table for the non-FMA build of t2fv_20: nineteen
 * VTW(0, k) entries for k = 1..19, followed by a closing {TW_NEXT, VL, 0}
 * entry. VTW and TW_NEXT are defined in project headers, not in this file.
 */
496 static const tw_instr twinstr[] = {
497 VTW(0, 1),
498 VTW(0, 2),
499 VTW(0, 3),
500 VTW(0, 4),
501 VTW(0, 5),
502 VTW(0, 6),
503 VTW(0, 7),
504 VTW(0, 8),
505 VTW(0, 9),
506 VTW(0, 10),
507 VTW(0, 11),
508 VTW(0, 12),
509 VTW(0, 13),
510 VTW(0, 14),
511 VTW(0, 15),
512 VTW(0, 16),
513 VTW(0, 17),
514 VTW(0, 18),
515 VTW(0, 19),
516 {TW_NEXT, VL, 0}
517 };
518
/*
 * Descriptor for the non-FMA build: 20, the codelet name string, the twiddle
 * table above and &GENUS. The {111, 50, 12, 0} tuple matches the
 * add / multiply / fused multiply-add counts quoted in the generator comment.
 * NOTE(review): ct_desc field names are declared elsewhere; the meaning of the trailing zeros is not visible here.
 */
519 static const ct_desc desc = { 20, XSIMD_STRING("t2fv_20"), twinstr, &GENUS, {111, 50, 12, 0}, 0, 0, 0 };
520
/*
 * Registration entry point for the non-FMA build: passes t2fv_20 and its
 * descriptor to kdft_dit_register on planner p. The XSIMD() and X() macros
 * that decorate the names are defined in project headers.
 */
521 void XSIMD(codelet_t2fv_20) (planner *p) {
522 X(kdft_dit_register) (p, t2fv_20, &desc);
523 }
524 #endif