comparison src/fftw-3.3.8/dft/simd/common/t2fv_64.c @ 167:bd3cc4d1df30

Add FFTW 3.3.8 source, and a Linux build
author Chris Cannam <cannam@all-day-breakfast.com>
date Tue, 19 Nov 2019 14:52:55 +0000
parents
children
comparison
equal deleted inserted replaced
166:cbd6d7e562c7 167:bd3cc4d1df30
1 /*
2 * Copyright (c) 2003, 2007-14 Matteo Frigo
3 * Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
4 *
5 * This program is free software; you can redistribute it and/or modify
6 * it under the terms of the GNU General Public License as published by
7 * the Free Software Foundation; either version 2 of the License, or
8 * (at your option) any later version.
9 *
10 * This program is distributed in the hope that it will be useful,
11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 * GNU General Public License for more details.
14 *
15 * You should have received a copy of the GNU General Public License
16 * along with this program; if not, write to the Free Software
17 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
18 *
19 */
20
21 /* This file was automatically generated --- DO NOT EDIT */
22 /* Generated on Thu May 24 08:05:45 EDT 2018 */
23
24 #include "dft/codelet-dft.h"
25
26 #if defined(ARCH_PREFERS_FMA) || defined(ISA_EXTENSION_PREFERS_FMA)
27
28 /* Generated by: ../../../genfft/gen_twiddle_c.native -fma -simd -compact -variables 4 -pipeline-latency 8 -n 64 -name t2fv_64 -include dft/simd/t2f.h */
29
30 /*
31 * This function contains 519 FP additions, 384 FP multiplications,
32 * (or, 261 additions, 126 multiplications, 258 fused multiply/add),
33 * 107 stack variables, 15 constants, and 128 memory accesses
34 */
35 #include "dft/simd/t2f.h"
36
/*
 * t2fv_64 (variant compiled when ARCH_PREFERS_FMA or
 * ISA_EXTENSION_PREFERS_FMA is defined): genfft-generated size-64 twiddle
 * codelet (gen_twiddle_c ... -fma -simd ... -n 64).
 *
 * On each loop iteration it loads the 64 values x[WS(rs, k)], k = 0..63,
 * passes every one except x[0] through BYTWJ together with a twiddle read
 * from W[TWVL * (2k - 2)], combines them with VADD/VSUB/VFMA/VFNMS (and
 * VFMAI/VFNMSI at the stores), and writes 64 results back into the same
 * x[WS(rs, k)] slots, i.e. the transform is done in place.
 *
 * ri      base pointer; all accesses go through x, which starts at ri.
 * ii      not referenced anywhere in this body.
 * W       twiddle table; offset by mb * ((TWVL / VL) * 126) on entry and
 *         advanced by TWVL * 126 after every iteration.
 * rs      stride handed to WS() to locate element k.
 * mb, me  the loop runs m = mb; m < me; m += VL.
 * ms      x advances by VL * ms per iteration; also forwarded to LD/ST.
 */
37 static void t2fv_64(R *ri, R *ii, const R *W, stride rs, INT mb, INT me, INT ms)
38 {
/* Numeric constants used by the butterflies; each name encodes the leading
 * digits of its value. */
39 DVK(KP773010453, +0.773010453362736960810906609758469800971041293);
40 DVK(KP995184726, +0.995184726672196886244836953109479921575474869);
41 DVK(KP820678790, +0.820678790828660330972281985331011598767386482);
42 DVK(KP098491403, +0.098491403357164253077197521291327432293052451);
43 DVK(KP956940335, +0.956940335732208864935797886980269969482849206);
44 DVK(KP881921264, +0.881921264348355029712756863660388349508442621);
45 DVK(KP303346683, +0.303346683607342391675883946941299872384187453);
46 DVK(KP534511135, +0.534511135950791641089685961295362908582039528);
47 DVK(KP831469612, +0.831469612302545237078788377617905756738560812);
48 DVK(KP668178637, +0.668178637919298919997757686523080761552472251);
49 DVK(KP980785280, +0.980785280403230449126182236134239036973933731);
50 DVK(KP198912367, +0.198912367379658006911597622644676228597850501);
51 DVK(KP923879532, +0.923879532511286756128183189396788286822416626);
52 DVK(KP414213562, +0.414213562373095048801688724209698078569671875);
53 DVK(KP707106781, +0.707106781186547524400844362104849039284835938);
54 {
55 INT m;
56 R *x;
57 x = ri;
/* One pass per VL values of m; x and W are stepped in the loop header. */
58 for (m = mb, W = W + (mb * ((TWVL / VL) * 126)); m < me; m = m + VL, x = x + (VL * ms), W = W + (TWVL * 126), MAKE_VOLATILE_STRIDE(64, rs)) {
/* Intermediates that are produced in the load sections below and consumed
 * in the store sections further down. */
59 V Ta, T3U, T6l, T7B, T37, T3V, T58, T7a, T1v, T24, T43, T4F, T5F, T7l, T5Q;
60 V T7o, T2i, T2R, T4a, T4I, T60, T7s, T6b, T7v, T4h, T4i, T4C, T5x, T7g, T1i;
61 V T3a, T5u, T7h, T4k, T4l, T4B, T5o, T7d, TV, T3b, T5l, T7e, T3X, T3Y, Tx;
62 V T38, T5f, T7C, T6o, T7b, T1S, T25, T5T, T7m, T46, T4G, T5M, T7p, T2F, T2S;
63 V T6e, T7t, T4d, T4J, T67, T7w;
/* Inputs 0, 32, 48, 16, 8, 40, 56, 24 (x[0] is the only input used without
 * a twiddle). */
64 {
65 V T1, T3, T8, T6, T33, T35, T55, T2Y, T30, T56, T2, T7, T5;
66 T1 = LD(&(x[0]), ms, &(x[0]));
67 T2 = LD(&(x[WS(rs, 32)]), ms, &(x[0]));
68 T3 = BYTWJ(&(W[TWVL * 62]), T2);
69 T7 = LD(&(x[WS(rs, 48)]), ms, &(x[0]));
70 T8 = BYTWJ(&(W[TWVL * 94]), T7);
71 T5 = LD(&(x[WS(rs, 16)]), ms, &(x[0]));
72 T6 = BYTWJ(&(W[TWVL * 30]), T5);
73 {
74 V T32, T34, T2X, T2Z;
75 T32 = LD(&(x[WS(rs, 8)]), ms, &(x[0]));
76 T33 = BYTWJ(&(W[TWVL * 14]), T32);
77 T34 = LD(&(x[WS(rs, 40)]), ms, &(x[0]));
78 T35 = BYTWJ(&(W[TWVL * 78]), T34);
79 T55 = VSUB(T33, T35);
80 T2X = LD(&(x[WS(rs, 56)]), ms, &(x[0]));
81 T2Y = BYTWJ(&(W[TWVL * 110]), T2X);
82 T2Z = LD(&(x[WS(rs, 24)]), ms, &(x[0]));
83 T30 = BYTWJ(&(W[TWVL * 46]), T2Z);
84 T56 = VSUB(T2Y, T30);
85 }
86 {
87 V T4, T9, T6j, T6k;
88 T4 = VADD(T1, T3);
89 T9 = VADD(T6, T8);
90 Ta = VSUB(T4, T9);
91 T3U = VADD(T4, T9);
92 T6j = VSUB(T6, T8);
93 T6k = VSUB(T56, T55);
94 T6l = VFNMS(LDK(KP707106781), T6k, T6j);
95 T7B = VFMA(LDK(KP707106781), T6k, T6j);
96 }
97 {
98 V T31, T36, T54, T57;
99 T31 = VADD(T2Y, T30);
100 T36 = VADD(T33, T35);
101 T37 = VSUB(T31, T36);
102 T3V = VADD(T36, T31);
103 T54 = VSUB(T1, T3);
104 T57 = VADD(T55, T56);
105 T58 = VFMA(LDK(KP707106781), T57, T54);
106 T7a = VFNMS(LDK(KP707106781), T57, T54);
107 }
108 }
/* Inputs 1, 33, 17, 49, 9, 41, 57, 25. */
109 {
110 V T1m, T1o, T1p, T1r, T1t, T1u, T1Y, T5C, T23, T5D, T41, T42;
111 {
112 V T1l, T1n, T1q, T1s;
113 T1l = LD(&(x[WS(rs, 1)]), ms, &(x[WS(rs, 1)]));
114 T1m = BYTWJ(&(W[0]), T1l);
115 T1n = LD(&(x[WS(rs, 33)]), ms, &(x[WS(rs, 1)]));
116 T1o = BYTWJ(&(W[TWVL * 64]), T1n);
117 T1p = VADD(T1m, T1o);
118 T1q = LD(&(x[WS(rs, 17)]), ms, &(x[WS(rs, 1)]));
119 T1r = BYTWJ(&(W[TWVL * 32]), T1q);
120 T1s = LD(&(x[WS(rs, 49)]), ms, &(x[WS(rs, 1)]));
121 T1t = BYTWJ(&(W[TWVL * 96]), T1s);
122 T1u = VADD(T1r, T1t);
123 }
124 {
125 V T1V, T1X, T1U, T1W;
126 T1U = LD(&(x[WS(rs, 9)]), ms, &(x[WS(rs, 1)]));
127 T1V = BYTWJ(&(W[TWVL * 16]), T1U);
128 T1W = LD(&(x[WS(rs, 41)]), ms, &(x[WS(rs, 1)]));
129 T1X = BYTWJ(&(W[TWVL * 80]), T1W);
130 T1Y = VADD(T1V, T1X);
131 T5C = VSUB(T1V, T1X);
132 }
133 {
134 V T20, T22, T1Z, T21;
135 T1Z = LD(&(x[WS(rs, 57)]), ms, &(x[WS(rs, 1)]));
136 T20 = BYTWJ(&(W[TWVL * 112]), T1Z);
137 T21 = LD(&(x[WS(rs, 25)]), ms, &(x[WS(rs, 1)]));
138 T22 = BYTWJ(&(W[TWVL * 48]), T21);
139 T23 = VADD(T20, T22);
140 T5D = VSUB(T20, T22);
141 }
142 T1v = VSUB(T1p, T1u);
143 T24 = VSUB(T1Y, T23);
144 T41 = VADD(T1p, T1u);
145 T42 = VADD(T1Y, T23);
146 T43 = VADD(T41, T42);
147 T4F = VSUB(T41, T42);
148 {
149 V T5B, T5E, T5O, T5P;
150 T5B = VSUB(T1m, T1o);
151 T5E = VADD(T5C, T5D);
152 T5F = VFMA(LDK(KP707106781), T5E, T5B);
153 T7l = VFNMS(LDK(KP707106781), T5E, T5B);
154 T5O = VSUB(T1r, T1t);
155 T5P = VSUB(T5C, T5D);
156 T5Q = VFMA(LDK(KP707106781), T5P, T5O);
157 T7o = VFNMS(LDK(KP707106781), T5P, T5O);
158 }
159 }
/* Inputs 63, 31, 15, 47, 55, 23, 7, 39. */
160 {
161 V T29, T2b, T2c, T2e, T2g, T2h, T2L, T5Y, T2Q, T5X, T48, T49;
162 {
163 V T28, T2a, T2d, T2f;
164 T28 = LD(&(x[WS(rs, 63)]), ms, &(x[WS(rs, 1)]));
165 T29 = BYTWJ(&(W[TWVL * 124]), T28);
166 T2a = LD(&(x[WS(rs, 31)]), ms, &(x[WS(rs, 1)]));
167 T2b = BYTWJ(&(W[TWVL * 60]), T2a);
168 T2c = VADD(T29, T2b);
169 T2d = LD(&(x[WS(rs, 15)]), ms, &(x[WS(rs, 1)]));
170 T2e = BYTWJ(&(W[TWVL * 28]), T2d);
171 T2f = LD(&(x[WS(rs, 47)]), ms, &(x[WS(rs, 1)]));
172 T2g = BYTWJ(&(W[TWVL * 92]), T2f);
173 T2h = VADD(T2e, T2g);
174 }
175 {
176 V T2I, T2K, T2H, T2J;
177 T2H = LD(&(x[WS(rs, 55)]), ms, &(x[WS(rs, 1)]));
178 T2I = BYTWJ(&(W[TWVL * 108]), T2H);
179 T2J = LD(&(x[WS(rs, 23)]), ms, &(x[WS(rs, 1)]));
180 T2K = BYTWJ(&(W[TWVL * 44]), T2J);
181 T2L = VADD(T2I, T2K);
182 T5Y = VSUB(T2I, T2K);
183 }
184 {
185 V T2N, T2P, T2M, T2O;
186 T2M = LD(&(x[WS(rs, 7)]), ms, &(x[WS(rs, 1)]));
187 T2N = BYTWJ(&(W[TWVL * 12]), T2M);
188 T2O = LD(&(x[WS(rs, 39)]), ms, &(x[WS(rs, 1)]));
189 T2P = BYTWJ(&(W[TWVL * 76]), T2O);
190 T2Q = VADD(T2N, T2P);
191 T5X = VSUB(T2N, T2P);
192 }
193 T2i = VSUB(T2c, T2h);
194 T2R = VSUB(T2L, T2Q);
195 T48 = VADD(T2c, T2h);
196 T49 = VADD(T2Q, T2L);
197 T4a = VADD(T48, T49);
198 T4I = VSUB(T48, T49);
199 {
200 V T5W, T5Z, T69, T6a;
201 T5W = VSUB(T29, T2b);
202 T5Z = VADD(T5X, T5Y);
203 T60 = VFMA(LDK(KP707106781), T5Z, T5W);
204 T7s = VFNMS(LDK(KP707106781), T5Z, T5W);
205 T69 = VSUB(T2g, T2e);
206 T6a = VSUB(T5Y, T5X);
207 T6b = VFMA(LDK(KP707106781), T6a, T69);
208 T7v = VFNMS(LDK(KP707106781), T6a, T69);
209 }
210 }
/* Inputs 62, 30, 14, 46, 54, 22, 6, 38. */
211 {
212 V TX, TZ, T10, T12, T14, T15, T1b, T5s, T1g, T5r, T5v, T5w;
213 {
214 V TW, TY, T11, T13;
215 TW = LD(&(x[WS(rs, 62)]), ms, &(x[0]));
216 TX = BYTWJ(&(W[TWVL * 122]), TW);
217 TY = LD(&(x[WS(rs, 30)]), ms, &(x[0]));
218 TZ = BYTWJ(&(W[TWVL * 58]), TY);
219 T10 = VADD(TX, TZ);
220 T11 = LD(&(x[WS(rs, 14)]), ms, &(x[0]));
221 T12 = BYTWJ(&(W[TWVL * 26]), T11);
222 T13 = LD(&(x[WS(rs, 46)]), ms, &(x[0]));
223 T14 = BYTWJ(&(W[TWVL * 90]), T13);
224 T15 = VADD(T12, T14);
225 }
226 {
227 V T18, T1a, T17, T19;
228 T17 = LD(&(x[WS(rs, 54)]), ms, &(x[0]));
229 T18 = BYTWJ(&(W[TWVL * 106]), T17);
230 T19 = LD(&(x[WS(rs, 22)]), ms, &(x[0]));
231 T1a = BYTWJ(&(W[TWVL * 42]), T19);
232 T1b = VADD(T18, T1a);
233 T5s = VSUB(T18, T1a);
234 }
235 {
236 V T1d, T1f, T1c, T1e;
237 T1c = LD(&(x[WS(rs, 6)]), ms, &(x[0]));
238 T1d = BYTWJ(&(W[TWVL * 10]), T1c);
239 T1e = LD(&(x[WS(rs, 38)]), ms, &(x[0]));
240 T1f = BYTWJ(&(W[TWVL * 74]), T1e);
241 T1g = VADD(T1d, T1f);
242 T5r = VSUB(T1d, T1f);
243 }
244 T4h = VADD(T10, T15);
245 T4i = VADD(T1g, T1b);
246 T4C = VSUB(T4h, T4i);
247 T5v = VSUB(T14, T12);
248 T5w = VSUB(T5s, T5r);
249 T5x = VFMA(LDK(KP707106781), T5w, T5v);
250 T7g = VFNMS(LDK(KP707106781), T5w, T5v);
251 {
252 V T16, T1h, T5q, T5t;
253 T16 = VSUB(T10, T15);
254 T1h = VSUB(T1b, T1g);
255 T1i = VFNMS(LDK(KP414213562), T1h, T16);
256 T3a = VFMA(LDK(KP414213562), T16, T1h);
257 T5q = VSUB(TX, TZ);
258 T5t = VADD(T5r, T5s);
259 T5u = VFMA(LDK(KP707106781), T5t, T5q);
260 T7h = VFNMS(LDK(KP707106781), T5t, T5q);
261 }
262 }
/* Inputs 2, 34, 18, 50, 10, 42, 58, 26. */
263 {
264 V TA, TC, TD, TF, TH, TI, TO, T5i, TT, T5j, T5m, T5n;
265 {
266 V Tz, TB, TE, TG;
267 Tz = LD(&(x[WS(rs, 2)]), ms, &(x[0]));
268 TA = BYTWJ(&(W[TWVL * 2]), Tz);
269 TB = LD(&(x[WS(rs, 34)]), ms, &(x[0]));
270 TC = BYTWJ(&(W[TWVL * 66]), TB);
271 TD = VADD(TA, TC);
272 TE = LD(&(x[WS(rs, 18)]), ms, &(x[0]));
273 TF = BYTWJ(&(W[TWVL * 34]), TE);
274 TG = LD(&(x[WS(rs, 50)]), ms, &(x[0]));
275 TH = BYTWJ(&(W[TWVL * 98]), TG);
276 TI = VADD(TF, TH);
277 }
278 {
279 V TL, TN, TK, TM;
280 TK = LD(&(x[WS(rs, 10)]), ms, &(x[0]));
281 TL = BYTWJ(&(W[TWVL * 18]), TK);
282 TM = LD(&(x[WS(rs, 42)]), ms, &(x[0]));
283 TN = BYTWJ(&(W[TWVL * 82]), TM);
284 TO = VADD(TL, TN);
285 T5i = VSUB(TL, TN);
286 }
287 {
288 V TQ, TS, TP, TR;
289 TP = LD(&(x[WS(rs, 58)]), ms, &(x[0]));
290 TQ = BYTWJ(&(W[TWVL * 114]), TP);
291 TR = LD(&(x[WS(rs, 26)]), ms, &(x[0]));
292 TS = BYTWJ(&(W[TWVL * 50]), TR);
293 TT = VADD(TQ, TS);
294 T5j = VSUB(TQ, TS);
295 }
296 T4k = VADD(TD, TI);
297 T4l = VADD(TO, TT);
298 T4B = VSUB(T4k, T4l);
299 T5m = VSUB(TF, TH);
300 T5n = VSUB(T5i, T5j);
301 T5o = VFMA(LDK(KP707106781), T5n, T5m);
302 T7d = VFNMS(LDK(KP707106781), T5n, T5m);
303 {
304 V TJ, TU, T5h, T5k;
305 TJ = VSUB(TD, TI);
306 TU = VSUB(TO, TT);
307 TV = VFNMS(LDK(KP414213562), TU, TJ);
308 T3b = VFMA(LDK(KP414213562), TJ, TU);
309 T5h = VSUB(TA, TC);
310 T5k = VADD(T5i, T5j);
311 T5l = VFMA(LDK(KP707106781), T5k, T5h);
312 T7e = VFNMS(LDK(KP707106781), T5k, T5h);
313 }
314 }
/* Inputs 4, 36, 12, 44, 20, 52, 60, 28. */
315 {
316 V Tf, T59, Tv, T5d, Tk, T5a, Tq, T5c, Tl, Tw;
317 {
318 V Tc, Te, Tb, Td;
319 Tb = LD(&(x[WS(rs, 4)]), ms, &(x[0]));
320 Tc = BYTWJ(&(W[TWVL * 6]), Tb);
321 Td = LD(&(x[WS(rs, 36)]), ms, &(x[0]));
322 Te = BYTWJ(&(W[TWVL * 70]), Td);
323 Tf = VADD(Tc, Te);
324 T59 = VSUB(Tc, Te);
325 }
326 {
327 V Ts, Tu, Tr, Tt;
328 Tr = LD(&(x[WS(rs, 12)]), ms, &(x[0]));
329 Ts = BYTWJ(&(W[TWVL * 22]), Tr);
330 Tt = LD(&(x[WS(rs, 44)]), ms, &(x[0]));
331 Tu = BYTWJ(&(W[TWVL * 86]), Tt);
332 Tv = VADD(Ts, Tu);
333 T5d = VSUB(Tu, Ts);
334 }
335 {
336 V Th, Tj, Tg, Ti;
337 Tg = LD(&(x[WS(rs, 20)]), ms, &(x[0]));
338 Th = BYTWJ(&(W[TWVL * 38]), Tg);
339 Ti = LD(&(x[WS(rs, 52)]), ms, &(x[0]));
340 Tj = BYTWJ(&(W[TWVL * 102]), Ti);
341 Tk = VADD(Th, Tj);
342 T5a = VSUB(Th, Tj);
343 }
344 {
345 V Tn, Tp, Tm, To;
346 Tm = LD(&(x[WS(rs, 60)]), ms, &(x[0]));
347 Tn = BYTWJ(&(W[TWVL * 118]), Tm);
348 To = LD(&(x[WS(rs, 28)]), ms, &(x[0]));
349 Tp = BYTWJ(&(W[TWVL * 54]), To);
350 Tq = VADD(Tn, Tp);
351 T5c = VSUB(Tn, Tp);
352 }
353 T3X = VADD(Tf, Tk);
354 T3Y = VADD(Tq, Tv);
355 Tl = VSUB(Tf, Tk);
356 Tw = VSUB(Tq, Tv);
357 Tx = VADD(Tl, Tw);
358 T38 = VSUB(Tw, Tl);
359 {
360 V T5b, T5e, T6m, T6n;
361 T5b = VFNMS(LDK(KP414213562), T5a, T59);
362 T5e = VFNMS(LDK(KP414213562), T5d, T5c);
363 T5f = VADD(T5b, T5e);
364 T7C = VSUB(T5e, T5b);
365 T6m = VFMA(LDK(KP414213562), T59, T5a);
366 T6n = VFMA(LDK(KP414213562), T5c, T5d);
367 T6o = VSUB(T6m, T6n);
368 T7b = VADD(T6m, T6n);
369 }
370 }
/* Inputs 5, 37, 13, 45, 21, 53, 61, 29. */
371 {
372 V T1A, T5G, T1Q, T5K, T1F, T5H, T1L, T5J;
373 {
374 V T1x, T1z, T1w, T1y;
375 T1w = LD(&(x[WS(rs, 5)]), ms, &(x[WS(rs, 1)]));
376 T1x = BYTWJ(&(W[TWVL * 8]), T1w);
377 T1y = LD(&(x[WS(rs, 37)]), ms, &(x[WS(rs, 1)]));
378 T1z = BYTWJ(&(W[TWVL * 72]), T1y);
379 T1A = VADD(T1x, T1z);
380 T5G = VSUB(T1x, T1z);
381 }
382 {
383 V T1N, T1P, T1M, T1O;
384 T1M = LD(&(x[WS(rs, 13)]), ms, &(x[WS(rs, 1)]));
385 T1N = BYTWJ(&(W[TWVL * 24]), T1M);
386 T1O = LD(&(x[WS(rs, 45)]), ms, &(x[WS(rs, 1)]));
387 T1P = BYTWJ(&(W[TWVL * 88]), T1O);
388 T1Q = VADD(T1N, T1P);
389 T5K = VSUB(T1N, T1P);
390 }
391 {
392 V T1C, T1E, T1B, T1D;
393 T1B = LD(&(x[WS(rs, 21)]), ms, &(x[WS(rs, 1)]));
394 T1C = BYTWJ(&(W[TWVL * 40]), T1B);
395 T1D = LD(&(x[WS(rs, 53)]), ms, &(x[WS(rs, 1)]));
396 T1E = BYTWJ(&(W[TWVL * 104]), T1D);
397 T1F = VADD(T1C, T1E);
398 T5H = VSUB(T1C, T1E);
399 }
400 {
401 V T1I, T1K, T1H, T1J;
402 T1H = LD(&(x[WS(rs, 61)]), ms, &(x[WS(rs, 1)]));
403 T1I = BYTWJ(&(W[TWVL * 120]), T1H);
404 T1J = LD(&(x[WS(rs, 29)]), ms, &(x[WS(rs, 1)]));
405 T1K = BYTWJ(&(W[TWVL * 56]), T1J);
406 T1L = VADD(T1I, T1K);
407 T5J = VSUB(T1I, T1K);
408 }
409 {
410 V T1G, T1R, T5R, T5S;
411 T1G = VSUB(T1A, T1F);
412 T1R = VSUB(T1L, T1Q);
413 T1S = VADD(T1G, T1R);
414 T25 = VSUB(T1G, T1R);
415 T5R = VFMA(LDK(KP414213562), T5G, T5H);
416 T5S = VFNMS(LDK(KP414213562), T5J, T5K);
417 T5T = VADD(T5R, T5S);
418 T7m = VSUB(T5R, T5S);
419 }
420 {
421 V T44, T45, T5I, T5L;
422 T44 = VADD(T1A, T1F);
423 T45 = VADD(T1L, T1Q);
424 T46 = VADD(T44, T45);
425 T4G = VSUB(T44, T45);
426 T5I = VFNMS(LDK(KP414213562), T5H, T5G);
427 T5L = VFMA(LDK(KP414213562), T5K, T5J);
428 T5M = VADD(T5I, T5L);
429 T7p = VSUB(T5I, T5L);
430 }
431 }
/* Inputs 3, 35, 11, 43, 19, 51, 59, 27. */
432 {
433 V T2n, T61, T2D, T65, T2s, T62, T2y, T64;
434 {
435 V T2k, T2m, T2j, T2l;
436 T2j = LD(&(x[WS(rs, 3)]), ms, &(x[WS(rs, 1)]));
437 T2k = BYTWJ(&(W[TWVL * 4]), T2j);
438 T2l = LD(&(x[WS(rs, 35)]), ms, &(x[WS(rs, 1)]));
439 T2m = BYTWJ(&(W[TWVL * 68]), T2l);
440 T2n = VADD(T2k, T2m);
441 T61 = VSUB(T2k, T2m);
442 }
443 {
444 V T2A, T2C, T2z, T2B;
445 T2z = LD(&(x[WS(rs, 11)]), ms, &(x[WS(rs, 1)]));
446 T2A = BYTWJ(&(W[TWVL * 20]), T2z);
447 T2B = LD(&(x[WS(rs, 43)]), ms, &(x[WS(rs, 1)]));
448 T2C = BYTWJ(&(W[TWVL * 84]), T2B);
449 T2D = VADD(T2A, T2C);
450 T65 = VSUB(T2C, T2A);
451 }
452 {
453 V T2p, T2r, T2o, T2q;
454 T2o = LD(&(x[WS(rs, 19)]), ms, &(x[WS(rs, 1)]));
455 T2p = BYTWJ(&(W[TWVL * 36]), T2o);
456 T2q = LD(&(x[WS(rs, 51)]), ms, &(x[WS(rs, 1)]));
457 T2r = BYTWJ(&(W[TWVL * 100]), T2q);
458 T2s = VADD(T2p, T2r);
459 T62 = VSUB(T2r, T2p);
460 }
461 {
462 V T2v, T2x, T2u, T2w;
463 T2u = LD(&(x[WS(rs, 59)]), ms, &(x[WS(rs, 1)]));
464 T2v = BYTWJ(&(W[TWVL * 116]), T2u);
465 T2w = LD(&(x[WS(rs, 27)]), ms, &(x[WS(rs, 1)]));
466 T2x = BYTWJ(&(W[TWVL * 52]), T2w);
467 T2y = VADD(T2v, T2x);
468 T64 = VSUB(T2v, T2x);
469 }
470 {
471 V T2t, T2E, T6c, T6d;
472 T2t = VSUB(T2n, T2s);
473 T2E = VSUB(T2y, T2D);
474 T2F = VADD(T2t, T2E);
475 T2S = VSUB(T2E, T2t);
476 T6c = VFNMS(LDK(KP414213562), T61, T62);
477 T6d = VFMA(LDK(KP414213562), T64, T65);
478 T6e = VADD(T6c, T6d);
479 T7t = VSUB(T6d, T6c);
480 }
481 {
482 V T4b, T4c, T63, T66;
483 T4b = VADD(T2n, T2s);
484 T4c = VADD(T2y, T2D);
485 T4d = VADD(T4b, T4c);
486 T4J = VSUB(T4c, T4b);
487 T63 = VFMA(LDK(KP414213562), T62, T61);
488 T66 = VFNMS(LDK(KP414213562), T65, T64);
489 T67 = VADD(T63, T66);
490 T7w = VSUB(T66, T63);
491 }
492 }
/* All 64 inputs have been read by this point; the remaining sections only
 * combine intermediates and store.  Outputs 32, 0, 48, 16, 24, 40, 56, 8. */
493 {
494 V T40, T4s, T4x, T4z, T4f, T4o, T4n, T4t, T4u, T4y;
495 {
496 V T3W, T3Z, T4v, T4w;
497 T3W = VADD(T3U, T3V);
498 T3Z = VADD(T3X, T3Y);
499 T40 = VSUB(T3W, T3Z);
500 T4s = VADD(T3W, T3Z);
501 T4v = VADD(T43, T46);
502 T4w = VADD(T4a, T4d);
503 T4x = VADD(T4v, T4w);
504 T4z = VSUB(T4w, T4v);
505 }
506 {
507 V T47, T4e, T4j, T4m;
508 T47 = VSUB(T43, T46);
509 T4e = VSUB(T4a, T4d);
510 T4f = VADD(T47, T4e);
511 T4o = VSUB(T4e, T47);
512 T4j = VADD(T4h, T4i);
513 T4m = VADD(T4k, T4l);
514 T4n = VSUB(T4j, T4m);
515 T4t = VADD(T4m, T4j);
516 }
517 T4u = VADD(T4s, T4t);
518 ST(&(x[WS(rs, 32)]), VSUB(T4u, T4x), ms, &(x[0]));
519 ST(&(x[0]), VADD(T4u, T4x), ms, &(x[0]));
520 T4y = VSUB(T4s, T4t);
521 ST(&(x[WS(rs, 48)]), VFNMSI(T4z, T4y), ms, &(x[0]));
522 ST(&(x[WS(rs, 16)]), VFMAI(T4z, T4y), ms, &(x[0]));
523 {
524 V T4g, T4p, T4q, T4r;
525 T4g = VFNMS(LDK(KP707106781), T4f, T40);
526 T4p = VFNMS(LDK(KP707106781), T4o, T4n);
527 ST(&(x[WS(rs, 24)]), VFNMSI(T4p, T4g), ms, &(x[0]));
528 ST(&(x[WS(rs, 40)]), VFMAI(T4p, T4g), ms, &(x[0]));
529 T4q = VFMA(LDK(KP707106781), T4f, T40);
530 T4r = VFMA(LDK(KP707106781), T4o, T4n);
531 ST(&(x[WS(rs, 56)]), VFNMSI(T4r, T4q), ms, &(x[0]));
532 ST(&(x[WS(rs, 8)]), VFMAI(T4r, T4q), ms, &(x[0]));
533 }
534 }
/* Outputs 28, 36, 12, 52, 60, 4, 20, 44. */
535 {
536 V T4E, T4W, T4S, T4X, T4L, T50, T4P, T4Z;
537 {
538 V T4A, T4D, T4Q, T4R;
539 T4A = VSUB(T3U, T3V);
540 T4D = VADD(T4B, T4C);
541 T4E = VFMA(LDK(KP707106781), T4D, T4A);
542 T4W = VFNMS(LDK(KP707106781), T4D, T4A);
543 T4Q = VFMA(LDK(KP414213562), T4I, T4J);
544 T4R = VFMA(LDK(KP414213562), T4F, T4G);
545 T4S = VSUB(T4Q, T4R);
546 T4X = VADD(T4R, T4Q);
547 }
548 {
549 V T4H, T4K, T4N, T4O;
550 T4H = VFNMS(LDK(KP414213562), T4G, T4F);
551 T4K = VFNMS(LDK(KP414213562), T4J, T4I);
552 T4L = VADD(T4H, T4K);
553 T50 = VSUB(T4K, T4H);
554 T4N = VSUB(T3Y, T3X);
555 T4O = VSUB(T4C, T4B);
556 T4P = VFMA(LDK(KP707106781), T4O, T4N);
557 T4Z = VFNMS(LDK(KP707106781), T4O, T4N);
558 }
559 {
560 V T4M, T4T, T52, T53;
561 T4M = VFNMS(LDK(KP923879532), T4L, T4E);
562 T4T = VFNMS(LDK(KP923879532), T4S, T4P);
563 ST(&(x[WS(rs, 28)]), VFNMSI(T4T, T4M), ms, &(x[0]));
564 ST(&(x[WS(rs, 36)]), VFMAI(T4T, T4M), ms, &(x[0]));
565 T52 = VFMA(LDK(KP923879532), T4X, T4W);
566 T53 = VFNMS(LDK(KP923879532), T50, T4Z);
567 ST(&(x[WS(rs, 12)]), VFNMSI(T53, T52), ms, &(x[0]));
568 ST(&(x[WS(rs, 52)]), VFMAI(T53, T52), ms, &(x[0]));
569 }
570 {
571 V T4U, T4V, T4Y, T51;
572 T4U = VFMA(LDK(KP923879532), T4L, T4E);
573 T4V = VFMA(LDK(KP923879532), T4S, T4P);
574 ST(&(x[WS(rs, 60)]), VFNMSI(T4V, T4U), ms, &(x[0]));
575 ST(&(x[WS(rs, 4)]), VFMAI(T4V, T4U), ms, &(x[0]));
576 T4Y = VFNMS(LDK(KP923879532), T4X, T4W);
577 T51 = VFMA(LDK(KP923879532), T50, T4Z);
578 ST(&(x[WS(rs, 20)]), VFMAI(T51, T4Y), ms, &(x[0]));
579 ST(&(x[WS(rs, 44)]), VFNMSI(T51, T4Y), ms, &(x[0]));
580 }
581 }
/* Outputs 30, 34, 14, 50, 62, 2, 18, 46. */
582 {
583 V T1k, T3k, T3d, T3n, T2V, T3o, T3g, T3l;
584 {
585 V Ty, T1j, T39, T3c;
586 Ty = VFMA(LDK(KP707106781), Tx, Ta);
587 T1j = VADD(TV, T1i);
588 T1k = VFMA(LDK(KP923879532), T1j, Ty);
589 T3k = VFNMS(LDK(KP923879532), T1j, Ty);
590 T39 = VFMA(LDK(KP707106781), T38, T37);
591 T3c = VSUB(T3a, T3b);
592 T3d = VFMA(LDK(KP923879532), T3c, T39);
593 T3n = VFNMS(LDK(KP923879532), T3c, T39);
594 {
595 V T27, T3f, T2U, T3e;
596 {
597 V T1T, T26, T2G, T2T;
598 T1T = VFMA(LDK(KP707106781), T1S, T1v);
599 T26 = VFMA(LDK(KP707106781), T25, T24);
600 T27 = VFNMS(LDK(KP198912367), T26, T1T);
601 T3f = VFMA(LDK(KP198912367), T1T, T26);
602 T2G = VFMA(LDK(KP707106781), T2F, T2i);
603 T2T = VFMA(LDK(KP707106781), T2S, T2R);
604 T2U = VFNMS(LDK(KP198912367), T2T, T2G);
605 T3e = VFMA(LDK(KP198912367), T2G, T2T);
606 }
607 T2V = VADD(T27, T2U);
608 T3o = VSUB(T2U, T27);
609 T3g = VSUB(T3e, T3f);
610 T3l = VADD(T3f, T3e);
611 }
612 }
613 {
614 V T2W, T3h, T3q, T3r;
615 T2W = VFNMS(LDK(KP980785280), T2V, T1k);
616 T3h = VFNMS(LDK(KP980785280), T3g, T3d);
617 ST(&(x[WS(rs, 30)]), VFNMSI(T3h, T2W), ms, &(x[0]));
618 ST(&(x[WS(rs, 34)]), VFMAI(T3h, T2W), ms, &(x[0]));
619 T3q = VFMA(LDK(KP980785280), T3l, T3k);
620 T3r = VFNMS(LDK(KP980785280), T3o, T3n);
621 ST(&(x[WS(rs, 14)]), VFNMSI(T3r, T3q), ms, &(x[0]));
622 ST(&(x[WS(rs, 50)]), VFMAI(T3r, T3q), ms, &(x[0]));
623 }
624 {
625 V T3i, T3j, T3m, T3p;
626 T3i = VFMA(LDK(KP980785280), T2V, T1k);
627 T3j = VFMA(LDK(KP980785280), T3g, T3d);
628 ST(&(x[WS(rs, 62)]), VFNMSI(T3j, T3i), ms, &(x[0]));
629 ST(&(x[WS(rs, 2)]), VFMAI(T3j, T3i), ms, &(x[0]));
630 T3m = VFNMS(LDK(KP980785280), T3l, T3k);
631 T3p = VFMA(LDK(KP980785280), T3o, T3n);
632 ST(&(x[WS(rs, 18)]), VFMAI(T3p, T3m), ms, &(x[0]));
633 ST(&(x[WS(rs, 46)]), VFNMSI(T3p, T3m), ms, &(x[0]));
634 }
635 }
/* Outputs 38, 26, 10, 54, 6, 58, 22, 42. */
636 {
637 V T3u, T3M, T3F, T3P, T3B, T3Q, T3I, T3N;
638 {
639 V T3s, T3t, T3D, T3E;
640 T3s = VFNMS(LDK(KP707106781), Tx, Ta);
641 T3t = VADD(T3b, T3a);
642 T3u = VFMA(LDK(KP923879532), T3t, T3s);
643 T3M = VFNMS(LDK(KP923879532), T3t, T3s);
644 T3D = VFNMS(LDK(KP707106781), T38, T37);
645 T3E = VSUB(T1i, TV);
646 T3F = VFNMS(LDK(KP923879532), T3E, T3D);
647 T3P = VFMA(LDK(KP923879532), T3E, T3D);
648 {
649 V T3x, T3H, T3A, T3G;
650 {
651 V T3v, T3w, T3y, T3z;
652 T3v = VFNMS(LDK(KP707106781), T1S, T1v);
653 T3w = VFNMS(LDK(KP707106781), T25, T24);
654 T3x = VFMA(LDK(KP668178637), T3w, T3v);
655 T3H = VFNMS(LDK(KP668178637), T3v, T3w);
656 T3y = VFNMS(LDK(KP707106781), T2F, T2i);
657 T3z = VFNMS(LDK(KP707106781), T2S, T2R);
658 T3A = VFMA(LDK(KP668178637), T3z, T3y);
659 T3G = VFNMS(LDK(KP668178637), T3y, T3z);
660 }
661 T3B = VADD(T3x, T3A);
662 T3Q = VSUB(T3A, T3x);
663 T3I = VSUB(T3G, T3H);
664 T3N = VADD(T3H, T3G);
665 }
666 }
667 {
668 V T3C, T3J, T3S, T3T;
669 T3C = VFNMS(LDK(KP831469612), T3B, T3u);
670 T3J = VFNMS(LDK(KP831469612), T3I, T3F);
671 ST(&(x[WS(rs, 38)]), VFNMSI(T3J, T3C), ms, &(x[0]));
672 ST(&(x[WS(rs, 26)]), VFMAI(T3J, T3C), ms, &(x[0]));
673 T3S = VFNMS(LDK(KP831469612), T3N, T3M);
674 T3T = VFMA(LDK(KP831469612), T3Q, T3P);
675 ST(&(x[WS(rs, 10)]), VFMAI(T3T, T3S), ms, &(x[0]));
676 ST(&(x[WS(rs, 54)]), VFNMSI(T3T, T3S), ms, &(x[0]));
677 }
678 {
679 V T3K, T3L, T3O, T3R;
680 T3K = VFMA(LDK(KP831469612), T3B, T3u);
681 T3L = VFMA(LDK(KP831469612), T3I, T3F);
682 ST(&(x[WS(rs, 6)]), VFNMSI(T3L, T3K), ms, &(x[0]));
683 ST(&(x[WS(rs, 58)]), VFMAI(T3L, T3K), ms, &(x[0]));
684 T3O = VFMA(LDK(KP831469612), T3N, T3M);
685 T3R = VFNMS(LDK(KP831469612), T3Q, T3P);
686 ST(&(x[WS(rs, 22)]), VFNMSI(T3R, T3O), ms, &(x[0]));
687 ST(&(x[WS(rs, 42)]), VFMAI(T3R, T3O), ms, &(x[0]));
688 }
689 }
/* Outputs 37, 27, 19, 45, 13, 51, 5, 59, 21, 43, 29, 35, 61, 3, 11, 53. */
690 {
691 V T7k, T8j, T7O, T89, T7H, T8g, T7R, T7Y, T7z, T7S, T7K, T7P, T85, T8k, T8c;
692 V T8h;
693 {
694 V T7c, T87, T7j, T88, T7f, T7i;
695 T7c = VFNMS(LDK(KP923879532), T7b, T7a);
696 T87 = VFMA(LDK(KP923879532), T7C, T7B);
697 T7f = VFNMS(LDK(KP668178637), T7e, T7d);
698 T7i = VFNMS(LDK(KP668178637), T7h, T7g);
699 T7j = VADD(T7f, T7i);
700 T88 = VSUB(T7f, T7i);
701 T7k = VFNMS(LDK(KP831469612), T7j, T7c);
702 T8j = VFNMS(LDK(KP831469612), T88, T87);
703 T7O = VFMA(LDK(KP831469612), T7j, T7c);
704 T89 = VFMA(LDK(KP831469612), T88, T87);
705 }
706 {
707 V T7D, T7W, T7G, T7X, T7E, T7F;
708 T7D = VFNMS(LDK(KP923879532), T7C, T7B);
709 T7W = VFMA(LDK(KP923879532), T7b, T7a);
710 T7E = VFMA(LDK(KP668178637), T7g, T7h);
711 T7F = VFMA(LDK(KP668178637), T7d, T7e);
712 T7G = VSUB(T7E, T7F);
713 T7X = VADD(T7F, T7E);
714 T7H = VFNMS(LDK(KP831469612), T7G, T7D);
715 T8g = VFNMS(LDK(KP831469612), T7X, T7W);
716 T7R = VFMA(LDK(KP831469612), T7G, T7D);
717 T7Y = VFMA(LDK(KP831469612), T7X, T7W);
718 }
719 {
720 V T7r, T7I, T7y, T7J;
721 {
722 V T7n, T7q, T7u, T7x;
723 T7n = VFNMS(LDK(KP923879532), T7m, T7l);
724 T7q = VFMA(LDK(KP923879532), T7p, T7o);
725 T7r = VFNMS(LDK(KP534511135), T7q, T7n);
726 T7I = VFMA(LDK(KP534511135), T7n, T7q);
727 T7u = VFNMS(LDK(KP923879532), T7t, T7s);
728 T7x = VFMA(LDK(KP923879532), T7w, T7v);
729 T7y = VFNMS(LDK(KP534511135), T7x, T7u);
730 T7J = VFMA(LDK(KP534511135), T7u, T7x);
731 }
732 T7z = VADD(T7r, T7y);
733 T7S = VSUB(T7y, T7r);
734 T7K = VSUB(T7I, T7J);
735 T7P = VADD(T7I, T7J);
736 }
737 {
738 V T81, T8a, T84, T8b;
739 {
740 V T7Z, T80, T82, T83;
741 T7Z = VFMA(LDK(KP923879532), T7m, T7l);
742 T80 = VFNMS(LDK(KP923879532), T7p, T7o);
743 T81 = VFMA(LDK(KP303346683), T80, T7Z);
744 T8a = VFNMS(LDK(KP303346683), T7Z, T80);
745 T82 = VFMA(LDK(KP923879532), T7t, T7s);
746 T83 = VFNMS(LDK(KP923879532), T7w, T7v);
747 T84 = VFMA(LDK(KP303346683), T83, T82);
748 T8b = VFNMS(LDK(KP303346683), T82, T83);
749 }
750 T85 = VADD(T81, T84);
751 T8k = VSUB(T84, T81);
752 T8c = VSUB(T8a, T8b);
753 T8h = VADD(T8a, T8b);
754 }
755 {
756 V T7A, T7L, T8i, T8l;
757 T7A = VFNMS(LDK(KP881921264), T7z, T7k);
758 T7L = VFNMS(LDK(KP881921264), T7K, T7H);
759 ST(&(x[WS(rs, 37)]), VFNMSI(T7L, T7A), ms, &(x[WS(rs, 1)]));
760 ST(&(x[WS(rs, 27)]), VFMAI(T7L, T7A), ms, &(x[WS(rs, 1)]));
761 T8i = VFMA(LDK(KP956940335), T8h, T8g);
762 T8l = VFMA(LDK(KP956940335), T8k, T8j);
763 ST(&(x[WS(rs, 19)]), VFMAI(T8l, T8i), ms, &(x[WS(rs, 1)]));
764 ST(&(x[WS(rs, 45)]), VFNMSI(T8l, T8i), ms, &(x[WS(rs, 1)]));
765 }
766 {
767 V T8m, T8n, T7M, T7N;
768 T8m = VFNMS(LDK(KP956940335), T8h, T8g);
769 T8n = VFNMS(LDK(KP956940335), T8k, T8j);
770 ST(&(x[WS(rs, 13)]), VFNMSI(T8n, T8m), ms, &(x[WS(rs, 1)]));
771 ST(&(x[WS(rs, 51)]), VFMAI(T8n, T8m), ms, &(x[WS(rs, 1)]));
772 T7M = VFMA(LDK(KP881921264), T7z, T7k);
773 T7N = VFMA(LDK(KP881921264), T7K, T7H);
774 ST(&(x[WS(rs, 5)]), VFNMSI(T7N, T7M), ms, &(x[WS(rs, 1)]));
775 ST(&(x[WS(rs, 59)]), VFMAI(T7N, T7M), ms, &(x[WS(rs, 1)]));
776 }
777 {
778 V T7Q, T7T, T86, T8d;
779 T7Q = VFNMS(LDK(KP881921264), T7P, T7O);
780 T7T = VFNMS(LDK(KP881921264), T7S, T7R);
781 ST(&(x[WS(rs, 21)]), VFNMSI(T7T, T7Q), ms, &(x[WS(rs, 1)]));
782 ST(&(x[WS(rs, 43)]), VFMAI(T7T, T7Q), ms, &(x[WS(rs, 1)]));
783 T86 = VFNMS(LDK(KP956940335), T85, T7Y);
784 T8d = VFNMS(LDK(KP956940335), T8c, T89);
785 ST(&(x[WS(rs, 29)]), VFNMSI(T8d, T86), ms, &(x[WS(rs, 1)]));
786 ST(&(x[WS(rs, 35)]), VFMAI(T8d, T86), ms, &(x[WS(rs, 1)]));
787 }
788 {
789 V T8e, T8f, T7U, T7V;
790 T8e = VFMA(LDK(KP956940335), T85, T7Y);
791 T8f = VFMA(LDK(KP956940335), T8c, T89);
792 ST(&(x[WS(rs, 61)]), VFNMSI(T8f, T8e), ms, &(x[WS(rs, 1)]));
793 ST(&(x[WS(rs, 3)]), VFMAI(T8f, T8e), ms, &(x[WS(rs, 1)]));
794 T7U = VFMA(LDK(KP881921264), T7P, T7O);
795 T7V = VFMA(LDK(KP881921264), T7S, T7R);
796 ST(&(x[WS(rs, 11)]), VFMAI(T7V, T7U), ms, &(x[WS(rs, 1)]));
797 ST(&(x[WS(rs, 53)]), VFNMSI(T7V, T7U), ms, &(x[WS(rs, 1)]));
798 }
799 }
/* Outputs 33, 31, 23, 41, 9, 55, 1, 63, 17, 47, 25, 39, 57, 7, 15, 49. */
800 {
801 V T5A, T75, T6A, T6V, T6t, T72, T6D, T6K, T6h, T6E, T6w, T6B, T6R, T76, T6Y;
802 V T73;
803 {
804 V T5g, T6T, T5z, T6U, T5p, T5y;
805 T5g = VFMA(LDK(KP923879532), T5f, T58);
806 T6T = VFNMS(LDK(KP923879532), T6o, T6l);
807 T5p = VFNMS(LDK(KP198912367), T5o, T5l);
808 T5y = VFNMS(LDK(KP198912367), T5x, T5u);
809 T5z = VADD(T5p, T5y);
810 T6U = VSUB(T5y, T5p);
811 T5A = VFMA(LDK(KP980785280), T5z, T5g);
812 T75 = VFNMS(LDK(KP980785280), T6U, T6T);
813 T6A = VFNMS(LDK(KP980785280), T5z, T5g);
814 T6V = VFMA(LDK(KP980785280), T6U, T6T);
815 }
816 {
817 V T6p, T6I, T6s, T6J, T6q, T6r;
818 T6p = VFMA(LDK(KP923879532), T6o, T6l);
819 T6I = VFNMS(LDK(KP923879532), T5f, T58);
820 T6q = VFMA(LDK(KP198912367), T5l, T5o);
821 T6r = VFMA(LDK(KP198912367), T5u, T5x);
822 T6s = VSUB(T6q, T6r);
823 T6J = VADD(T6q, T6r);
824 T6t = VFMA(LDK(KP980785280), T6s, T6p);
825 T72 = VFNMS(LDK(KP980785280), T6J, T6I);
826 T6D = VFNMS(LDK(KP980785280), T6s, T6p);
827 T6K = VFMA(LDK(KP980785280), T6J, T6I);
828 }
829 {
830 V T5V, T6u, T6g, T6v;
831 {
832 V T5N, T5U, T68, T6f;
833 T5N = VFMA(LDK(KP923879532), T5M, T5F);
834 T5U = VFMA(LDK(KP923879532), T5T, T5Q);
835 T5V = VFNMS(LDK(KP098491403), T5U, T5N);
836 T6u = VFMA(LDK(KP098491403), T5N, T5U);
837 T68 = VFMA(LDK(KP923879532), T67, T60);
838 T6f = VFMA(LDK(KP923879532), T6e, T6b);
839 T6g = VFNMS(LDK(KP098491403), T6f, T68);
840 T6v = VFMA(LDK(KP098491403), T68, T6f);
841 }
842 T6h = VADD(T5V, T6g);
843 T6E = VSUB(T6g, T5V);
844 T6w = VSUB(T6u, T6v);
845 T6B = VADD(T6u, T6v);
846 }
847 {
848 V T6N, T6W, T6Q, T6X;
849 {
850 V T6L, T6M, T6O, T6P;
851 T6L = VFNMS(LDK(KP923879532), T5M, T5F);
852 T6M = VFNMS(LDK(KP923879532), T5T, T5Q);
853 T6N = VFMA(LDK(KP820678790), T6M, T6L);
854 T6W = VFNMS(LDK(KP820678790), T6L, T6M);
855 T6O = VFNMS(LDK(KP923879532), T67, T60);
856 T6P = VFNMS(LDK(KP923879532), T6e, T6b);
857 T6Q = VFMA(LDK(KP820678790), T6P, T6O);
858 T6X = VFNMS(LDK(KP820678790), T6O, T6P);
859 }
860 T6R = VADD(T6N, T6Q);
861 T76 = VSUB(T6Q, T6N);
862 T6Y = VSUB(T6W, T6X);
863 T73 = VADD(T6W, T6X);
864 }
865 {
866 V T6i, T6x, T74, T77;
867 T6i = VFNMS(LDK(KP995184726), T6h, T5A);
868 T6x = VFNMS(LDK(KP995184726), T6w, T6t);
869 ST(&(x[WS(rs, 33)]), VFNMSI(T6x, T6i), ms, &(x[WS(rs, 1)]));
870 ST(&(x[WS(rs, 31)]), VFMAI(T6x, T6i), ms, &(x[WS(rs, 1)]));
871 T74 = VFMA(LDK(KP773010453), T73, T72);
872 T77 = VFMA(LDK(KP773010453), T76, T75);
873 ST(&(x[WS(rs, 23)]), VFMAI(T77, T74), ms, &(x[WS(rs, 1)]));
874 ST(&(x[WS(rs, 41)]), VFNMSI(T77, T74), ms, &(x[WS(rs, 1)]));
875 }
876 {
877 V T78, T79, T6y, T6z;
878 T78 = VFNMS(LDK(KP773010453), T73, T72);
879 T79 = VFNMS(LDK(KP773010453), T76, T75);
880 ST(&(x[WS(rs, 9)]), VFNMSI(T79, T78), ms, &(x[WS(rs, 1)]));
881 ST(&(x[WS(rs, 55)]), VFMAI(T79, T78), ms, &(x[WS(rs, 1)]));
882 T6y = VFMA(LDK(KP995184726), T6h, T5A);
883 T6z = VFMA(LDK(KP995184726), T6w, T6t);
884 ST(&(x[WS(rs, 1)]), VFNMSI(T6z, T6y), ms, &(x[WS(rs, 1)]));
885 ST(&(x[WS(rs, 63)]), VFMAI(T6z, T6y), ms, &(x[WS(rs, 1)]));
886 }
887 {
888 V T6C, T6F, T6S, T6Z;
889 T6C = VFNMS(LDK(KP995184726), T6B, T6A);
890 T6F = VFNMS(LDK(KP995184726), T6E, T6D);
891 ST(&(x[WS(rs, 17)]), VFNMSI(T6F, T6C), ms, &(x[WS(rs, 1)]));
892 ST(&(x[WS(rs, 47)]), VFMAI(T6F, T6C), ms, &(x[WS(rs, 1)]));
893 T6S = VFNMS(LDK(KP773010453), T6R, T6K);
894 T6Z = VFNMS(LDK(KP773010453), T6Y, T6V);
895 ST(&(x[WS(rs, 25)]), VFNMSI(T6Z, T6S), ms, &(x[WS(rs, 1)]));
896 ST(&(x[WS(rs, 39)]), VFMAI(T6Z, T6S), ms, &(x[WS(rs, 1)]));
897 }
898 {
899 V T70, T71, T6G, T6H;
900 T70 = VFMA(LDK(KP773010453), T6R, T6K);
901 T71 = VFMA(LDK(KP773010453), T6Y, T6V);
902 ST(&(x[WS(rs, 57)]), VFNMSI(T71, T70), ms, &(x[WS(rs, 1)]));
903 ST(&(x[WS(rs, 7)]), VFMAI(T71, T70), ms, &(x[WS(rs, 1)]));
904 T6G = VFMA(LDK(KP995184726), T6B, T6A);
905 T6H = VFMA(LDK(KP995184726), T6E, T6D);
906 ST(&(x[WS(rs, 15)]), VFMAI(T6H, T6G), ms, &(x[WS(rs, 1)]));
907 ST(&(x[WS(rs, 49)]), VFNMSI(T6H, T6G), ms, &(x[WS(rs, 1)]));
908 }
909 }
910 }
911 }
/* Invoked once after the loop has finished; defined by the SIMD support
 * headers, not in this file. */
912 VLEAVE();
913 }
914
/*
 * Twiddle instruction table referenced by the codelet descriptor below.
 * It holds one VTW(0, k) entry for every k = 1..63, in ascending order,
 * followed by a {TW_NEXT, VL, 0} entry that closes the table.
 * NOTE(review): the order presumably has to match the W[TWVL * (2k - 2)]
 * indexing used in t2fv_64; VTW and TW_NEXT are defined outside this file.
 */
915 static const tw_instr twinstr[] = {
916 VTW(0, 1),
917 VTW(0, 2),
918 VTW(0, 3),
919 VTW(0, 4),
920 VTW(0, 5),
921 VTW(0, 6),
922 VTW(0, 7),
923 VTW(0, 8),
924 VTW(0, 9),
925 VTW(0, 10),
926 VTW(0, 11),
927 VTW(0, 12),
928 VTW(0, 13),
929 VTW(0, 14),
930 VTW(0, 15),
931 VTW(0, 16),
932 VTW(0, 17),
933 VTW(0, 18),
934 VTW(0, 19),
935 VTW(0, 20),
936 VTW(0, 21),
937 VTW(0, 22),
938 VTW(0, 23),
939 VTW(0, 24),
940 VTW(0, 25),
941 VTW(0, 26),
942 VTW(0, 27),
943 VTW(0, 28),
944 VTW(0, 29),
945 VTW(0, 30),
946 VTW(0, 31),
947 VTW(0, 32),
948 VTW(0, 33),
949 VTW(0, 34),
950 VTW(0, 35),
951 VTW(0, 36),
952 VTW(0, 37),
953 VTW(0, 38),
954 VTW(0, 39),
955 VTW(0, 40),
956 VTW(0, 41),
957 VTW(0, 42),
958 VTW(0, 43),
959 VTW(0, 44),
960 VTW(0, 45),
961 VTW(0, 46),
962 VTW(0, 47),
963 VTW(0, 48),
964 VTW(0, 49),
965 VTW(0, 50),
966 VTW(0, 51),
967 VTW(0, 52),
968 VTW(0, 53),
969 VTW(0, 54),
970 VTW(0, 55),
971 VTW(0, 56),
972 VTW(0, 57),
973 VTW(0, 58),
974 VTW(0, 59),
975 VTW(0, 60),
976 VTW(0, 61),
977 VTW(0, 62),
978 VTW(0, 63),
/* End-of-table entry. */
979 {TW_NEXT, VL, 0}
980 };
981
/*
 * Codelet descriptor handed to the registration call below.  The
 * initializer is positional: 64 (the -n 64 size given to the generator),
 * the codelet name string, the twiddle instruction table, &GENUS, and the
 * operation counts {261, 126, 258, 0}, whose first three values match the
 * additions / multiplications / fused multiply-adds quoted in the generator
 * comment for this variant.
 * NOTE(review): the meaning of the three trailing zeros is set by the
 * ct_desc declaration, which is not visible in this file - confirm there.
 */
982 static const ct_desc desc = { 64, XSIMD_STRING("t2fv_64"), twinstr, &GENUS, {261, 126, 258, 0}, 0, 0, 0 };
983
/*
 * Registration entry point for this codelet: passes the planner p, the
 * t2fv_64 function and its descriptor to X(kdft_dit_register).  This is
 * the only non-static symbol defined by this variant of the file.
 */
984 void XSIMD(codelet_t2fv_64) (planner *p) {
985 X(kdft_dit_register) (p, t2fv_64, &desc);
986 }
987 #else
988
989 /* Generated by: ../../../genfft/gen_twiddle_c.native -simd -compact -variables 4 -pipeline-latency 8 -n 64 -name t2fv_64 -include dft/simd/t2f.h */
990
991 /*
992 * This function contains 519 FP additions, 250 FP multiplications,
993 * (or, 467 additions, 198 multiplications, 52 fused multiply/add),
994 * 107 stack variables, 15 constants, and 128 memory accesses
995 */
996 #include "dft/simd/t2f.h"
997
/*
 * t2fv_64, non-FMA variant (the #else branch of the ARCH_PREFERS_FMA /
 * ISA_EXTENSION_PREFERS_FMA test): machine-generated size-64 twiddle codelet.
 *
 * On each pass of the m loop it loads the 64 vectors x[WS(rs, k)], k = 0..63,
 * runs every one except k = 0 through BYTWJ with the twiddle block at
 * W[TWVL * (2k - 2)], combines them with vector add/subtract/multiply
 * operations, and stores 64 results back to the same x[WS(rs, k)] slots.
 *
 *   ri      base pointer of the data; the only data pointer used (x = ri)
 *   ii      not referenced anywhere in this body
 *   W       twiddle table; offset for mb before the loop, then advanced by
 *           TWVL * 126 per iteration
 *   rs      stride given to WS() to address the 64 points
 *   mb, me  m runs over [mb, me) in steps of VL
 *   ms      x advances by VL * ms per iteration; also forwarded to LD/ST
 *
 * NOTE(review): the exact semantics of LD, ST, BYTWJ, VBYI, VFMA, VFNMS and
 * MAKE_VOLATILE_STRIDE are defined in the included SIMD headers, not here.
 */
998 static void t2fv_64(R *ri, R *ii, const R *W, stride rs, INT mb, INT me, INT ms)
999 {
/* Vector constants; numerically these are 1/sqrt(2) and the cos/sin pairs of k*pi/32 for k = 1..7. */
1000 DVK(KP995184726, +0.995184726672196886244836953109479921575474869);
1001 DVK(KP098017140, +0.098017140329560601994195563888641845861136673);
1002 DVK(KP773010453, +0.773010453362736960810906609758469800971041293);
1003 DVK(KP634393284, +0.634393284163645498215171613225493370675687095);
1004 DVK(KP471396736, +0.471396736825997648556387625905254377657460319);
1005 DVK(KP881921264, +0.881921264348355029712756863660388349508442621);
1006 DVK(KP290284677, +0.290284677254462367636192375817395274691476278);
1007 DVK(KP956940335, +0.956940335732208864935797886980269969482849206);
1008 DVK(KP555570233, +0.555570233019602224742830813948532874374937191);
1009 DVK(KP831469612, +0.831469612302545237078788377617905756738560812);
1010 DVK(KP195090322, +0.195090322016128267848284868477022240927691618);
1011 DVK(KP980785280, +0.980785280403230449126182236134239036973933731);
1012 DVK(KP382683432, +0.382683432365089771728459984030398866761344562);
1013 DVK(KP923879532, +0.923879532511286756128183189396788286822416626);
1014 DVK(KP707106781, +0.707106781186547524400844362104849039284835938);
1015 {
1016 INT m;
1017 R *x;
1018 x = ri;
/* One iteration handles VL values of m; x and W are stepped in the loop header. */
1019 for (m = mb, W = W + (mb * ((TWVL / VL) * 126)); m < me; m = m + VL, x = x + (VL * ms), W = W + (TWVL * 126), MAKE_VOLATILE_STRIDE(64, rs)) {
/* Temporaries that carry results from the load blocks below to the store blocks. */
1020 V Tg, T4a, T6r, T7f, T3o, T4B, T5q, T7e, T5R, T62, T28, T4o, T2g, T4l, T7n;
1021 V T7Z, T68, T6j, T2C, T4s, T3a, T4v, T7u, T82, T7E, T7F, T7V, T5F, T6u, T1k;
1022 V T4e, T1r, T4d, T7B, T7C, T7W, T5M, T6v, TV, T4g, T12, T4h, T7h, T7i, TD;
1023 V T4C, T3h, T4b, T5x, T6s, T1R, T4m, T7q, T80, T2j, T4p, T5Y, T63, T2Z, T4w;
1024 V T7x, T83, T33, T4t, T6f, T6k;
/* Load inputs 0, 32, 16, 48, 56, 24, 8, 40 (x[0] is used untwiddled) and form their sums/differences. */
1025 {
1026 V T1, T3, T3m, T3k, Tb, Td, Te, T6, T8, T9, T2, T3l, T3j;
1027 T1 = LD(&(x[0]), ms, &(x[0]));
1028 T2 = LD(&(x[WS(rs, 32)]), ms, &(x[0]));
1029 T3 = BYTWJ(&(W[TWVL * 62]), T2);
1030 T3l = LD(&(x[WS(rs, 48)]), ms, &(x[0]));
1031 T3m = BYTWJ(&(W[TWVL * 94]), T3l);
1032 T3j = LD(&(x[WS(rs, 16)]), ms, &(x[0]));
1033 T3k = BYTWJ(&(W[TWVL * 30]), T3j);
1034 {
1035 V Ta, Tc, T5, T7;
1036 Ta = LD(&(x[WS(rs, 56)]), ms, &(x[0]));
1037 Tb = BYTWJ(&(W[TWVL * 110]), Ta);
1038 Tc = LD(&(x[WS(rs, 24)]), ms, &(x[0]));
1039 Td = BYTWJ(&(W[TWVL * 46]), Tc);
1040 Te = VSUB(Tb, Td);
1041 T5 = LD(&(x[WS(rs, 8)]), ms, &(x[0]));
1042 T6 = BYTWJ(&(W[TWVL * 14]), T5);
1043 T7 = LD(&(x[WS(rs, 40)]), ms, &(x[0]));
1044 T8 = BYTWJ(&(W[TWVL * 78]), T7);
1045 T9 = VSUB(T6, T8);
1046 }
1047 {
1048 V T4, Tf, T6p, T6q;
1049 T4 = VSUB(T1, T3);
1050 Tf = VMUL(LDK(KP707106781), VADD(T9, Te));
1051 Tg = VADD(T4, Tf);
1052 T4a = VSUB(T4, Tf);
1053 T6p = VADD(Tb, Td);
1054 T6q = VADD(T6, T8);
1055 T6r = VSUB(T6p, T6q);
1056 T7f = VADD(T6q, T6p);
1057 }
1058 {
1059 V T3i, T3n, T5o, T5p;
1060 T3i = VMUL(LDK(KP707106781), VSUB(Te, T9));
1061 T3n = VSUB(T3k, T3m);
1062 T3o = VSUB(T3i, T3n);
1063 T4B = VADD(T3n, T3i);
1064 T5o = VADD(T1, T3);
1065 T5p = VADD(T3k, T3m);
1066 T5q = VSUB(T5o, T5p);
1067 T7e = VADD(T5o, T5p);
1068 }
1069 }
/* Load and twiddle inputs 17, 49, 1, 33, 57, 25, 9, 41. */
1070 {
1071 V T24, T26, T5Q, T2b, T2d, T5P, T1W, T60, T21, T61, T22, T27;
1072 {
1073 V T23, T25, T2a, T2c;
1074 T23 = LD(&(x[WS(rs, 17)]), ms, &(x[WS(rs, 1)]));
1075 T24 = BYTWJ(&(W[TWVL * 32]), T23);
1076 T25 = LD(&(x[WS(rs, 49)]), ms, &(x[WS(rs, 1)]));
1077 T26 = BYTWJ(&(W[TWVL * 96]), T25);
1078 T5Q = VADD(T24, T26);
1079 T2a = LD(&(x[WS(rs, 1)]), ms, &(x[WS(rs, 1)]));
1080 T2b = BYTWJ(&(W[0]), T2a);
1081 T2c = LD(&(x[WS(rs, 33)]), ms, &(x[WS(rs, 1)]));
1082 T2d = BYTWJ(&(W[TWVL * 64]), T2c);
1083 T5P = VADD(T2b, T2d);
1084 }
1085 {
1086 V T1T, T1V, T1S, T1U;
1087 T1S = LD(&(x[WS(rs, 57)]), ms, &(x[WS(rs, 1)]));
1088 T1T = BYTWJ(&(W[TWVL * 112]), T1S);
1089 T1U = LD(&(x[WS(rs, 25)]), ms, &(x[WS(rs, 1)]));
1090 T1V = BYTWJ(&(W[TWVL * 48]), T1U);
1091 T1W = VSUB(T1T, T1V);
1092 T60 = VADD(T1T, T1V);
1093 }
1094 {
1095 V T1Y, T20, T1X, T1Z;
1096 T1X = LD(&(x[WS(rs, 9)]), ms, &(x[WS(rs, 1)]));
1097 T1Y = BYTWJ(&(W[TWVL * 16]), T1X);
1098 T1Z = LD(&(x[WS(rs, 41)]), ms, &(x[WS(rs, 1)]));
1099 T20 = BYTWJ(&(W[TWVL * 80]), T1Z);
1100 T21 = VSUB(T1Y, T20);
1101 T61 = VADD(T1Y, T20);
1102 }
1103 T5R = VSUB(T5P, T5Q);
1104 T62 = VSUB(T60, T61);
1105 T22 = VMUL(LDK(KP707106781), VSUB(T1W, T21));
1106 T27 = VSUB(T24, T26);
1107 T28 = VSUB(T22, T27);
1108 T4o = VADD(T27, T22);
1109 {
1110 V T2e, T2f, T7l, T7m;
1111 T2e = VSUB(T2b, T2d);
1112 T2f = VMUL(LDK(KP707106781), VADD(T21, T1W));
1113 T2g = VADD(T2e, T2f);
1114 T4l = VSUB(T2e, T2f);
1115 T7l = VADD(T5P, T5Q);
1116 T7m = VADD(T61, T60);
1117 T7n = VADD(T7l, T7m);
1118 T7Z = VSUB(T7l, T7m);
1119 }
1120 }
/* Load and twiddle inputs 63, 31, 15, 47, 7, 39, 55, 23. */
1121 {
1122 V T2n, T2p, T66, T36, T38, T67, T2v, T6i, T2A, T6h, T2q, T2B;
1123 {
1124 V T2m, T2o, T35, T37;
1125 T2m = LD(&(x[WS(rs, 63)]), ms, &(x[WS(rs, 1)]));
1126 T2n = BYTWJ(&(W[TWVL * 124]), T2m);
1127 T2o = LD(&(x[WS(rs, 31)]), ms, &(x[WS(rs, 1)]));
1128 T2p = BYTWJ(&(W[TWVL * 60]), T2o);
1129 T66 = VADD(T2n, T2p);
1130 T35 = LD(&(x[WS(rs, 15)]), ms, &(x[WS(rs, 1)]));
1131 T36 = BYTWJ(&(W[TWVL * 28]), T35);
1132 T37 = LD(&(x[WS(rs, 47)]), ms, &(x[WS(rs, 1)]));
1133 T38 = BYTWJ(&(W[TWVL * 92]), T37);
1134 T67 = VADD(T36, T38);
1135 }
1136 {
1137 V T2s, T2u, T2r, T2t;
1138 T2r = LD(&(x[WS(rs, 7)]), ms, &(x[WS(rs, 1)]));
1139 T2s = BYTWJ(&(W[TWVL * 12]), T2r);
1140 T2t = LD(&(x[WS(rs, 39)]), ms, &(x[WS(rs, 1)]));
1141 T2u = BYTWJ(&(W[TWVL * 76]), T2t);
1142 T2v = VSUB(T2s, T2u);
1143 T6i = VADD(T2s, T2u);
1144 }
1145 {
1146 V T2x, T2z, T2w, T2y;
1147 T2w = LD(&(x[WS(rs, 55)]), ms, &(x[WS(rs, 1)]));
1148 T2x = BYTWJ(&(W[TWVL * 108]), T2w);
1149 T2y = LD(&(x[WS(rs, 23)]), ms, &(x[WS(rs, 1)]));
1150 T2z = BYTWJ(&(W[TWVL * 44]), T2y);
1151 T2A = VSUB(T2x, T2z);
1152 T6h = VADD(T2x, T2z);
1153 }
1154 T68 = VSUB(T66, T67);
1155 T6j = VSUB(T6h, T6i);
1156 T2q = VSUB(T2n, T2p);
1157 T2B = VMUL(LDK(KP707106781), VADD(T2v, T2A));
1158 T2C = VADD(T2q, T2B);
1159 T4s = VSUB(T2q, T2B);
1160 {
1161 V T34, T39, T7s, T7t;
1162 T34 = VMUL(LDK(KP707106781), VSUB(T2A, T2v));
1163 T39 = VSUB(T36, T38);
1164 T3a = VSUB(T34, T39);
1165 T4v = VADD(T39, T34);
1166 T7s = VADD(T66, T67);
1167 T7t = VADD(T6i, T6h);
1168 T7u = VADD(T7s, T7t);
1169 T82 = VSUB(T7s, T7t);
1170 }
1171 }
/* Load and twiddle inputs 18, 50, 2, 34, 58, 26, 10, 42. */
1172 {
1173 V T1g, T1i, T5A, T1m, T1o, T5z, T18, T5C, T1d, T5D, T5B, T5E;
1174 {
1175 V T1f, T1h, T1l, T1n;
1176 T1f = LD(&(x[WS(rs, 18)]), ms, &(x[0]));
1177 T1g = BYTWJ(&(W[TWVL * 34]), T1f);
1178 T1h = LD(&(x[WS(rs, 50)]), ms, &(x[0]));
1179 T1i = BYTWJ(&(W[TWVL * 98]), T1h);
1180 T5A = VADD(T1g, T1i);
1181 T1l = LD(&(x[WS(rs, 2)]), ms, &(x[0]));
1182 T1m = BYTWJ(&(W[TWVL * 2]), T1l);
1183 T1n = LD(&(x[WS(rs, 34)]), ms, &(x[0]));
1184 T1o = BYTWJ(&(W[TWVL * 66]), T1n);
1185 T5z = VADD(T1m, T1o);
1186 }
1187 {
1188 V T15, T17, T14, T16;
1189 T14 = LD(&(x[WS(rs, 58)]), ms, &(x[0]));
1190 T15 = BYTWJ(&(W[TWVL * 114]), T14);
1191 T16 = LD(&(x[WS(rs, 26)]), ms, &(x[0]));
1192 T17 = BYTWJ(&(W[TWVL * 50]), T16);
1193 T18 = VSUB(T15, T17);
1194 T5C = VADD(T15, T17);
1195 }
1196 {
1197 V T1a, T1c, T19, T1b;
1198 T19 = LD(&(x[WS(rs, 10)]), ms, &(x[0]));
1199 T1a = BYTWJ(&(W[TWVL * 18]), T19);
1200 T1b = LD(&(x[WS(rs, 42)]), ms, &(x[0]));
1201 T1c = BYTWJ(&(W[TWVL * 82]), T1b);
1202 T1d = VSUB(T1a, T1c);
1203 T5D = VADD(T1a, T1c);
1204 }
1205 T7E = VADD(T5z, T5A);
1206 T7F = VADD(T5D, T5C);
1207 T7V = VSUB(T7E, T7F);
1208 T5B = VSUB(T5z, T5A);
1209 T5E = VSUB(T5C, T5D);
1210 T5F = VFMA(LDK(KP923879532), T5B, VMUL(LDK(KP382683432), T5E));
1211 T6u = VFNMS(LDK(KP382683432), T5B, VMUL(LDK(KP923879532), T5E));
1212 {
1213 V T1e, T1j, T1p, T1q;
1214 T1e = VMUL(LDK(KP707106781), VSUB(T18, T1d));
1215 T1j = VSUB(T1g, T1i);
1216 T1k = VSUB(T1e, T1j);
1217 T4e = VADD(T1j, T1e);
1218 T1p = VSUB(T1m, T1o);
1219 T1q = VMUL(LDK(KP707106781), VADD(T1d, T18));
1220 T1r = VADD(T1p, T1q);
1221 T4d = VSUB(T1p, T1q);
1222 }
1223 }
/* Load and twiddle inputs 62, 30, 14, 46, 6, 38, 54, 22. */
1224 {
1225 V TG, TI, T5G, TY, T10, T5H, TO, T5K, TT, T5J, T5I, T5L;
1226 {
1227 V TF, TH, TX, TZ;
1228 TF = LD(&(x[WS(rs, 62)]), ms, &(x[0]));
1229 TG = BYTWJ(&(W[TWVL * 122]), TF);
1230 TH = LD(&(x[WS(rs, 30)]), ms, &(x[0]));
1231 TI = BYTWJ(&(W[TWVL * 58]), TH);
1232 T5G = VADD(TG, TI);
1233 TX = LD(&(x[WS(rs, 14)]), ms, &(x[0]));
1234 TY = BYTWJ(&(W[TWVL * 26]), TX);
1235 TZ = LD(&(x[WS(rs, 46)]), ms, &(x[0]));
1236 T10 = BYTWJ(&(W[TWVL * 90]), TZ);
1237 T5H = VADD(TY, T10);
1238 }
1239 {
1240 V TL, TN, TK, TM;
1241 TK = LD(&(x[WS(rs, 6)]), ms, &(x[0]));
1242 TL = BYTWJ(&(W[TWVL * 10]), TK);
1243 TM = LD(&(x[WS(rs, 38)]), ms, &(x[0]));
1244 TN = BYTWJ(&(W[TWVL * 74]), TM);
1245 TO = VSUB(TL, TN);
1246 T5K = VADD(TL, TN);
1247 }
1248 {
1249 V TQ, TS, TP, TR;
1250 TP = LD(&(x[WS(rs, 54)]), ms, &(x[0]));
1251 TQ = BYTWJ(&(W[TWVL * 106]), TP);
1252 TR = LD(&(x[WS(rs, 22)]), ms, &(x[0]));
1253 TS = BYTWJ(&(W[TWVL * 42]), TR);
1254 TT = VSUB(TQ, TS);
1255 T5J = VADD(TQ, TS);
1256 }
1257 T7B = VADD(T5G, T5H);
1258 T7C = VADD(T5K, T5J);
1259 T7W = VSUB(T7B, T7C);
1260 T5I = VSUB(T5G, T5H);
1261 T5L = VSUB(T5J, T5K);
1262 T5M = VFNMS(LDK(KP382683432), T5L, VMUL(LDK(KP923879532), T5I));
1263 T6v = VFMA(LDK(KP382683432), T5I, VMUL(LDK(KP923879532), T5L));
1264 {
1265 V TJ, TU, TW, T11;
1266 TJ = VSUB(TG, TI);
1267 TU = VMUL(LDK(KP707106781), VADD(TO, TT));
1268 TV = VADD(TJ, TU);
1269 T4g = VSUB(TJ, TU);
1270 TW = VMUL(LDK(KP707106781), VSUB(TT, TO));
1271 T11 = VSUB(TY, T10);
1272 T12 = VSUB(TW, T11);
1273 T4h = VADD(T11, TW);
1274 }
1275 }
/* Load and twiddle inputs 4, 36, 12, 44, 20, 52, 60, 28. */
1276 {
1277 V Tl, T5r, TB, T5v, Tq, T5s, Tw, T5u, Tr, TC;
1278 {
1279 V Ti, Tk, Th, Tj;
1280 Th = LD(&(x[WS(rs, 4)]), ms, &(x[0]));
1281 Ti = BYTWJ(&(W[TWVL * 6]), Th);
1282 Tj = LD(&(x[WS(rs, 36)]), ms, &(x[0]));
1283 Tk = BYTWJ(&(W[TWVL * 70]), Tj);
1284 Tl = VSUB(Ti, Tk);
1285 T5r = VADD(Ti, Tk);
1286 }
1287 {
1288 V Ty, TA, Tx, Tz;
1289 Tx = LD(&(x[WS(rs, 12)]), ms, &(x[0]));
1290 Ty = BYTWJ(&(W[TWVL * 22]), Tx);
1291 Tz = LD(&(x[WS(rs, 44)]), ms, &(x[0]));
1292 TA = BYTWJ(&(W[TWVL * 86]), Tz);
1293 TB = VSUB(Ty, TA);
1294 T5v = VADD(Ty, TA);
1295 }
1296 {
1297 V Tn, Tp, Tm, To;
1298 Tm = LD(&(x[WS(rs, 20)]), ms, &(x[0]));
1299 Tn = BYTWJ(&(W[TWVL * 38]), Tm);
1300 To = LD(&(x[WS(rs, 52)]), ms, &(x[0]));
1301 Tp = BYTWJ(&(W[TWVL * 102]), To);
1302 Tq = VSUB(Tn, Tp);
1303 T5s = VADD(Tn, Tp);
1304 }
1305 {
1306 V Tt, Tv, Ts, Tu;
1307 Ts = LD(&(x[WS(rs, 60)]), ms, &(x[0]));
1308 Tt = BYTWJ(&(W[TWVL * 118]), Ts);
1309 Tu = LD(&(x[WS(rs, 28)]), ms, &(x[0]));
1310 Tv = BYTWJ(&(W[TWVL * 54]), Tu);
1311 Tw = VSUB(Tt, Tv);
1312 T5u = VADD(Tt, Tv);
1313 }
1314 T7h = VADD(T5r, T5s);
1315 T7i = VADD(T5u, T5v);
1316 Tr = VFNMS(LDK(KP382683432), Tq, VMUL(LDK(KP923879532), Tl));
1317 TC = VFMA(LDK(KP923879532), Tw, VMUL(LDK(KP382683432), TB));
1318 TD = VADD(Tr, TC);
1319 T4C = VSUB(TC, Tr);
1320 {
1321 V T3f, T3g, T5t, T5w;
1322 T3f = VFNMS(LDK(KP923879532), TB, VMUL(LDK(KP382683432), Tw));
1323 T3g = VFMA(LDK(KP382683432), Tl, VMUL(LDK(KP923879532), Tq));
1324 T3h = VSUB(T3f, T3g);
1325 T4b = VADD(T3g, T3f);
1326 T5t = VSUB(T5r, T5s);
1327 T5w = VSUB(T5u, T5v);
1328 T5x = VMUL(LDK(KP707106781), VADD(T5t, T5w));
1329 T6s = VMUL(LDK(KP707106781), VSUB(T5w, T5t));
1330 }
1331 }
/* Load and twiddle inputs 61, 29, 21, 53, 13, 45, 5, 37. */
1332 {
1333 V T1z, T5V, T1P, T5T, T1E, T5W, T1K, T5S;
1334 {
1335 V T1w, T1y, T1v, T1x;
1336 T1v = LD(&(x[WS(rs, 61)]), ms, &(x[WS(rs, 1)]));
1337 T1w = BYTWJ(&(W[TWVL * 120]), T1v);
1338 T1x = LD(&(x[WS(rs, 29)]), ms, &(x[WS(rs, 1)]));
1339 T1y = BYTWJ(&(W[TWVL * 56]), T1x);
1340 T1z = VSUB(T1w, T1y);
1341 T5V = VADD(T1w, T1y);
1342 }
1343 {
1344 V T1M, T1O, T1L, T1N;
1345 T1L = LD(&(x[WS(rs, 21)]), ms, &(x[WS(rs, 1)]));
1346 T1M = BYTWJ(&(W[TWVL * 40]), T1L);
1347 T1N = LD(&(x[WS(rs, 53)]), ms, &(x[WS(rs, 1)]));
1348 T1O = BYTWJ(&(W[TWVL * 104]), T1N);
1349 T1P = VSUB(T1M, T1O);
1350 T5T = VADD(T1M, T1O);
1351 }
1352 {
1353 V T1B, T1D, T1A, T1C;
1354 T1A = LD(&(x[WS(rs, 13)]), ms, &(x[WS(rs, 1)]));
1355 T1B = BYTWJ(&(W[TWVL * 24]), T1A);
1356 T1C = LD(&(x[WS(rs, 45)]), ms, &(x[WS(rs, 1)]));
1357 T1D = BYTWJ(&(W[TWVL * 88]), T1C);
1358 T1E = VSUB(T1B, T1D);
1359 T5W = VADD(T1B, T1D);
1360 }
1361 {
1362 V T1H, T1J, T1G, T1I;
1363 T1G = LD(&(x[WS(rs, 5)]), ms, &(x[WS(rs, 1)]));
1364 T1H = BYTWJ(&(W[TWVL * 8]), T1G);
1365 T1I = LD(&(x[WS(rs, 37)]), ms, &(x[WS(rs, 1)]));
1366 T1J = BYTWJ(&(W[TWVL * 72]), T1I);
1367 T1K = VSUB(T1H, T1J);
1368 T5S = VADD(T1H, T1J);
1369 }
1370 {
1371 V T1F, T1Q, T7o, T7p;
1372 T1F = VFNMS(LDK(KP923879532), T1E, VMUL(LDK(KP382683432), T1z));
1373 T1Q = VFMA(LDK(KP382683432), T1K, VMUL(LDK(KP923879532), T1P));
1374 T1R = VSUB(T1F, T1Q);
1375 T4m = VADD(T1Q, T1F);
1376 T7o = VADD(T5S, T5T);
1377 T7p = VADD(T5V, T5W);
1378 T7q = VADD(T7o, T7p);
1379 T80 = VSUB(T7p, T7o);
1380 }
1381 {
1382 V T2h, T2i, T5U, T5X;
1383 T2h = VFNMS(LDK(KP382683432), T1P, VMUL(LDK(KP923879532), T1K));
1384 T2i = VFMA(LDK(KP923879532), T1z, VMUL(LDK(KP382683432), T1E));
1385 T2j = VADD(T2h, T2i);
1386 T4p = VSUB(T2i, T2h);
1387 T5U = VSUB(T5S, T5T);
1388 T5X = VSUB(T5V, T5W);
1389 T5Y = VMUL(LDK(KP707106781), VADD(T5U, T5X));
1390 T63 = VMUL(LDK(KP707106781), VSUB(T5X, T5U));
1391 }
1392 }
/* Load and twiddle inputs 3, 35, 11, 43, 19, 51, 59, 27. */
1393 {
1394 V T2H, T69, T2X, T6d, T2M, T6a, T2S, T6c;
1395 {
1396 V T2E, T2G, T2D, T2F;
1397 T2D = LD(&(x[WS(rs, 3)]), ms, &(x[WS(rs, 1)]));
1398 T2E = BYTWJ(&(W[TWVL * 4]), T2D);
1399 T2F = LD(&(x[WS(rs, 35)]), ms, &(x[WS(rs, 1)]));
1400 T2G = BYTWJ(&(W[TWVL * 68]), T2F);
1401 T2H = VSUB(T2E, T2G);
1402 T69 = VADD(T2E, T2G);
1403 }
1404 {
1405 V T2U, T2W, T2T, T2V;
1406 T2T = LD(&(x[WS(rs, 11)]), ms, &(x[WS(rs, 1)]));
1407 T2U = BYTWJ(&(W[TWVL * 20]), T2T);
1408 T2V = LD(&(x[WS(rs, 43)]), ms, &(x[WS(rs, 1)]));
1409 T2W = BYTWJ(&(W[TWVL * 84]), T2V);
1410 T2X = VSUB(T2U, T2W);
1411 T6d = VADD(T2U, T2W);
1412 }
1413 {
1414 V T2J, T2L, T2I, T2K;
1415 T2I = LD(&(x[WS(rs, 19)]), ms, &(x[WS(rs, 1)]));
1416 T2J = BYTWJ(&(W[TWVL * 36]), T2I);
1417 T2K = LD(&(x[WS(rs, 51)]), ms, &(x[WS(rs, 1)]));
1418 T2L = BYTWJ(&(W[TWVL * 100]), T2K);
1419 T2M = VSUB(T2J, T2L);
1420 T6a = VADD(T2J, T2L);
1421 }
1422 {
1423 V T2P, T2R, T2O, T2Q;
1424 T2O = LD(&(x[WS(rs, 59)]), ms, &(x[WS(rs, 1)]));
1425 T2P = BYTWJ(&(W[TWVL * 116]), T2O);
1426 T2Q = LD(&(x[WS(rs, 27)]), ms, &(x[WS(rs, 1)]));
1427 T2R = BYTWJ(&(W[TWVL * 52]), T2Q);
1428 T2S = VSUB(T2P, T2R);
1429 T6c = VADD(T2P, T2R);
1430 }
1431 {
1432 V T2N, T2Y, T7v, T7w;
1433 T2N = VFNMS(LDK(KP382683432), T2M, VMUL(LDK(KP923879532), T2H));
1434 T2Y = VFMA(LDK(KP923879532), T2S, VMUL(LDK(KP382683432), T2X));
1435 T2Z = VADD(T2N, T2Y);
1436 T4w = VSUB(T2Y, T2N);
1437 T7v = VADD(T69, T6a);
1438 T7w = VADD(T6c, T6d);
1439 T7x = VADD(T7v, T7w);
1440 T83 = VSUB(T7w, T7v);
1441 }
1442 {
1443 V T31, T32, T6b, T6e;
1444 T31 = VFNMS(LDK(KP923879532), T2X, VMUL(LDK(KP382683432), T2S));
1445 T32 = VFMA(LDK(KP382683432), T2H, VMUL(LDK(KP923879532), T2M));
1446 T33 = VSUB(T31, T32);
1447 T4t = VADD(T32, T31);
1448 T6b = VSUB(T69, T6a);
1449 T6e = VSUB(T6c, T6d);
1450 T6f = VMUL(LDK(KP707106781), VADD(T6b, T6e));
1451 T6k = VMUL(LDK(KP707106781), VSUB(T6e, T6b));
1452 }
1453 }
/* All 64 inputs are now loaded; from here on only stores touch x. Outputs 0, 8, 16, 24, 32, 40, 48, 56. */
1454 {
1455 V T7k, T7M, T7R, T7T, T7z, T7I, T7H, T7N, T7O, T7S;
1456 {
1457 V T7g, T7j, T7P, T7Q;
1458 T7g = VADD(T7e, T7f);
1459 T7j = VADD(T7h, T7i);
1460 T7k = VSUB(T7g, T7j);
1461 T7M = VADD(T7g, T7j);
1462 T7P = VADD(T7n, T7q);
1463 T7Q = VADD(T7u, T7x);
1464 T7R = VADD(T7P, T7Q);
1465 T7T = VBYI(VSUB(T7Q, T7P));
1466 }
1467 {
1468 V T7r, T7y, T7D, T7G;
1469 T7r = VSUB(T7n, T7q);
1470 T7y = VSUB(T7u, T7x);
1471 T7z = VMUL(LDK(KP707106781), VADD(T7r, T7y));
1472 T7I = VMUL(LDK(KP707106781), VSUB(T7y, T7r));
1473 T7D = VADD(T7B, T7C);
1474 T7G = VADD(T7E, T7F);
1475 T7H = VSUB(T7D, T7G);
1476 T7N = VADD(T7G, T7D);
1477 }
1478 T7O = VADD(T7M, T7N);
1479 ST(&(x[WS(rs, 32)]), VSUB(T7O, T7R), ms, &(x[0]));
1480 ST(&(x[0]), VADD(T7O, T7R), ms, &(x[0]));
1481 T7S = VSUB(T7M, T7N);
1482 ST(&(x[WS(rs, 48)]), VSUB(T7S, T7T), ms, &(x[0]));
1483 ST(&(x[WS(rs, 16)]), VADD(T7S, T7T), ms, &(x[0]));
1484 {
1485 V T7A, T7J, T7K, T7L;
1486 T7A = VADD(T7k, T7z);
1487 T7J = VBYI(VADD(T7H, T7I));
1488 ST(&(x[WS(rs, 56)]), VSUB(T7A, T7J), ms, &(x[0]));
1489 ST(&(x[WS(rs, 8)]), VADD(T7A, T7J), ms, &(x[0]));
1490 T7K = VSUB(T7k, T7z);
1491 T7L = VBYI(VSUB(T7I, T7H));
1492 ST(&(x[WS(rs, 40)]), VSUB(T7K, T7L), ms, &(x[0]));
1493 ST(&(x[WS(rs, 24)]), VADD(T7K, T7L), ms, &(x[0]));
1494 }
1495 }
/* Outputs 4, 12, 20, 28, 36, 44, 52, 60. */
1496 {
1497 V T7Y, T8j, T8c, T8k, T85, T8g, T89, T8h;
1498 {
1499 V T7U, T7X, T8a, T8b;
1500 T7U = VSUB(T7e, T7f);
1501 T7X = VMUL(LDK(KP707106781), VADD(T7V, T7W));
1502 T7Y = VADD(T7U, T7X);
1503 T8j = VSUB(T7U, T7X);
1504 T8a = VFNMS(LDK(KP382683432), T7Z, VMUL(LDK(KP923879532), T80));
1505 T8b = VFMA(LDK(KP382683432), T82, VMUL(LDK(KP923879532), T83));
1506 T8c = VADD(T8a, T8b);
1507 T8k = VSUB(T8b, T8a);
1508 }
1509 {
1510 V T81, T84, T87, T88;
1511 T81 = VFMA(LDK(KP923879532), T7Z, VMUL(LDK(KP382683432), T80));
1512 T84 = VFNMS(LDK(KP382683432), T83, VMUL(LDK(KP923879532), T82));
1513 T85 = VADD(T81, T84);
1514 T8g = VSUB(T84, T81);
1515 T87 = VSUB(T7i, T7h);
1516 T88 = VMUL(LDK(KP707106781), VSUB(T7W, T7V));
1517 T89 = VADD(T87, T88);
1518 T8h = VSUB(T88, T87);
1519 }
1520 {
1521 V T86, T8d, T8m, T8n;
1522 T86 = VADD(T7Y, T85);
1523 T8d = VBYI(VADD(T89, T8c));
1524 ST(&(x[WS(rs, 60)]), VSUB(T86, T8d), ms, &(x[0]));
1525 ST(&(x[WS(rs, 4)]), VADD(T86, T8d), ms, &(x[0]));
1526 T8m = VBYI(VADD(T8h, T8g));
1527 T8n = VADD(T8j, T8k);
1528 ST(&(x[WS(rs, 12)]), VADD(T8m, T8n), ms, &(x[0]));
1529 ST(&(x[WS(rs, 52)]), VSUB(T8n, T8m), ms, &(x[0]));
1530 }
1531 {
1532 V T8e, T8f, T8i, T8l;
1533 T8e = VSUB(T7Y, T85);
1534 T8f = VBYI(VSUB(T8c, T89));
1535 ST(&(x[WS(rs, 36)]), VSUB(T8e, T8f), ms, &(x[0]));
1536 ST(&(x[WS(rs, 28)]), VADD(T8e, T8f), ms, &(x[0]));
1537 T8i = VBYI(VSUB(T8g, T8h));
1538 T8l = VSUB(T8j, T8k);
1539 ST(&(x[WS(rs, 20)]), VADD(T8i, T8l), ms, &(x[0]));
1540 ST(&(x[WS(rs, 44)]), VSUB(T8l, T8i), ms, &(x[0]));
1541 }
1542 }
/* Outputs 2, 14, 18, 30, 34, 46, 50, 62. */
1543 {
1544 V T5O, T6H, T6x, T6F, T6n, T6I, T6A, T6E;
1545 {
1546 V T5y, T5N, T6t, T6w;
1547 T5y = VADD(T5q, T5x);
1548 T5N = VADD(T5F, T5M);
1549 T5O = VADD(T5y, T5N);
1550 T6H = VSUB(T5y, T5N);
1551 T6t = VADD(T6r, T6s);
1552 T6w = VADD(T6u, T6v);
1553 T6x = VADD(T6t, T6w);
1554 T6F = VSUB(T6w, T6t);
1555 {
1556 V T65, T6y, T6m, T6z;
1557 {
1558 V T5Z, T64, T6g, T6l;
1559 T5Z = VADD(T5R, T5Y);
1560 T64 = VADD(T62, T63);
1561 T65 = VFMA(LDK(KP980785280), T5Z, VMUL(LDK(KP195090322), T64));
1562 T6y = VFNMS(LDK(KP195090322), T5Z, VMUL(LDK(KP980785280), T64));
1563 T6g = VADD(T68, T6f);
1564 T6l = VADD(T6j, T6k);
1565 T6m = VFNMS(LDK(KP195090322), T6l, VMUL(LDK(KP980785280), T6g));
1566 T6z = VFMA(LDK(KP195090322), T6g, VMUL(LDK(KP980785280), T6l));
1567 }
1568 T6n = VADD(T65, T6m);
1569 T6I = VSUB(T6z, T6y);
1570 T6A = VADD(T6y, T6z);
1571 T6E = VSUB(T6m, T65);
1572 }
1573 }
1574 {
1575 V T6o, T6B, T6K, T6L;
1576 T6o = VADD(T5O, T6n);
1577 T6B = VBYI(VADD(T6x, T6A));
1578 ST(&(x[WS(rs, 62)]), VSUB(T6o, T6B), ms, &(x[0]));
1579 ST(&(x[WS(rs, 2)]), VADD(T6o, T6B), ms, &(x[0]));
1580 T6K = VBYI(VADD(T6F, T6E));
1581 T6L = VADD(T6H, T6I);
1582 ST(&(x[WS(rs, 14)]), VADD(T6K, T6L), ms, &(x[0]));
1583 ST(&(x[WS(rs, 50)]), VSUB(T6L, T6K), ms, &(x[0]));
1584 }
1585 {
1586 V T6C, T6D, T6G, T6J;
1587 T6C = VSUB(T5O, T6n);
1588 T6D = VBYI(VSUB(T6A, T6x));
1589 ST(&(x[WS(rs, 34)]), VSUB(T6C, T6D), ms, &(x[0]));
1590 ST(&(x[WS(rs, 30)]), VADD(T6C, T6D), ms, &(x[0]));
1591 T6G = VBYI(VSUB(T6E, T6F));
1592 T6J = VSUB(T6H, T6I);
1593 ST(&(x[WS(rs, 18)]), VADD(T6G, T6J), ms, &(x[0]));
1594 ST(&(x[WS(rs, 46)]), VSUB(T6J, T6G), ms, &(x[0]));
1595 }
1596 }
/* Outputs 6, 10, 22, 26, 38, 42, 54, 58. */
1597 {
1598 V T6O, T79, T6Z, T77, T6V, T7a, T72, T76;
1599 {
1600 V T6M, T6N, T6X, T6Y;
1601 T6M = VSUB(T5q, T5x);
1602 T6N = VSUB(T6v, T6u);
1603 T6O = VADD(T6M, T6N);
1604 T79 = VSUB(T6M, T6N);
1605 T6X = VSUB(T6s, T6r);
1606 T6Y = VSUB(T5M, T5F);
1607 T6Z = VADD(T6X, T6Y);
1608 T77 = VSUB(T6Y, T6X);
1609 {
1610 V T6R, T70, T6U, T71;
1611 {
1612 V T6P, T6Q, T6S, T6T;
1613 T6P = VSUB(T5R, T5Y);
1614 T6Q = VSUB(T63, T62);
1615 T6R = VFMA(LDK(KP831469612), T6P, VMUL(LDK(KP555570233), T6Q));
1616 T70 = VFNMS(LDK(KP555570233), T6P, VMUL(LDK(KP831469612), T6Q));
1617 T6S = VSUB(T68, T6f);
1618 T6T = VSUB(T6k, T6j);
1619 T6U = VFNMS(LDK(KP555570233), T6T, VMUL(LDK(KP831469612), T6S));
1620 T71 = VFMA(LDK(KP555570233), T6S, VMUL(LDK(KP831469612), T6T));
1621 }
1622 T6V = VADD(T6R, T6U);
1623 T7a = VSUB(T71, T70);
1624 T72 = VADD(T70, T71);
1625 T76 = VSUB(T6U, T6R);
1626 }
1627 }
1628 {
1629 V T6W, T73, T7c, T7d;
1630 T6W = VADD(T6O, T6V);
1631 T73 = VBYI(VADD(T6Z, T72));
1632 ST(&(x[WS(rs, 58)]), VSUB(T6W, T73), ms, &(x[0]));
1633 ST(&(x[WS(rs, 6)]), VADD(T6W, T73), ms, &(x[0]));
1634 T7c = VBYI(VADD(T77, T76));
1635 T7d = VADD(T79, T7a);
1636 ST(&(x[WS(rs, 10)]), VADD(T7c, T7d), ms, &(x[0]));
1637 ST(&(x[WS(rs, 54)]), VSUB(T7d, T7c), ms, &(x[0]));
1638 }
1639 {
1640 V T74, T75, T78, T7b;
1641 T74 = VSUB(T6O, T6V);
1642 T75 = VBYI(VSUB(T72, T6Z));
1643 ST(&(x[WS(rs, 38)]), VSUB(T74, T75), ms, &(x[0]));
1644 ST(&(x[WS(rs, 26)]), VADD(T74, T75), ms, &(x[0]));
1645 T78 = VBYI(VSUB(T76, T77));
1646 T7b = VSUB(T79, T7a);
1647 ST(&(x[WS(rs, 22)]), VADD(T78, T7b), ms, &(x[0]));
1648 ST(&(x[WS(rs, 42)]), VSUB(T7b, T78), ms, &(x[0]));
1649 }
1650 }
/* Outputs 3, 5, 11, 13, 19, 21, 27, 29, 35, 37, 43, 45, 51, 53, 59, 61. */
1651 {
1652 V T4k, T5h, T4R, T59, T4H, T5j, T4P, T4Y, T4z, T4S, T4K, T4O, T55, T5k, T5c;
1653 V T5g;
1654 {
1655 V T4c, T57, T4j, T58, T4f, T4i;
1656 T4c = VADD(T4a, T4b);
1657 T57 = VSUB(T4C, T4B);
1658 T4f = VFMA(LDK(KP831469612), T4d, VMUL(LDK(KP555570233), T4e));
1659 T4i = VFNMS(LDK(KP555570233), T4h, VMUL(LDK(KP831469612), T4g));
1660 T4j = VADD(T4f, T4i);
1661 T58 = VSUB(T4i, T4f);
1662 T4k = VADD(T4c, T4j);
1663 T5h = VSUB(T58, T57);
1664 T4R = VSUB(T4c, T4j);
1665 T59 = VADD(T57, T58);
1666 }
1667 {
1668 V T4D, T4W, T4G, T4X, T4E, T4F;
1669 T4D = VADD(T4B, T4C);
1670 T4W = VSUB(T4a, T4b);
1671 T4E = VFNMS(LDK(KP555570233), T4d, VMUL(LDK(KP831469612), T4e));
1672 T4F = VFMA(LDK(KP555570233), T4g, VMUL(LDK(KP831469612), T4h));
1673 T4G = VADD(T4E, T4F);
1674 T4X = VSUB(T4F, T4E);
1675 T4H = VADD(T4D, T4G);
1676 T5j = VSUB(T4W, T4X);
1677 T4P = VSUB(T4G, T4D);
1678 T4Y = VADD(T4W, T4X);
1679 }
1680 {
1681 V T4r, T4I, T4y, T4J;
1682 {
1683 V T4n, T4q, T4u, T4x;
1684 T4n = VADD(T4l, T4m);
1685 T4q = VADD(T4o, T4p);
1686 T4r = VFMA(LDK(KP956940335), T4n, VMUL(LDK(KP290284677), T4q));
1687 T4I = VFNMS(LDK(KP290284677), T4n, VMUL(LDK(KP956940335), T4q));
1688 T4u = VADD(T4s, T4t);
1689 T4x = VADD(T4v, T4w);
1690 T4y = VFNMS(LDK(KP290284677), T4x, VMUL(LDK(KP956940335), T4u));
1691 T4J = VFMA(LDK(KP290284677), T4u, VMUL(LDK(KP956940335), T4x));
1692 }
1693 T4z = VADD(T4r, T4y);
1694 T4S = VSUB(T4J, T4I);
1695 T4K = VADD(T4I, T4J);
1696 T4O = VSUB(T4y, T4r);
1697 }
1698 {
1699 V T51, T5a, T54, T5b;
1700 {
1701 V T4Z, T50, T52, T53;
1702 T4Z = VSUB(T4l, T4m);
1703 T50 = VSUB(T4p, T4o);
1704 T51 = VFMA(LDK(KP881921264), T4Z, VMUL(LDK(KP471396736), T50));
1705 T5a = VFNMS(LDK(KP471396736), T4Z, VMUL(LDK(KP881921264), T50));
1706 T52 = VSUB(T4s, T4t);
1707 T53 = VSUB(T4w, T4v);
1708 T54 = VFNMS(LDK(KP471396736), T53, VMUL(LDK(KP881921264), T52));
1709 T5b = VFMA(LDK(KP471396736), T52, VMUL(LDK(KP881921264), T53));
1710 }
1711 T55 = VADD(T51, T54);
1712 T5k = VSUB(T5b, T5a);
1713 T5c = VADD(T5a, T5b);
1714 T5g = VSUB(T54, T51);
1715 }
1716 {
1717 V T4A, T4L, T5i, T5l;
1718 T4A = VADD(T4k, T4z);
1719 T4L = VBYI(VADD(T4H, T4K));
1720 ST(&(x[WS(rs, 61)]), VSUB(T4A, T4L), ms, &(x[WS(rs, 1)]));
1721 ST(&(x[WS(rs, 3)]), VADD(T4A, T4L), ms, &(x[WS(rs, 1)]));
1722 T5i = VBYI(VSUB(T5g, T5h));
1723 T5l = VSUB(T5j, T5k);
1724 ST(&(x[WS(rs, 21)]), VADD(T5i, T5l), ms, &(x[WS(rs, 1)]));
1725 ST(&(x[WS(rs, 43)]), VSUB(T5l, T5i), ms, &(x[WS(rs, 1)]));
1726 }
1727 {
1728 V T5m, T5n, T4M, T4N;
1729 T5m = VBYI(VADD(T5h, T5g));
1730 T5n = VADD(T5j, T5k);
1731 ST(&(x[WS(rs, 11)]), VADD(T5m, T5n), ms, &(x[WS(rs, 1)]));
1732 ST(&(x[WS(rs, 53)]), VSUB(T5n, T5m), ms, &(x[WS(rs, 1)]));
1733 T4M = VSUB(T4k, T4z);
1734 T4N = VBYI(VSUB(T4K, T4H));
1735 ST(&(x[WS(rs, 35)]), VSUB(T4M, T4N), ms, &(x[WS(rs, 1)]));
1736 ST(&(x[WS(rs, 29)]), VADD(T4M, T4N), ms, &(x[WS(rs, 1)]));
1737 }
1738 {
1739 V T4Q, T4T, T56, T5d;
1740 T4Q = VBYI(VSUB(T4O, T4P));
1741 T4T = VSUB(T4R, T4S);
1742 ST(&(x[WS(rs, 19)]), VADD(T4Q, T4T), ms, &(x[WS(rs, 1)]));
1743 ST(&(x[WS(rs, 45)]), VSUB(T4T, T4Q), ms, &(x[WS(rs, 1)]));
1744 T56 = VADD(T4Y, T55);
1745 T5d = VBYI(VADD(T59, T5c));
1746 ST(&(x[WS(rs, 59)]), VSUB(T56, T5d), ms, &(x[WS(rs, 1)]));
1747 ST(&(x[WS(rs, 5)]), VADD(T56, T5d), ms, &(x[WS(rs, 1)]));
1748 }
1749 {
1750 V T5e, T5f, T4U, T4V;
1751 T5e = VSUB(T4Y, T55);
1752 T5f = VBYI(VSUB(T5c, T59));
1753 ST(&(x[WS(rs, 37)]), VSUB(T5e, T5f), ms, &(x[WS(rs, 1)]));
1754 ST(&(x[WS(rs, 27)]), VADD(T5e, T5f), ms, &(x[WS(rs, 1)]));
1755 T4U = VBYI(VADD(T4P, T4O));
1756 T4V = VADD(T4R, T4S);
1757 ST(&(x[WS(rs, 13)]), VADD(T4U, T4V), ms, &(x[WS(rs, 1)]));
1758 ST(&(x[WS(rs, 51)]), VSUB(T4V, T4U), ms, &(x[WS(rs, 1)]));
1759 }
1760 }
/* Outputs 1, 7, 9, 15, 17, 23, 25, 31, 33, 39, 41, 47, 49, 55, 57, 63. */
1761 {
1762 V T1u, T43, T3D, T3V, T3t, T45, T3B, T3K, T3d, T3E, T3w, T3A, T3R, T46, T3Y;
1763 V T42;
1764 {
1765 V TE, T3T, T1t, T3U, T13, T1s;
1766 TE = VSUB(Tg, TD);
1767 T3T = VADD(T3o, T3h);
1768 T13 = VFMA(LDK(KP195090322), TV, VMUL(LDK(KP980785280), T12));
1769 T1s = VFNMS(LDK(KP195090322), T1r, VMUL(LDK(KP980785280), T1k));
1770 T1t = VSUB(T13, T1s);
1771 T3U = VADD(T1s, T13);
1772 T1u = VADD(TE, T1t);
1773 T43 = VSUB(T3U, T3T);
1774 T3D = VSUB(TE, T1t);
1775 T3V = VADD(T3T, T3U);
1776 }
1777 {
1778 V T3p, T3I, T3s, T3J, T3q, T3r;
1779 T3p = VSUB(T3h, T3o);
1780 T3I = VADD(Tg, TD);
1781 T3q = VFNMS(LDK(KP195090322), T12, VMUL(LDK(KP980785280), TV));
1782 T3r = VFMA(LDK(KP980785280), T1r, VMUL(LDK(KP195090322), T1k));
1783 T3s = VSUB(T3q, T3r);
1784 T3J = VADD(T3r, T3q);
1785 T3t = VADD(T3p, T3s);
1786 T45 = VSUB(T3I, T3J);
1787 T3B = VSUB(T3s, T3p);
1788 T3K = VADD(T3I, T3J);
1789 }
1790 {
1791 V T2l, T3u, T3c, T3v;
1792 {
1793 V T29, T2k, T30, T3b;
1794 T29 = VSUB(T1R, T28);
1795 T2k = VSUB(T2g, T2j);
1796 T2l = VFMA(LDK(KP634393284), T29, VMUL(LDK(KP773010453), T2k));
1797 T3u = VFNMS(LDK(KP634393284), T2k, VMUL(LDK(KP773010453), T29));
1798 T30 = VSUB(T2C, T2Z);
1799 T3b = VSUB(T33, T3a);
1800 T3c = VFNMS(LDK(KP634393284), T3b, VMUL(LDK(KP773010453), T30));
1801 T3v = VFMA(LDK(KP773010453), T3b, VMUL(LDK(KP634393284), T30));
1802 }
1803 T3d = VADD(T2l, T3c);
1804 T3E = VSUB(T3v, T3u);
1805 T3w = VADD(T3u, T3v);
1806 T3A = VSUB(T3c, T2l);
1807 }
1808 {
1809 V T3N, T3W, T3Q, T3X;
1810 {
1811 V T3L, T3M, T3O, T3P;
1812 T3L = VADD(T28, T1R);
1813 T3M = VADD(T2g, T2j);
1814 T3N = VFMA(LDK(KP098017140), T3L, VMUL(LDK(KP995184726), T3M));
1815 T3W = VFNMS(LDK(KP098017140), T3M, VMUL(LDK(KP995184726), T3L));
1816 T3O = VADD(T2C, T2Z);
1817 T3P = VADD(T3a, T33);
1818 T3Q = VFNMS(LDK(KP098017140), T3P, VMUL(LDK(KP995184726), T3O));
1819 T3X = VFMA(LDK(KP995184726), T3P, VMUL(LDK(KP098017140), T3O));
1820 }
1821 T3R = VADD(T3N, T3Q);
1822 T46 = VSUB(T3X, T3W);
1823 T3Y = VADD(T3W, T3X);
1824 T42 = VSUB(T3Q, T3N);
1825 }
1826 {
1827 V T3e, T3x, T44, T47;
1828 T3e = VADD(T1u, T3d);
1829 T3x = VBYI(VADD(T3t, T3w));
1830 ST(&(x[WS(rs, 57)]), VSUB(T3e, T3x), ms, &(x[WS(rs, 1)]));
1831 ST(&(x[WS(rs, 7)]), VADD(T3e, T3x), ms, &(x[WS(rs, 1)]));
1832 T44 = VBYI(VSUB(T42, T43));
1833 T47 = VSUB(T45, T46);
1834 ST(&(x[WS(rs, 17)]), VADD(T44, T47), ms, &(x[WS(rs, 1)]));
1835 ST(&(x[WS(rs, 47)]), VSUB(T47, T44), ms, &(x[WS(rs, 1)]));
1836 }
1837 {
1838 V T48, T49, T3y, T3z;
1839 T48 = VBYI(VADD(T43, T42));
1840 T49 = VADD(T45, T46);
1841 ST(&(x[WS(rs, 15)]), VADD(T48, T49), ms, &(x[WS(rs, 1)]));
1842 ST(&(x[WS(rs, 49)]), VSUB(T49, T48), ms, &(x[WS(rs, 1)]));
1843 T3y = VSUB(T1u, T3d);
1844 T3z = VBYI(VSUB(T3w, T3t));
1845 ST(&(x[WS(rs, 39)]), VSUB(T3y, T3z), ms, &(x[WS(rs, 1)]));
1846 ST(&(x[WS(rs, 25)]), VADD(T3y, T3z), ms, &(x[WS(rs, 1)]));
1847 }
1848 {
1849 V T3C, T3F, T3S, T3Z;
1850 T3C = VBYI(VSUB(T3A, T3B));
1851 T3F = VSUB(T3D, T3E);
1852 ST(&(x[WS(rs, 23)]), VADD(T3C, T3F), ms, &(x[WS(rs, 1)]));
1853 ST(&(x[WS(rs, 41)]), VSUB(T3F, T3C), ms, &(x[WS(rs, 1)]));
1854 T3S = VADD(T3K, T3R);
1855 T3Z = VBYI(VADD(T3V, T3Y));
1856 ST(&(x[WS(rs, 63)]), VSUB(T3S, T3Z), ms, &(x[WS(rs, 1)]));
1857 ST(&(x[WS(rs, 1)]), VADD(T3S, T3Z), ms, &(x[WS(rs, 1)]));
1858 }
1859 {
1860 V T40, T41, T3G, T3H;
1861 T40 = VSUB(T3K, T3R);
1862 T41 = VBYI(VSUB(T3Y, T3V));
1863 ST(&(x[WS(rs, 33)]), VSUB(T40, T41), ms, &(x[WS(rs, 1)]));
1864 ST(&(x[WS(rs, 31)]), VADD(T40, T41), ms, &(x[WS(rs, 1)]));
1865 T3G = VBYI(VADD(T3B, T3A));
1866 T3H = VADD(T3D, T3E);
1867 ST(&(x[WS(rs, 9)]), VADD(T3G, T3H), ms, &(x[WS(rs, 1)]));
1868 ST(&(x[WS(rs, 55)]), VSUB(T3H, T3G), ms, &(x[WS(rs, 1)]));
1869 }
1870 }
1871 }
1872 }
/* Runs once after the loop; VLEAVE is defined by the SIMD headers. */
1873 VLEAVE();
1874 }
1875
1876 static const tw_instr twinstr[] = {
1877 VTW(0, 1),
1878 VTW(0, 2),
1879 VTW(0, 3),
1880 VTW(0, 4),
1881 VTW(0, 5),
1882 VTW(0, 6),
1883 VTW(0, 7),
1884 VTW(0, 8),
1885 VTW(0, 9),
1886 VTW(0, 10),
1887 VTW(0, 11),
1888 VTW(0, 12),
1889 VTW(0, 13),
1890 VTW(0, 14),
1891 VTW(0, 15),
1892 VTW(0, 16),
1893 VTW(0, 17),
1894 VTW(0, 18),
1895 VTW(0, 19),
1896 VTW(0, 20),
1897 VTW(0, 21),
1898 VTW(0, 22),
1899 VTW(0, 23),
1900 VTW(0, 24),
1901 VTW(0, 25),
1902 VTW(0, 26),
1903 VTW(0, 27),
1904 VTW(0, 28),
1905 VTW(0, 29),
1906 VTW(0, 30),
1907 VTW(0, 31),
1908 VTW(0, 32),
1909 VTW(0, 33),
1910 VTW(0, 34),
1911 VTW(0, 35),
1912 VTW(0, 36),
1913 VTW(0, 37),
1914 VTW(0, 38),
1915 VTW(0, 39),
1916 VTW(0, 40),
1917 VTW(0, 41),
1918 VTW(0, 42),
1919 VTW(0, 43),
1920 VTW(0, 44),
1921 VTW(0, 45),
1922 VTW(0, 46),
1923 VTW(0, 47),
1924 VTW(0, 48),
1925 VTW(0, 49),
1926 VTW(0, 50),
1927 VTW(0, 51),
1928 VTW(0, 52),
1929 VTW(0, 53),
1930 VTW(0, 54),
1931 VTW(0, 55),
1932 VTW(0, 56),
1933 VTW(0, 57),
1934 VTW(0, 58),
1935 VTW(0, 59),
1936 VTW(0, 60),
1937 VTW(0, 61),
1938 VTW(0, 62),
1939 VTW(0, 63),
1940 {TW_NEXT, VL, 0}
1941 };
1942
1943 static const ct_desc desc = { 64, XSIMD_STRING("t2fv_64"), twinstr, &GENUS, {467, 198, 52, 0}, 0, 0, 0 };
1944
1945 void XSIMD(codelet_t2fv_64) (planner *p) {
1946 X(kdft_dit_register) (p, t2fv_64, &desc);
1947 }
1948 #endif