comparison src/fftw-3.3.8/dft/simd/common/t1sv_32.c @ 167:bd3cc4d1df30

Add FFTW 3.3.8 source, and a Linux build
author Chris Cannam <cannam@all-day-breakfast.com>
date Tue, 19 Nov 2019 14:52:55 +0000
parents
children
comparison
equal deleted inserted replaced
166:cbd6d7e562c7 167:bd3cc4d1df30
1 /*
2 * Copyright (c) 2003, 2007-14 Matteo Frigo
3 * Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
4 *
5 * This program is free software; you can redistribute it and/or modify
6 * it under the terms of the GNU General Public License as published by
7 * the Free Software Foundation; either version 2 of the License, or
8 * (at your option) any later version.
9 *
10 * This program is distributed in the hope that it will be useful,
11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 * GNU General Public License for more details.
14 *
15 * You should have received a copy of the GNU General Public License
16 * along with this program; if not, write to the Free Software
17 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
18 *
19 */
20
21 /* This file was automatically generated --- DO NOT EDIT */
22 /* Generated on Thu May 24 08:06:10 EDT 2018 */
23
24 #include "dft/codelet-dft.h"
25
26 #if defined(ARCH_PREFERS_FMA) || defined(ISA_EXTENSION_PREFERS_FMA)
27
28 /* Generated by: ../../../genfft/gen_twiddle.native -fma -simd -compact -variables 4 -pipeline-latency 8 -n 32 -name t1sv_32 -include dft/simd/ts.h */
29
30 /*
31 * This function contains 434 FP additions, 260 FP multiplications,
32 * (or, 236 additions, 62 multiplications, 198 fused multiply/add),
33 * 102 stack variables, 7 constants, and 128 memory accesses
34 */
35 #include "dft/simd/ts.h"
36
/*
 * t1sv_32 (FMA variant): size-32 twiddle codelet emitted by genfft (see the
 * "Generated by" comment above: gen_twiddle -fma -simd -n 32).
 *
 * ri, ii : arrays that are both read and written in place at the 32 element
 *          offsets WS(rs, k), k = 0..31 (presumably the real and imaginary
 *          parts of the data -- inferred from the names, confirm).
 * W      : table of multipliers; the code starts at W + mb * 62 and advances
 *          by (2 * VL) * 62 per loop pass.  Input k (k = 1..31) reads the
 *          pair W[TWVL * (2k - 2)], W[TWVL * (2k - 1)].
 * rs     : stride handed to WS() to address the 32 points.
 * mb, me : the loop runs m from mb while m < me, in steps of 2 * VL.
 * ms     : ri and ii advance by (2 * VL) * ms per loop pass; ms is also
 *          passed to every LD/ST.
 *
 * Every input k = 1..31 is first combined with its pair (wa, wb) as
 *     re' = VFMA(wb, im, VMUL(wa, re)),   im' = VFNMS(wb, re, VMUL(wa, im)).
 * Input 0 is used exactly as loaded.
 * NOTE(review): the vector macros (LD, ST, LDW, LDK, VFMA, VFNMS, ...) are
 * defined outside this file.  Assuming the usual meanings VFMA(a,b,c) = a*b + c
 * and VFNMS(a,b,c) = c - a*b, the step above multiplies (re + i*im) by
 * (wa - i*wb) -- confirm against the SIMD headers.
 */
37 static void t1sv_32(R *ri, R *ii, const R *W, stride rs, INT mb, INT me, INT ms)
38 {
/* Vector constants for the butterflies below (declared with DVK, read with LDK). */
39 DVK(KP980785280, +0.980785280403230449126182236134239036973933731);
40 DVK(KP831469612, +0.831469612302545237078788377617905756738560812);
41 DVK(KP198912367, +0.198912367379658006911597622644676228597850501);
42 DVK(KP668178637, +0.668178637919298919997757686523080761552472251);
43 DVK(KP923879532, +0.923879532511286756128183189396788286822416626);
44 DVK(KP414213562, +0.414213562373095048801688724209698078569671875);
45 DVK(KP707106781, +0.707106781186547524400844362104849039284835938);
46 {
47 INT m;
/* m advances by 2 * VL per pass; ri/ii move by (2 * VL) * ms and W by (2 * VL) * 62. */
48 for (m = mb, W = W + (mb * 62); m < me; m = m + (2 * VL), ri = ri + ((2 * VL) * ms), ii = ii + ((2 * VL) * ms), W = W + ((2 * VL) * 62), MAKE_VOLATILE_STRIDE(64, rs)) {
/* Intermediates assigned in the load blocks and consumed by the four store blocks further down. */
49 V T8, T8x, T3w, T87, Tl, T8y, T3B, T83, Tz, T6F, T3J, T5T, TM, T6G, T3Q;
50 V T5U, T11, T1e, T6M, T6J, T6K, T6L, T3Z, T5X, T46, T5Y, T1s, T1F, T6O, T6P;
51 V T6Q, T6R, T4e, T60, T4l, T61, T32, T7b, T78, T7N, T54, T6f, T5r, T6c, T29;
52 V T70, T6X, T7I, T4v, T68, T4S, T65, T3t, T79, T7e, T7O, T5b, T5s, T5i, T5t;
53 V T2A, T6Y, T73, T7J, T4C, T4T, T4J, T4U;
/* Inputs 0 and 16: input 0 is used as loaded; input 16 is combined with W[TWVL * 30], W[TWVL * 31]. */
54 {
55 V T1, T86, T3, T6, T4, T84, T2, T7, T85, T5;
56 T1 = LD(&(ri[0]), ms, &(ri[0]));
57 T86 = LD(&(ii[0]), ms, &(ii[0]));
58 T3 = LD(&(ri[WS(rs, 16)]), ms, &(ri[0]));
59 T6 = LD(&(ii[WS(rs, 16)]), ms, &(ii[0]));
60 T2 = LDW(&(W[TWVL * 30]));
61 T4 = VMUL(T2, T3);
62 T84 = VMUL(T2, T6);
63 T5 = LDW(&(W[TWVL * 31]));
64 T7 = VFMA(T5, T6, T4);
65 T85 = VFNMS(T5, T3, T84);
66 T8 = VADD(T1, T7);
67 T8x = VSUB(T86, T85);
68 T3w = VSUB(T1, T7);
69 T87 = VADD(T85, T86);
70 }
/* Inputs 8 and 24 (W pairs 14/15 and 46/47), then their sums and differences. */
71 {
72 V Ta, Td, Tb, T3x, Tg, Tj, Th, T3z, T9, Tf;
73 Ta = LD(&(ri[WS(rs, 8)]), ms, &(ri[0]));
74 Td = LD(&(ii[WS(rs, 8)]), ms, &(ii[0]));
75 T9 = LDW(&(W[TWVL * 14]));
76 Tb = VMUL(T9, Ta);
77 T3x = VMUL(T9, Td);
78 Tg = LD(&(ri[WS(rs, 24)]), ms, &(ri[0]));
79 Tj = LD(&(ii[WS(rs, 24)]), ms, &(ii[0]));
80 Tf = LDW(&(W[TWVL * 46]));
81 Th = VMUL(Tf, Tg);
82 T3z = VMUL(Tf, Tj);
83 {
84 V Te, T3y, Tk, T3A, Tc, Ti;
85 Tc = LDW(&(W[TWVL * 15]));
86 Te = VFMA(Tc, Td, Tb);
87 T3y = VFNMS(Tc, Ta, T3x);
88 Ti = LDW(&(W[TWVL * 47]));
89 Tk = VFMA(Ti, Tj, Th);
90 T3A = VFNMS(Ti, Tg, T3z);
91 Tl = VADD(Te, Tk);
92 T8y = VSUB(Te, Tk);
93 T3B = VSUB(T3y, T3A);
94 T83 = VADD(T3y, T3A);
95 }
96 }
/* Inputs 4 and 20 (W pairs 6/7 and 38/39). */
97 {
98 V Ts, T3F, Ty, T3H, T3D, T3I;
99 {
100 V To, Tr, Tp, T3E, Tn, Tq;
101 To = LD(&(ri[WS(rs, 4)]), ms, &(ri[0]));
102 Tr = LD(&(ii[WS(rs, 4)]), ms, &(ii[0]));
103 Tn = LDW(&(W[TWVL * 6]));
104 Tp = VMUL(Tn, To);
105 T3E = VMUL(Tn, Tr);
106 Tq = LDW(&(W[TWVL * 7]));
107 Ts = VFMA(Tq, Tr, Tp);
108 T3F = VFNMS(Tq, To, T3E);
109 }
110 {
111 V Tu, Tx, Tv, T3G, Tt, Tw;
112 Tu = LD(&(ri[WS(rs, 20)]), ms, &(ri[0]));
113 Tx = LD(&(ii[WS(rs, 20)]), ms, &(ii[0]));
114 Tt = LDW(&(W[TWVL * 38]));
115 Tv = VMUL(Tt, Tu);
116 T3G = VMUL(Tt, Tx);
117 Tw = LDW(&(W[TWVL * 39]));
118 Ty = VFMA(Tw, Tx, Tv);
119 T3H = VFNMS(Tw, Tu, T3G);
120 }
121 Tz = VADD(Ts, Ty);
122 T6F = VADD(T3F, T3H);
123 T3D = VSUB(Ts, Ty);
124 T3I = VSUB(T3F, T3H);
125 T3J = VADD(T3D, T3I);
126 T5T = VSUB(T3I, T3D);
127 }
/* Inputs 28 and 12 (W pairs 54/55 and 22/23). */
128 {
129 V TF, T3M, TL, T3O, T3K, T3P;
130 {
131 V TB, TE, TC, T3L, TA, TD;
132 TB = LD(&(ri[WS(rs, 28)]), ms, &(ri[0]));
133 TE = LD(&(ii[WS(rs, 28)]), ms, &(ii[0]));
134 TA = LDW(&(W[TWVL * 54]));
135 TC = VMUL(TA, TB);
136 T3L = VMUL(TA, TE);
137 TD = LDW(&(W[TWVL * 55]));
138 TF = VFMA(TD, TE, TC);
139 T3M = VFNMS(TD, TB, T3L);
140 }
141 {
142 V TH, TK, TI, T3N, TG, TJ;
143 TH = LD(&(ri[WS(rs, 12)]), ms, &(ri[0]));
144 TK = LD(&(ii[WS(rs, 12)]), ms, &(ii[0]));
145 TG = LDW(&(W[TWVL * 22]));
146 TI = VMUL(TG, TH);
147 T3N = VMUL(TG, TK);
148 TJ = LDW(&(W[TWVL * 23]));
149 TL = VFMA(TJ, TK, TI);
150 T3O = VFNMS(TJ, TH, T3N);
151 }
152 TM = VADD(TF, TL);
153 T6G = VADD(T3M, T3O);
154 T3K = VSUB(TF, TL);
155 T3P = VSUB(T3M, T3O);
156 T3Q = VSUB(T3K, T3P);
157 T5U = VADD(T3K, T3P);
158 }
/* Inputs 2, 26, 18 and 10 (W pairs 2/3, 50/51, 34/35, 18/19). */
159 {
160 V TU, T3U, T1d, T44, T10, T3W, T17, T42;
161 {
162 V TQ, TT, TR, T3T, TP, TS;
163 TQ = LD(&(ri[WS(rs, 2)]), ms, &(ri[0]));
164 TT = LD(&(ii[WS(rs, 2)]), ms, &(ii[0]));
165 TP = LDW(&(W[TWVL * 2]));
166 TR = VMUL(TP, TQ);
167 T3T = VMUL(TP, TT);
168 TS = LDW(&(W[TWVL * 3]));
169 TU = VFMA(TS, TT, TR);
170 T3U = VFNMS(TS, TQ, T3T);
171 }
172 {
173 V T19, T1c, T1a, T43, T18, T1b;
174 T19 = LD(&(ri[WS(rs, 26)]), ms, &(ri[0]));
175 T1c = LD(&(ii[WS(rs, 26)]), ms, &(ii[0]));
176 T18 = LDW(&(W[TWVL * 50]));
177 T1a = VMUL(T18, T19);
178 T43 = VMUL(T18, T1c);
179 T1b = LDW(&(W[TWVL * 51]));
180 T1d = VFMA(T1b, T1c, T1a);
181 T44 = VFNMS(T1b, T19, T43);
182 }
183 {
184 V TW, TZ, TX, T3V, TV, TY;
185 TW = LD(&(ri[WS(rs, 18)]), ms, &(ri[0]));
186 TZ = LD(&(ii[WS(rs, 18)]), ms, &(ii[0]));
187 TV = LDW(&(W[TWVL * 34]));
188 TX = VMUL(TV, TW);
189 T3V = VMUL(TV, TZ);
190 TY = LDW(&(W[TWVL * 35]));
191 T10 = VFMA(TY, TZ, TX);
192 T3W = VFNMS(TY, TW, T3V);
193 }
194 {
195 V T13, T16, T14, T41, T12, T15;
196 T13 = LD(&(ri[WS(rs, 10)]), ms, &(ri[0]));
197 T16 = LD(&(ii[WS(rs, 10)]), ms, &(ii[0]));
198 T12 = LDW(&(W[TWVL * 18]));
199 T14 = VMUL(T12, T13);
200 T41 = VMUL(T12, T16);
201 T15 = LDW(&(W[TWVL * 19]));
202 T17 = VFMA(T15, T16, T14);
203 T42 = VFNMS(T15, T13, T41);
204 }
205 T11 = VADD(TU, T10);
206 T1e = VADD(T17, T1d);
207 T6M = VSUB(T11, T1e);
208 T6J = VADD(T3U, T3W);
209 T6K = VADD(T42, T44);
210 T6L = VSUB(T6J, T6K);
211 {
212 V T3X, T3Y, T40, T45;
213 T3X = VSUB(T3U, T3W);
214 T3Y = VSUB(T17, T1d);
215 T3Z = VSUB(T3X, T3Y);
216 T5X = VADD(T3X, T3Y);
217 T40 = VSUB(TU, T10);
218 T45 = VSUB(T42, T44);
219 T46 = VADD(T40, T45);
220 T5Y = VSUB(T40, T45);
221 }
222 }
/* Inputs 30, 22, 14 and 6 (W pairs 58/59, 42/43, 26/27, 10/11). */
223 {
224 V T1l, T49, T1E, T4j, T1r, T4b, T1y, T4h;
225 {
226 V T1h, T1k, T1i, T48, T1g, T1j;
227 T1h = LD(&(ri[WS(rs, 30)]), ms, &(ri[0]));
228 T1k = LD(&(ii[WS(rs, 30)]), ms, &(ii[0]));
229 T1g = LDW(&(W[TWVL * 58]));
230 T1i = VMUL(T1g, T1h);
231 T48 = VMUL(T1g, T1k);
232 T1j = LDW(&(W[TWVL * 59]));
233 T1l = VFMA(T1j, T1k, T1i);
234 T49 = VFNMS(T1j, T1h, T48);
235 }
236 {
237 V T1A, T1D, T1B, T4i, T1z, T1C;
238 T1A = LD(&(ri[WS(rs, 22)]), ms, &(ri[0]));
239 T1D = LD(&(ii[WS(rs, 22)]), ms, &(ii[0]));
240 T1z = LDW(&(W[TWVL * 42]));
241 T1B = VMUL(T1z, T1A);
242 T4i = VMUL(T1z, T1D);
243 T1C = LDW(&(W[TWVL * 43]));
244 T1E = VFMA(T1C, T1D, T1B);
245 T4j = VFNMS(T1C, T1A, T4i);
246 }
247 {
248 V T1n, T1q, T1o, T4a, T1m, T1p;
249 T1n = LD(&(ri[WS(rs, 14)]), ms, &(ri[0]));
250 T1q = LD(&(ii[WS(rs, 14)]), ms, &(ii[0]));
251 T1m = LDW(&(W[TWVL * 26]));
252 T1o = VMUL(T1m, T1n);
253 T4a = VMUL(T1m, T1q);
254 T1p = LDW(&(W[TWVL * 27]));
255 T1r = VFMA(T1p, T1q, T1o);
256 T4b = VFNMS(T1p, T1n, T4a);
257 }
258 {
259 V T1u, T1x, T1v, T4g, T1t, T1w;
260 T1u = LD(&(ri[WS(rs, 6)]), ms, &(ri[0]));
261 T1x = LD(&(ii[WS(rs, 6)]), ms, &(ii[0]));
262 T1t = LDW(&(W[TWVL * 10]));
263 T1v = VMUL(T1t, T1u);
264 T4g = VMUL(T1t, T1x);
265 T1w = LDW(&(W[TWVL * 11]));
266 T1y = VFMA(T1w, T1x, T1v);
267 T4h = VFNMS(T1w, T1u, T4g);
268 }
269 T1s = VADD(T1l, T1r);
270 T1F = VADD(T1y, T1E);
271 T6O = VSUB(T1s, T1F);
272 T6P = VADD(T49, T4b);
273 T6Q = VADD(T4h, T4j);
274 T6R = VSUB(T6P, T6Q);
275 {
276 V T4c, T4d, T4f, T4k;
277 T4c = VSUB(T49, T4b);
278 T4d = VSUB(T1y, T1E);
279 T4e = VSUB(T4c, T4d);
280 T60 = VADD(T4c, T4d);
281 T4f = VSUB(T1l, T1r);
282 T4k = VSUB(T4h, T4j);
283 T4l = VADD(T4f, T4k);
284 T61 = VSUB(T4f, T4k);
285 }
286 }
/* Inputs 31, 23, 15 and 7 (W pairs 60/61, 44/45, 28/29, 12/13). */
287 {
288 V T2H, T4Z, T30, T5p, T2N, T51, T2U, T5n;
289 {
290 V T2D, T2G, T2E, T4Y, T2C, T2F;
291 T2D = LD(&(ri[WS(rs, 31)]), ms, &(ri[WS(rs, 1)]));
292 T2G = LD(&(ii[WS(rs, 31)]), ms, &(ii[WS(rs, 1)]));
293 T2C = LDW(&(W[TWVL * 60]));
294 T2E = VMUL(T2C, T2D);
295 T4Y = VMUL(T2C, T2G);
296 T2F = LDW(&(W[TWVL * 61]));
297 T2H = VFMA(T2F, T2G, T2E);
298 T4Z = VFNMS(T2F, T2D, T4Y);
299 }
300 {
301 V T2W, T2Z, T2X, T5o, T2V, T2Y;
302 T2W = LD(&(ri[WS(rs, 23)]), ms, &(ri[WS(rs, 1)]));
303 T2Z = LD(&(ii[WS(rs, 23)]), ms, &(ii[WS(rs, 1)]));
304 T2V = LDW(&(W[TWVL * 44]));
305 T2X = VMUL(T2V, T2W);
306 T5o = VMUL(T2V, T2Z);
307 T2Y = LDW(&(W[TWVL * 45]));
308 T30 = VFMA(T2Y, T2Z, T2X);
309 T5p = VFNMS(T2Y, T2W, T5o);
310 }
311 {
312 V T2J, T2M, T2K, T50, T2I, T2L;
313 T2J = LD(&(ri[WS(rs, 15)]), ms, &(ri[WS(rs, 1)]));
314 T2M = LD(&(ii[WS(rs, 15)]), ms, &(ii[WS(rs, 1)]));
315 T2I = LDW(&(W[TWVL * 28]));
316 T2K = VMUL(T2I, T2J);
317 T50 = VMUL(T2I, T2M);
318 T2L = LDW(&(W[TWVL * 29]));
319 T2N = VFMA(T2L, T2M, T2K);
320 T51 = VFNMS(T2L, T2J, T50);
321 }
322 {
323 V T2Q, T2T, T2R, T5m, T2P, T2S;
324 T2Q = LD(&(ri[WS(rs, 7)]), ms, &(ri[WS(rs, 1)]));
325 T2T = LD(&(ii[WS(rs, 7)]), ms, &(ii[WS(rs, 1)]));
326 T2P = LDW(&(W[TWVL * 12]));
327 T2R = VMUL(T2P, T2Q);
328 T5m = VMUL(T2P, T2T);
329 T2S = LDW(&(W[TWVL * 13]));
330 T2U = VFMA(T2S, T2T, T2R);
331 T5n = VFNMS(T2S, T2Q, T5m);
332 }
333 {
334 V T2O, T31, T76, T77;
335 T2O = VADD(T2H, T2N);
336 T31 = VADD(T2U, T30);
337 T32 = VADD(T2O, T31);
338 T7b = VSUB(T2O, T31);
339 T76 = VADD(T4Z, T51);
340 T77 = VADD(T5n, T5p);
341 T78 = VSUB(T76, T77);
342 T7N = VADD(T76, T77);
343 }
344 {
345 V T52, T53, T5l, T5q;
346 T52 = VSUB(T4Z, T51);
347 T53 = VSUB(T2U, T30);
348 T54 = VSUB(T52, T53);
349 T6f = VADD(T52, T53);
350 T5l = VSUB(T2H, T2N);
351 T5q = VSUB(T5n, T5p);
352 T5r = VADD(T5l, T5q);
353 T6c = VSUB(T5l, T5q);
354 }
355 }
/* Inputs 1, 25, 17 and 9 (W pairs 0/1, 48/49, 32/33, 16/17). */
356 {
357 V T1O, T4q, T27, T4Q, T1U, T4s, T21, T4O;
358 {
359 V T1K, T1N, T1L, T4p, T1J, T1M;
360 T1K = LD(&(ri[WS(rs, 1)]), ms, &(ri[WS(rs, 1)]));
361 T1N = LD(&(ii[WS(rs, 1)]), ms, &(ii[WS(rs, 1)]));
362 T1J = LDW(&(W[0]));
363 T1L = VMUL(T1J, T1K);
364 T4p = VMUL(T1J, T1N);
365 T1M = LDW(&(W[TWVL * 1]));
366 T1O = VFMA(T1M, T1N, T1L);
367 T4q = VFNMS(T1M, T1K, T4p);
368 }
369 {
370 V T23, T26, T24, T4P, T22, T25;
371 T23 = LD(&(ri[WS(rs, 25)]), ms, &(ri[WS(rs, 1)]));
372 T26 = LD(&(ii[WS(rs, 25)]), ms, &(ii[WS(rs, 1)]));
373 T22 = LDW(&(W[TWVL * 48]));
374 T24 = VMUL(T22, T23);
375 T4P = VMUL(T22, T26);
376 T25 = LDW(&(W[TWVL * 49]));
377 T27 = VFMA(T25, T26, T24);
378 T4Q = VFNMS(T25, T23, T4P);
379 }
380 {
381 V T1Q, T1T, T1R, T4r, T1P, T1S;
382 T1Q = LD(&(ri[WS(rs, 17)]), ms, &(ri[WS(rs, 1)]));
383 T1T = LD(&(ii[WS(rs, 17)]), ms, &(ii[WS(rs, 1)]));
384 T1P = LDW(&(W[TWVL * 32]));
385 T1R = VMUL(T1P, T1Q);
386 T4r = VMUL(T1P, T1T);
387 T1S = LDW(&(W[TWVL * 33]));
388 T1U = VFMA(T1S, T1T, T1R);
389 T4s = VFNMS(T1S, T1Q, T4r);
390 }
391 {
392 V T1X, T20, T1Y, T4N, T1W, T1Z;
393 T1X = LD(&(ri[WS(rs, 9)]), ms, &(ri[WS(rs, 1)]));
394 T20 = LD(&(ii[WS(rs, 9)]), ms, &(ii[WS(rs, 1)]));
395 T1W = LDW(&(W[TWVL * 16]));
396 T1Y = VMUL(T1W, T1X);
397 T4N = VMUL(T1W, T20);
398 T1Z = LDW(&(W[TWVL * 17]));
399 T21 = VFMA(T1Z, T20, T1Y);
400 T4O = VFNMS(T1Z, T1X, T4N);
401 }
402 {
403 V T1V, T28, T6V, T6W;
404 T1V = VADD(T1O, T1U);
405 T28 = VADD(T21, T27);
406 T29 = VADD(T1V, T28);
407 T70 = VSUB(T1V, T28);
408 T6V = VADD(T4q, T4s);
409 T6W = VADD(T4O, T4Q);
410 T6X = VSUB(T6V, T6W);
411 T7I = VADD(T6V, T6W);
412 }
413 {
414 V T4t, T4u, T4M, T4R;
415 T4t = VSUB(T4q, T4s);
416 T4u = VSUB(T21, T27);
417 T4v = VSUB(T4t, T4u);
418 T68 = VADD(T4t, T4u);
419 T4M = VSUB(T1O, T1U);
420 T4R = VSUB(T4O, T4Q);
421 T4S = VADD(T4M, T4R);
422 T65 = VSUB(T4M, T4R);
423 }
424 }
/* Inputs 3, 11, 19 and 27 (W pairs 4/5, 20/21, 36/37, 52/53). */
425 {
426 V T38, T56, T3r, T5g, T3e, T58, T3l, T5e;
427 {
428 V T34, T37, T35, T55, T33, T36;
429 T34 = LD(&(ri[WS(rs, 3)]), ms, &(ri[WS(rs, 1)]));
430 T37 = LD(&(ii[WS(rs, 3)]), ms, &(ii[WS(rs, 1)]));
431 T33 = LDW(&(W[TWVL * 4]));
432 T35 = VMUL(T33, T34);
433 T55 = VMUL(T33, T37);
434 T36 = LDW(&(W[TWVL * 5]));
435 T38 = VFMA(T36, T37, T35);
436 T56 = VFNMS(T36, T34, T55);
437 }
438 {
439 V T3n, T3q, T3o, T5f, T3m, T3p;
440 T3n = LD(&(ri[WS(rs, 11)]), ms, &(ri[WS(rs, 1)]));
441 T3q = LD(&(ii[WS(rs, 11)]), ms, &(ii[WS(rs, 1)]));
442 T3m = LDW(&(W[TWVL * 20]));
443 T3o = VMUL(T3m, T3n);
444 T5f = VMUL(T3m, T3q);
445 T3p = LDW(&(W[TWVL * 21]));
446 T3r = VFMA(T3p, T3q, T3o);
447 T5g = VFNMS(T3p, T3n, T5f);
448 }
449 {
450 V T3a, T3d, T3b, T57, T39, T3c;
451 T3a = LD(&(ri[WS(rs, 19)]), ms, &(ri[WS(rs, 1)]));
452 T3d = LD(&(ii[WS(rs, 19)]), ms, &(ii[WS(rs, 1)]));
453 T39 = LDW(&(W[TWVL * 36]));
454 T3b = VMUL(T39, T3a);
455 T57 = VMUL(T39, T3d);
456 T3c = LDW(&(W[TWVL * 37]));
457 T3e = VFMA(T3c, T3d, T3b);
458 T58 = VFNMS(T3c, T3a, T57);
459 }
460 {
461 V T3h, T3k, T3i, T5d, T3g, T3j;
462 T3h = LD(&(ri[WS(rs, 27)]), ms, &(ri[WS(rs, 1)]));
463 T3k = LD(&(ii[WS(rs, 27)]), ms, &(ii[WS(rs, 1)]));
464 T3g = LDW(&(W[TWVL * 52]));
465 T3i = VMUL(T3g, T3h);
466 T5d = VMUL(T3g, T3k);
467 T3j = LDW(&(W[TWVL * 53]));
468 T3l = VFMA(T3j, T3k, T3i);
469 T5e = VFNMS(T3j, T3h, T5d);
470 }
471 {
472 V T3f, T3s, T7c, T7d;
473 T3f = VADD(T38, T3e);
474 T3s = VADD(T3l, T3r);
475 T3t = VADD(T3f, T3s);
476 T79 = VSUB(T3s, T3f);
477 T7c = VADD(T56, T58);
478 T7d = VADD(T5e, T5g);
479 T7e = VSUB(T7c, T7d);
480 T7O = VADD(T7c, T7d);
481 }
482 {
483 V T59, T5a, T5c, T5h;
484 T59 = VSUB(T56, T58);
485 T5a = VSUB(T38, T3e);
486 T5b = VSUB(T59, T5a);
487 T5s = VADD(T5a, T59);
488 T5c = VSUB(T3l, T3r);
489 T5h = VSUB(T5e, T5g);
490 T5i = VADD(T5c, T5h);
491 T5t = VSUB(T5c, T5h);
492 }
493 }
/* Inputs 5, 13, 21 and 29 (W pairs 8/9, 24/25, 40/41, 56/57). */
494 {
495 V T2f, T4x, T2y, T4H, T2l, T4z, T2s, T4F;
496 {
497 V T2b, T2e, T2c, T4w, T2a, T2d;
498 T2b = LD(&(ri[WS(rs, 5)]), ms, &(ri[WS(rs, 1)]));
499 T2e = LD(&(ii[WS(rs, 5)]), ms, &(ii[WS(rs, 1)]));
500 T2a = LDW(&(W[TWVL * 8]));
501 T2c = VMUL(T2a, T2b);
502 T4w = VMUL(T2a, T2e);
503 T2d = LDW(&(W[TWVL * 9]));
504 T2f = VFMA(T2d, T2e, T2c);
505 T4x = VFNMS(T2d, T2b, T4w);
506 }
507 {
508 V T2u, T2x, T2v, T4G, T2t, T2w;
509 T2u = LD(&(ri[WS(rs, 13)]), ms, &(ri[WS(rs, 1)]));
510 T2x = LD(&(ii[WS(rs, 13)]), ms, &(ii[WS(rs, 1)]));
511 T2t = LDW(&(W[TWVL * 24]));
512 T2v = VMUL(T2t, T2u);
513 T4G = VMUL(T2t, T2x);
514 T2w = LDW(&(W[TWVL * 25]));
515 T2y = VFMA(T2w, T2x, T2v);
516 T4H = VFNMS(T2w, T2u, T4G);
517 }
518 {
519 V T2h, T2k, T2i, T4y, T2g, T2j;
520 T2h = LD(&(ri[WS(rs, 21)]), ms, &(ri[WS(rs, 1)]));
521 T2k = LD(&(ii[WS(rs, 21)]), ms, &(ii[WS(rs, 1)]));
522 T2g = LDW(&(W[TWVL * 40]));
523 T2i = VMUL(T2g, T2h);
524 T4y = VMUL(T2g, T2k);
525 T2j = LDW(&(W[TWVL * 41]));
526 T2l = VFMA(T2j, T2k, T2i);
527 T4z = VFNMS(T2j, T2h, T4y);
528 }
529 {
530 V T2o, T2r, T2p, T4E, T2n, T2q;
531 T2o = LD(&(ri[WS(rs, 29)]), ms, &(ri[WS(rs, 1)]));
532 T2r = LD(&(ii[WS(rs, 29)]), ms, &(ii[WS(rs, 1)]));
533 T2n = LDW(&(W[TWVL * 56]));
534 T2p = VMUL(T2n, T2o);
535 T4E = VMUL(T2n, T2r);
536 T2q = LDW(&(W[TWVL * 57]));
537 T2s = VFMA(T2q, T2r, T2p);
538 T4F = VFNMS(T2q, T2o, T4E);
539 }
540 {
541 V T2m, T2z, T71, T72;
542 T2m = VADD(T2f, T2l);
543 T2z = VADD(T2s, T2y);
544 T2A = VADD(T2m, T2z);
545 T6Y = VSUB(T2z, T2m);
546 T71 = VADD(T4x, T4z);
547 T72 = VADD(T4F, T4H);
548 T73 = VSUB(T71, T72);
549 T7J = VADD(T71, T72);
550 }
551 {
552 V T4A, T4B, T4D, T4I;
553 T4A = VSUB(T4x, T4z);
554 T4B = VSUB(T2f, T2l);
555 T4C = VSUB(T4A, T4B);
556 T4T = VADD(T4B, T4A);
557 T4D = VSUB(T2s, T2y);
558 T4I = VSUB(T4F, T4H);
559 T4J = VADD(T4D, T4I);
560 T4U = VSUB(T4D, T4I);
561 }
562 }
/* Store block 1: writes ri/ii at indices 0, 4, 8, ..., 28 (index = 0 mod 4). */
563 {
564 V TO, T7C, T7Z, T80, T89, T8e, T1H, T8d, T3v, T8b, T7L, T7T, T7Q, T7U, T7F;
565 V T81;
566 {
567 V Tm, TN, T7X, T7Y;
568 Tm = VADD(T8, Tl);
569 TN = VADD(Tz, TM);
570 TO = VADD(Tm, TN);
571 T7C = VSUB(Tm, TN);
572 T7X = VADD(T7I, T7J);
573 T7Y = VADD(T7N, T7O);
574 T7Z = VSUB(T7X, T7Y);
575 T80 = VADD(T7X, T7Y);
576 }
577 {
578 V T82, T88, T1f, T1G;
579 T82 = VADD(T6F, T6G);
580 T88 = VADD(T83, T87);
581 T89 = VADD(T82, T88);
582 T8e = VSUB(T88, T82);
583 T1f = VADD(T11, T1e);
584 T1G = VADD(T1s, T1F);
585 T1H = VADD(T1f, T1G);
586 T8d = VSUB(T1G, T1f);
587 }
588 {
589 V T2B, T3u, T7H, T7K;
590 T2B = VADD(T29, T2A);
591 T3u = VADD(T32, T3t);
592 T3v = VADD(T2B, T3u);
593 T8b = VSUB(T3u, T2B);
594 T7H = VSUB(T29, T2A);
595 T7K = VSUB(T7I, T7J);
596 T7L = VADD(T7H, T7K);
597 T7T = VSUB(T7K, T7H);
598 }
599 {
600 V T7M, T7P, T7D, T7E;
601 T7M = VSUB(T32, T3t);
602 T7P = VSUB(T7N, T7O);
603 T7Q = VSUB(T7M, T7P);
604 T7U = VADD(T7M, T7P);
605 T7D = VADD(T6J, T6K);
606 T7E = VADD(T6P, T6Q);
607 T7F = VSUB(T7D, T7E);
608 T81 = VADD(T7D, T7E);
609 }
610 {
611 V T1I, T8a, T7W, T8c;
612 T1I = VADD(TO, T1H);
613 ST(&(ri[WS(rs, 16)]), VSUB(T1I, T3v), ms, &(ri[0]));
614 ST(&(ri[0]), VADD(T1I, T3v), ms, &(ri[0]));
615 T8a = VADD(T81, T89);
616 ST(&(ii[0]), VADD(T80, T8a), ms, &(ii[0]));
617 ST(&(ii[WS(rs, 16)]), VSUB(T8a, T80), ms, &(ii[0]));
618 T7W = VSUB(TO, T1H);
619 ST(&(ri[WS(rs, 24)]), VSUB(T7W, T7Z), ms, &(ri[0]));
620 ST(&(ri[WS(rs, 8)]), VADD(T7W, T7Z), ms, &(ri[0]));
621 T8c = VSUB(T89, T81);
622 ST(&(ii[WS(rs, 8)]), VADD(T8b, T8c), ms, &(ii[0]));
623 ST(&(ii[WS(rs, 24)]), VSUB(T8c, T8b), ms, &(ii[0]));
624 }
625 {
626 V T7G, T7R, T8f, T8g;
627 T7G = VADD(T7C, T7F);
628 T7R = VADD(T7L, T7Q);
629 ST(&(ri[WS(rs, 20)]), VFNMS(LDK(KP707106781), T7R, T7G), ms, &(ri[0]));
630 ST(&(ri[WS(rs, 4)]), VFMA(LDK(KP707106781), T7R, T7G), ms, &(ri[0]));
631 T8f = VADD(T8d, T8e);
632 T8g = VADD(T7T, T7U);
633 ST(&(ii[WS(rs, 4)]), VFMA(LDK(KP707106781), T8g, T8f), ms, &(ii[0]));
634 ST(&(ii[WS(rs, 20)]), VFNMS(LDK(KP707106781), T8g, T8f), ms, &(ii[0]));
635 }
636 {
637 V T7S, T7V, T8h, T8i;
638 T7S = VSUB(T7C, T7F);
639 T7V = VSUB(T7T, T7U);
640 ST(&(ri[WS(rs, 28)]), VFNMS(LDK(KP707106781), T7V, T7S), ms, &(ri[0]));
641 ST(&(ri[WS(rs, 12)]), VFMA(LDK(KP707106781), T7V, T7S), ms, &(ri[0]));
642 T8h = VSUB(T8e, T8d);
643 T8i = VSUB(T7Q, T7L);
644 ST(&(ii[WS(rs, 12)]), VFMA(LDK(KP707106781), T8i, T8h), ms, &(ii[0]));
645 ST(&(ii[WS(rs, 28)]), VFNMS(LDK(KP707106781), T8i, T8h), ms, &(ii[0]));
646 }
647 }
/* Store block 2: writes ri/ii at indices 2, 6, 10, ..., 30 (index = 2 mod 4). */
648 {
649 V T6I, T7m, T7w, T7A, T8l, T8r, T6T, T8m, T75, T7j, T7p, T8s, T7t, T7z, T7g;
650 V T7k;
651 {
652 V T6E, T6H, T7u, T7v;
653 T6E = VSUB(T8, Tl);
654 T6H = VSUB(T6F, T6G);
655 T6I = VSUB(T6E, T6H);
656 T7m = VADD(T6E, T6H);
657 T7u = VADD(T7b, T7e);
658 T7v = VADD(T78, T79);
659 T7w = VFNMS(LDK(KP414213562), T7v, T7u);
660 T7A = VFMA(LDK(KP414213562), T7u, T7v);
661 }
662 {
663 V T8j, T8k, T6N, T6S;
664 T8j = VSUB(TM, Tz);
665 T8k = VSUB(T87, T83);
666 T8l = VADD(T8j, T8k);
667 T8r = VSUB(T8k, T8j);
668 T6N = VSUB(T6L, T6M);
669 T6S = VADD(T6O, T6R);
670 T6T = VSUB(T6N, T6S);
671 T8m = VADD(T6N, T6S);
672 }
673 {
674 V T6Z, T74, T7n, T7o;
675 T6Z = VSUB(T6X, T6Y);
676 T74 = VSUB(T70, T73);
677 T75 = VFMA(LDK(KP414213562), T74, T6Z);
678 T7j = VFNMS(LDK(KP414213562), T6Z, T74);
679 T7n = VADD(T6M, T6L);
680 T7o = VSUB(T6O, T6R);
681 T7p = VADD(T7n, T7o);
682 T8s = VSUB(T7o, T7n);
683 }
684 {
685 V T7r, T7s, T7a, T7f;
686 T7r = VADD(T70, T73);
687 T7s = VADD(T6X, T6Y);
688 T7t = VFMA(LDK(KP414213562), T7s, T7r);
689 T7z = VFNMS(LDK(KP414213562), T7r, T7s);
690 T7a = VSUB(T78, T79);
691 T7f = VSUB(T7b, T7e);
692 T7g = VFNMS(LDK(KP414213562), T7f, T7a);
693 T7k = VFMA(LDK(KP414213562), T7a, T7f);
694 }
695 {
696 V T6U, T7h, T8t, T8u;
697 T6U = VFMA(LDK(KP707106781), T6T, T6I);
698 T7h = VSUB(T75, T7g);
699 ST(&(ri[WS(rs, 22)]), VFNMS(LDK(KP923879532), T7h, T6U), ms, &(ri[0]));
700 ST(&(ri[WS(rs, 6)]), VFMA(LDK(KP923879532), T7h, T6U), ms, &(ri[0]));
701 T8t = VFMA(LDK(KP707106781), T8s, T8r);
702 T8u = VSUB(T7k, T7j);
703 ST(&(ii[WS(rs, 6)]), VFMA(LDK(KP923879532), T8u, T8t), ms, &(ii[0]));
704 ST(&(ii[WS(rs, 22)]), VFNMS(LDK(KP923879532), T8u, T8t), ms, &(ii[0]));
705 }
706 {
707 V T7i, T7l, T8v, T8w;
708 T7i = VFNMS(LDK(KP707106781), T6T, T6I);
709 T7l = VADD(T7j, T7k);
710 ST(&(ri[WS(rs, 14)]), VFNMS(LDK(KP923879532), T7l, T7i), ms, &(ri[0]));
711 ST(&(ri[WS(rs, 30)]), VFMA(LDK(KP923879532), T7l, T7i), ms, &(ri[0]));
712 T8v = VFNMS(LDK(KP707106781), T8s, T8r);
713 T8w = VADD(T75, T7g);
714 ST(&(ii[WS(rs, 14)]), VFNMS(LDK(KP923879532), T8w, T8v), ms, &(ii[0]));
715 ST(&(ii[WS(rs, 30)]), VFMA(LDK(KP923879532), T8w, T8v), ms, &(ii[0]));
716 }
717 {
718 V T7q, T7x, T8n, T8o;
719 T7q = VFMA(LDK(KP707106781), T7p, T7m);
720 T7x = VADD(T7t, T7w);
721 ST(&(ri[WS(rs, 18)]), VFNMS(LDK(KP923879532), T7x, T7q), ms, &(ri[0]));
722 ST(&(ri[WS(rs, 2)]), VFMA(LDK(KP923879532), T7x, T7q), ms, &(ri[0]));
723 T8n = VFMA(LDK(KP707106781), T8m, T8l);
724 T8o = VADD(T7z, T7A);
725 ST(&(ii[WS(rs, 2)]), VFMA(LDK(KP923879532), T8o, T8n), ms, &(ii[0]));
726 ST(&(ii[WS(rs, 18)]), VFNMS(LDK(KP923879532), T8o, T8n), ms, &(ii[0]));
727 }
728 {
729 V T7y, T7B, T8p, T8q;
730 T7y = VFNMS(LDK(KP707106781), T7p, T7m);
731 T7B = VSUB(T7z, T7A);
732 ST(&(ri[WS(rs, 26)]), VFNMS(LDK(KP923879532), T7B, T7y), ms, &(ri[0]));
733 ST(&(ri[WS(rs, 10)]), VFMA(LDK(KP923879532), T7B, T7y), ms, &(ri[0]));
734 T8p = VFNMS(LDK(KP707106781), T8m, T8l);
735 T8q = VSUB(T7w, T7t);
736 ST(&(ii[WS(rs, 10)]), VFMA(LDK(KP923879532), T8q, T8p), ms, &(ii[0]));
737 ST(&(ii[WS(rs, 26)]), VFNMS(LDK(KP923879532), T8q, T8p), ms, &(ii[0]));
738 }
739 }
/* Store block 3: writes ri/ii at indices 1, 5, 9, ..., 29 (index = 1 mod 4). */
740 {
741 V T3S, T5C, T4n, T8C, T8B, T8H, T5F, T8I, T5w, T5Q, T5A, T5M, T4X, T5P, T5z;
742 V T5J;
743 {
744 V T3C, T3R, T5D, T5E;
745 T3C = VADD(T3w, T3B);
746 T3R = VADD(T3J, T3Q);
747 T3S = VFNMS(LDK(KP707106781), T3R, T3C);
748 T5C = VFMA(LDK(KP707106781), T3R, T3C);
749 {
750 V T47, T4m, T8z, T8A;
751 T47 = VFNMS(LDK(KP414213562), T46, T3Z);
752 T4m = VFMA(LDK(KP414213562), T4l, T4e);
753 T4n = VSUB(T47, T4m);
754 T8C = VADD(T47, T4m);
755 T8z = VSUB(T8x, T8y);
756 T8A = VADD(T5T, T5U);
757 T8B = VFMA(LDK(KP707106781), T8A, T8z);
758 T8H = VFNMS(LDK(KP707106781), T8A, T8z);
759 }
760 T5D = VFMA(LDK(KP414213562), T3Z, T46);
761 T5E = VFNMS(LDK(KP414213562), T4e, T4l);
762 T5F = VADD(T5D, T5E);
763 T8I = VSUB(T5E, T5D);
764 {
765 V T5k, T5L, T5v, T5K, T5j, T5u;
766 T5j = VADD(T5b, T5i);
767 T5k = VFNMS(LDK(KP707106781), T5j, T54);
768 T5L = VFMA(LDK(KP707106781), T5j, T54);
769 T5u = VADD(T5s, T5t);
770 T5v = VFNMS(LDK(KP707106781), T5u, T5r);
771 T5K = VFMA(LDK(KP707106781), T5u, T5r);
772 T5w = VFNMS(LDK(KP668178637), T5v, T5k);
773 T5Q = VFMA(LDK(KP198912367), T5K, T5L);
774 T5A = VFMA(LDK(KP668178637), T5k, T5v);
775 T5M = VFNMS(LDK(KP198912367), T5L, T5K);
776 }
777 {
778 V T4L, T5I, T4W, T5H, T4K, T4V;
779 T4K = VADD(T4C, T4J);
780 T4L = VFNMS(LDK(KP707106781), T4K, T4v);
781 T5I = VFMA(LDK(KP707106781), T4K, T4v);
782 T4V = VADD(T4T, T4U);
783 T4W = VFNMS(LDK(KP707106781), T4V, T4S);
784 T5H = VFMA(LDK(KP707106781), T4V, T4S);
785 T4X = VFMA(LDK(KP668178637), T4W, T4L);
786 T5P = VFNMS(LDK(KP198912367), T5H, T5I);
787 T5z = VFNMS(LDK(KP668178637), T4L, T4W);
788 T5J = VFMA(LDK(KP198912367), T5I, T5H);
789 }
790 }
791 {
792 V T4o, T5x, T8J, T8K;
793 T4o = VFMA(LDK(KP923879532), T4n, T3S);
794 T5x = VSUB(T4X, T5w);
795 ST(&(ri[WS(rs, 21)]), VFNMS(LDK(KP831469612), T5x, T4o), ms, &(ri[WS(rs, 1)]));
796 ST(&(ri[WS(rs, 5)]), VFMA(LDK(KP831469612), T5x, T4o), ms, &(ri[WS(rs, 1)]));
797 T8J = VFMA(LDK(KP923879532), T8I, T8H);
798 T8K = VSUB(T5A, T5z);
799 ST(&(ii[WS(rs, 5)]), VFMA(LDK(KP831469612), T8K, T8J), ms, &(ii[WS(rs, 1)]));
800 ST(&(ii[WS(rs, 21)]), VFNMS(LDK(KP831469612), T8K, T8J), ms, &(ii[WS(rs, 1)]));
801 }
802 {
803 V T5y, T5B, T8L, T8M;
804 T5y = VFNMS(LDK(KP923879532), T4n, T3S);
805 T5B = VADD(T5z, T5A);
806 ST(&(ri[WS(rs, 13)]), VFNMS(LDK(KP831469612), T5B, T5y), ms, &(ri[WS(rs, 1)]));
807 ST(&(ri[WS(rs, 29)]), VFMA(LDK(KP831469612), T5B, T5y), ms, &(ri[WS(rs, 1)]));
808 T8L = VFNMS(LDK(KP923879532), T8I, T8H);
809 T8M = VADD(T4X, T5w);
810 ST(&(ii[WS(rs, 13)]), VFNMS(LDK(KP831469612), T8M, T8L), ms, &(ii[WS(rs, 1)]));
811 ST(&(ii[WS(rs, 29)]), VFMA(LDK(KP831469612), T8M, T8L), ms, &(ii[WS(rs, 1)]));
812 }
813 {
814 V T5G, T5N, T8D, T8E;
815 T5G = VFMA(LDK(KP923879532), T5F, T5C);
816 T5N = VADD(T5J, T5M);
817 ST(&(ri[WS(rs, 17)]), VFNMS(LDK(KP980785280), T5N, T5G), ms, &(ri[WS(rs, 1)]));
818 ST(&(ri[WS(rs, 1)]), VFMA(LDK(KP980785280), T5N, T5G), ms, &(ri[WS(rs, 1)]));
819 T8D = VFMA(LDK(KP923879532), T8C, T8B);
820 T8E = VADD(T5P, T5Q);
821 ST(&(ii[WS(rs, 1)]), VFMA(LDK(KP980785280), T8E, T8D), ms, &(ii[WS(rs, 1)]));
822 ST(&(ii[WS(rs, 17)]), VFNMS(LDK(KP980785280), T8E, T8D), ms, &(ii[WS(rs, 1)]));
823 }
824 {
825 V T5O, T5R, T8F, T8G;
826 T5O = VFNMS(LDK(KP923879532), T5F, T5C);
827 T5R = VSUB(T5P, T5Q);
828 ST(&(ri[WS(rs, 25)]), VFNMS(LDK(KP980785280), T5R, T5O), ms, &(ri[WS(rs, 1)]));
829 ST(&(ri[WS(rs, 9)]), VFMA(LDK(KP980785280), T5R, T5O), ms, &(ri[WS(rs, 1)]));
830 T8F = VFNMS(LDK(KP923879532), T8C, T8B);
831 T8G = VSUB(T5M, T5J);
832 ST(&(ii[WS(rs, 9)]), VFMA(LDK(KP980785280), T8G, T8F), ms, &(ii[WS(rs, 1)]));
833 ST(&(ii[WS(rs, 25)]), VFNMS(LDK(KP980785280), T8G, T8F), ms, &(ii[WS(rs, 1)]));
834 }
835 }
/* Store block 4: writes ri/ii at indices 3, 7, 11, ..., 31 (index = 3 mod 4). */
836 {
837 V T5W, T6o, T63, T8W, T8P, T8V, T6r, T8Q, T6i, T6C, T6m, T6y, T6b, T6B, T6l;
838 V T6v;
839 {
840 V T5S, T5V, T6p, T6q;
841 T5S = VSUB(T3w, T3B);
842 T5V = VSUB(T5T, T5U);
843 T5W = VFMA(LDK(KP707106781), T5V, T5S);
844 T6o = VFNMS(LDK(KP707106781), T5V, T5S);
845 {
846 V T5Z, T62, T8N, T8O;
847 T5Z = VFMA(LDK(KP414213562), T5Y, T5X);
848 T62 = VFNMS(LDK(KP414213562), T61, T60);
849 T63 = VSUB(T5Z, T62);
850 T8W = VADD(T5Z, T62);
851 T8N = VADD(T8y, T8x);
852 T8O = VSUB(T3Q, T3J);
853 T8P = VFMA(LDK(KP707106781), T8O, T8N);
854 T8V = VFNMS(LDK(KP707106781), T8O, T8N);
855 }
856 T6p = VFNMS(LDK(KP414213562), T5X, T5Y);
857 T6q = VFMA(LDK(KP414213562), T60, T61);
858 T6r = VADD(T6p, T6q);
859 T8Q = VSUB(T6q, T6p);
860 {
861 V T6e, T6x, T6h, T6w, T6d, T6g;
862 T6d = VSUB(T5i, T5b);
863 T6e = VFNMS(LDK(KP707106781), T6d, T6c);
864 T6x = VFMA(LDK(KP707106781), T6d, T6c);
865 T6g = VSUB(T5s, T5t);
866 T6h = VFNMS(LDK(KP707106781), T6g, T6f);
867 T6w = VFMA(LDK(KP707106781), T6g, T6f);
868 T6i = VFNMS(LDK(KP668178637), T6h, T6e);
869 T6C = VFMA(LDK(KP198912367), T6w, T6x);
870 T6m = VFMA(LDK(KP668178637), T6e, T6h);
871 T6y = VFNMS(LDK(KP198912367), T6x, T6w);
872 }
873 {
874 V T67, T6u, T6a, T6t, T66, T69;
875 T66 = VSUB(T4J, T4C);
876 T67 = VFNMS(LDK(KP707106781), T66, T65);
877 T6u = VFMA(LDK(KP707106781), T66, T65);
878 T69 = VSUB(T4T, T4U);
879 T6a = VFNMS(LDK(KP707106781), T69, T68);
880 T6t = VFMA(LDK(KP707106781), T69, T68);
881 T6b = VFMA(LDK(KP668178637), T6a, T67);
882 T6B = VFNMS(LDK(KP198912367), T6t, T6u);
883 T6l = VFNMS(LDK(KP668178637), T67, T6a);
884 T6v = VFMA(LDK(KP198912367), T6u, T6t);
885 }
886 }
887 {
888 V T64, T6j, T8R, T8S;
889 T64 = VFMA(LDK(KP923879532), T63, T5W);
890 T6j = VADD(T6b, T6i);
891 ST(&(ri[WS(rs, 19)]), VFNMS(LDK(KP831469612), T6j, T64), ms, &(ri[WS(rs, 1)]));
892 ST(&(ri[WS(rs, 3)]), VFMA(LDK(KP831469612), T6j, T64), ms, &(ri[WS(rs, 1)]));
893 T8R = VFMA(LDK(KP923879532), T8Q, T8P);
894 T8S = VADD(T6l, T6m);
895 ST(&(ii[WS(rs, 3)]), VFMA(LDK(KP831469612), T8S, T8R), ms, &(ii[WS(rs, 1)]));
896 ST(&(ii[WS(rs, 19)]), VFNMS(LDK(KP831469612), T8S, T8R), ms, &(ii[WS(rs, 1)]));
897 }
898 {
899 V T6k, T6n, T8T, T8U;
900 T6k = VFNMS(LDK(KP923879532), T63, T5W);
901 T6n = VSUB(T6l, T6m);
902 ST(&(ri[WS(rs, 27)]), VFNMS(LDK(KP831469612), T6n, T6k), ms, &(ri[WS(rs, 1)]));
903 ST(&(ri[WS(rs, 11)]), VFMA(LDK(KP831469612), T6n, T6k), ms, &(ri[WS(rs, 1)]));
904 T8T = VFNMS(LDK(KP923879532), T8Q, T8P);
905 T8U = VSUB(T6i, T6b);
906 ST(&(ii[WS(rs, 11)]), VFMA(LDK(KP831469612), T8U, T8T), ms, &(ii[WS(rs, 1)]));
907 ST(&(ii[WS(rs, 27)]), VFNMS(LDK(KP831469612), T8U, T8T), ms, &(ii[WS(rs, 1)]));
908 }
909 {
910 V T6s, T6z, T8X, T8Y;
911 T6s = VFNMS(LDK(KP923879532), T6r, T6o);
912 T6z = VSUB(T6v, T6y);
913 ST(&(ri[WS(rs, 23)]), VFNMS(LDK(KP980785280), T6z, T6s), ms, &(ri[WS(rs, 1)]));
914 ST(&(ri[WS(rs, 7)]), VFMA(LDK(KP980785280), T6z, T6s), ms, &(ri[WS(rs, 1)]));
915 T8X = VFNMS(LDK(KP923879532), T8W, T8V);
916 T8Y = VSUB(T6C, T6B);
917 ST(&(ii[WS(rs, 7)]), VFMA(LDK(KP980785280), T8Y, T8X), ms, &(ii[WS(rs, 1)]));
918 ST(&(ii[WS(rs, 23)]), VFNMS(LDK(KP980785280), T8Y, T8X), ms, &(ii[WS(rs, 1)]));
919 }
920 {
921 V T6A, T6D, T8Z, T90;
922 T6A = VFMA(LDK(KP923879532), T6r, T6o);
923 T6D = VADD(T6B, T6C);
924 ST(&(ri[WS(rs, 15)]), VFNMS(LDK(KP980785280), T6D, T6A), ms, &(ri[WS(rs, 1)]));
925 ST(&(ri[WS(rs, 31)]), VFMA(LDK(KP980785280), T6D, T6A), ms, &(ri[WS(rs, 1)]));
926 T8Z = VFMA(LDK(KP923879532), T8W, T8V);
927 T90 = VADD(T6v, T6y);
928 ST(&(ii[WS(rs, 15)]), VFNMS(LDK(KP980785280), T90, T8Z), ms, &(ii[WS(rs, 1)]));
929 ST(&(ii[WS(rs, 31)]), VFMA(LDK(KP980785280), T90, T8Z), ms, &(ii[WS(rs, 1)]));
930 }
931 }
932 }
933 }
/* VLEAVE() is invoked once after the loop; its definition is not visible in this file. */
934 VLEAVE();
935 }
936
/*
 * Twiddle instruction table referenced by the codelet descriptor below:
 * one VTW(0, k) entry for each k = 1..31, terminated by a
 * {TW_NEXT, (2 * VL), 0} record.
 * NOTE(review): VTW and TW_NEXT are defined outside this file; presumably
 * this table describes the layout of the W array read by t1sv_32 -- confirm
 * against their definitions.
 */
937 static const tw_instr twinstr[] = {
938 VTW(0, 1),
939 VTW(0, 2),
940 VTW(0, 3),
941 VTW(0, 4),
942 VTW(0, 5),
943 VTW(0, 6),
944 VTW(0, 7),
945 VTW(0, 8),
946 VTW(0, 9),
947 VTW(0, 10),
948 VTW(0, 11),
949 VTW(0, 12),
950 VTW(0, 13),
951 VTW(0, 14),
952 VTW(0, 15),
953 VTW(0, 16),
954 VTW(0, 17),
955 VTW(0, 18),
956 VTW(0, 19),
957 VTW(0, 20),
958 VTW(0, 21),
959 VTW(0, 22),
960 VTW(0, 23),
961 VTW(0, 24),
962 VTW(0, 25),
963 VTW(0, 26),
964 VTW(0, 27),
965 VTW(0, 28),
966 VTW(0, 29),
967 VTW(0, 30),
968 VTW(0, 31),
969 {TW_NEXT, (2 * VL), 0}
970 };
971
/*
 * Codelet descriptor handed to the planner at registration time: 32 (the -n
 * value on the generator command line), the name string "t1sv_32", the
 * twinstr table, &GENUS, and {236, 62, 198, 0}, whose first three numbers
 * equal the addition / multiplication / fused multiply-add counts quoted in
 * the header comment of this FMA variant.
 * NOTE(review): ct_desc is declared elsewhere; the meaning of the three
 * trailing zeros is not visible here.
 */
972 static const ct_desc desc = { 32, XSIMD_STRING("t1sv_32"), twinstr, &GENUS, {236, 62, 198, 0}, 0, 0, 0 };
973
/*
 * Registration entry point for this codelet: passes t1sv_32 and its
 * descriptor to X(kdft_dit_register) on planner p.  The exported symbol name
 * is produced by the XSIMD() macro (defined outside this file).
 */
974 void XSIMD(codelet_t1sv_32) (planner *p) {
975 X(kdft_dit_register) (p, t1sv_32, &desc);
976 }
977 #else
978
979 /* Generated by: ../../../genfft/gen_twiddle.native -simd -compact -variables 4 -pipeline-latency 8 -n 32 -name t1sv_32 -include dft/simd/ts.h */
980
981 /*
982 * This function contains 434 FP additions, 208 FP multiplications,
983 * (or, 340 additions, 114 multiplications, 94 fused multiply/add),
984 * 96 stack variables, 7 constants, and 128 memory accesses
985 */
986 #include "dft/simd/ts.h"
987
/*
 * t1sv_32 -- size-32 twiddle codelet, non-FMA variant (this is the #else
 * branch; the generator command line has "-n 32" and no "-fma").
 *
 *   ri, ii  real and imaginary data; read at WS(rs, 0..31) and overwritten
 *           in place at the same offsets with the results.
 *   W       twiddle table; advanced by 62 entries per unit of m, i.e. one
 *           pair W[TWVL*(2k-2)], W[TWVL*(2k-1)] for each input k = 1..31.
 *   rs      stride between the 32 points, always used through WS(rs, k).
 *   mb, me  half-open range of the loop variable m, stepped by 2*VL.
 *   ms      stride applied to ri/ii between iterations ((2*VL)*ms per step).
 *
 * Structure of one iteration: eight load blocks each read four inputs,
 * multiply them by their twiddle pair and form sums/differences; five
 * store blocks then combine those intermediates and write all 32 outputs.
 *
 * NOTE(review): the arithmetic comments below assume the usual FFTW macro
 * meanings VFMA(a,b,c) = a*b + c and VFNMS(a,b,c) = c - a*b; the macros are
 * defined outside this file -- confirm.
 */
988 static void t1sv_32(R *ri, R *ii, const R *W, stride rs, INT mb, INT me, INT ms)
989 {
/* Constants by value: sin/cos of pi/16, sin/cos of 3*pi/16, sin/cos of pi/8, and sqrt(1/2). */
990 DVK(KP195090322, +0.195090322016128267848284868477022240927691618);
991 DVK(KP980785280, +0.980785280403230449126182236134239036973933731);
992 DVK(KP555570233, +0.555570233019602224742830813948532874374937191);
993 DVK(KP831469612, +0.831469612302545237078788377617905756738560812);
994 DVK(KP382683432, +0.382683432365089771728459984030398866761344562);
995 DVK(KP923879532, +0.923879532511286756128183189396788286822416626);
996 DVK(KP707106781, +0.707106781186547524400844362104849039284835938);
997 {
998 INT m;
/* W is first offset by mb*62; each pass then advances m by 2*VL, ri/ii by (2*VL)*ms and W by (2*VL)*62. */
999 for (m = mb, W = W + (mb * 62); m < me; m = m + (2 * VL), ri = ri + ((2 * VL) * ms), ii = ii + ((2 * VL) * ms), W = W + ((2 * VL) * 62), MAKE_VOLATILE_STRIDE(64, rs)) {
/* Intermediates shared between the load blocks and the store blocks below. */
1000 V Tj, T5F, T7C, T7Q, T35, T4T, T78, T7m, T1Q, T61, T5Y, T6J, T3K, T59, T41;
1001 V T56, T2B, T67, T6e, T6O, T4b, T5d, T4s, T5g, TG, T7l, T5I, T73, T3a, T4U;
1002 V T3f, T4V, T14, T5N, T5M, T6E, T3m, T4Y, T3r, T4Z, T1r, T5P, T5S, T6F, T3x;
1003 V T51, T3C, T52, T2d, T5Z, T64, T6K, T3V, T57, T44, T5a, T2Y, T6f, T6a, T6P;
1004 V T4m, T5h, T4v, T5e;
/* Load block: inputs 0, 16, 8, 24.  Input 0 is used as loaded; the others are twiddled. */
1005 {
1006 V T1, T76, T6, T75, Tc, T32, Th, T33;
1007 T1 = LD(&(ri[0]), ms, &(ri[0]));
1008 T76 = LD(&(ii[0]), ms, &(ii[0]));
1009 {
1010 V T3, T5, T2, T4;
1011 T3 = LD(&(ri[WS(rs, 16)]), ms, &(ri[0]));
1012 T5 = LD(&(ii[WS(rs, 16)]), ms, &(ii[0]));
1013 T2 = LDW(&(W[TWVL * 30]));
1014 T4 = LDW(&(W[TWVL * 31]));
/* Twiddle multiply (same pattern for every input k >= 1): re = w0*xr + w1*xi, im = w0*xi - w1*xr. */
1015 T6 = VFMA(T2, T3, VMUL(T4, T5));
1016 T75 = VFNMS(T4, T3, VMUL(T2, T5));
1017 }
1018 {
1019 V T9, Tb, T8, Ta;
1020 T9 = LD(&(ri[WS(rs, 8)]), ms, &(ri[0]));
1021 Tb = LD(&(ii[WS(rs, 8)]), ms, &(ii[0]));
1022 T8 = LDW(&(W[TWVL * 14]));
1023 Ta = LDW(&(W[TWVL * 15]));
1024 Tc = VFMA(T8, T9, VMUL(Ta, Tb));
1025 T32 = VFNMS(Ta, T9, VMUL(T8, Tb));
1026 }
1027 {
1028 V Te, Tg, Td, Tf;
1029 Te = LD(&(ri[WS(rs, 24)]), ms, &(ri[0]));
1030 Tg = LD(&(ii[WS(rs, 24)]), ms, &(ii[0]));
1031 Td = LDW(&(W[TWVL * 46]));
1032 Tf = LDW(&(W[TWVL * 47]));
1033 Th = VFMA(Td, Te, VMUL(Tf, Tg));
1034 T33 = VFNMS(Tf, Te, VMUL(Td, Tg));
1035 }
1036 {
1037 V T7, Ti, T7A, T7B;
1038 T7 = VADD(T1, T6);
1039 Ti = VADD(Tc, Th);
1040 Tj = VADD(T7, Ti);
1041 T5F = VSUB(T7, Ti);
1042 T7A = VSUB(T76, T75);
1043 T7B = VSUB(Tc, Th);
1044 T7C = VSUB(T7A, T7B);
1045 T7Q = VADD(T7B, T7A);
1046 }
1047 {
1048 V T31, T34, T74, T77;
1049 T31 = VSUB(T1, T6);
1050 T34 = VSUB(T32, T33);
1051 T35 = VSUB(T31, T34);
1052 T4T = VADD(T31, T34);
1053 T74 = VADD(T32, T33);
1054 T77 = VADD(T75, T76);
1055 T78 = VADD(T74, T77);
1056 T7m = VSUB(T77, T74);
1057 }
1058 }
/* Load block: inputs 1, 25, 17, 9. */
1059 {
1060 V T1y, T3G, T1O, T3Z, T1D, T3H, T1J, T3Y;
1061 {
1062 V T1v, T1x, T1u, T1w;
1063 T1v = LD(&(ri[WS(rs, 1)]), ms, &(ri[WS(rs, 1)]));
1064 T1x = LD(&(ii[WS(rs, 1)]), ms, &(ii[WS(rs, 1)]));
1065 T1u = LDW(&(W[0]));
1066 T1w = LDW(&(W[TWVL * 1]));
1067 T1y = VFMA(T1u, T1v, VMUL(T1w, T1x));
1068 T3G = VFNMS(T1w, T1v, VMUL(T1u, T1x));
1069 }
1070 {
1071 V T1L, T1N, T1K, T1M;
1072 T1L = LD(&(ri[WS(rs, 25)]), ms, &(ri[WS(rs, 1)]));
1073 T1N = LD(&(ii[WS(rs, 25)]), ms, &(ii[WS(rs, 1)]));
1074 T1K = LDW(&(W[TWVL * 48]));
1075 T1M = LDW(&(W[TWVL * 49]));
1076 T1O = VFMA(T1K, T1L, VMUL(T1M, T1N));
1077 T3Z = VFNMS(T1M, T1L, VMUL(T1K, T1N));
1078 }
1079 {
1080 V T1A, T1C, T1z, T1B;
1081 T1A = LD(&(ri[WS(rs, 17)]), ms, &(ri[WS(rs, 1)]));
1082 T1C = LD(&(ii[WS(rs, 17)]), ms, &(ii[WS(rs, 1)]));
1083 T1z = LDW(&(W[TWVL * 32]));
1084 T1B = LDW(&(W[TWVL * 33]));
1085 T1D = VFMA(T1z, T1A, VMUL(T1B, T1C));
1086 T3H = VFNMS(T1B, T1A, VMUL(T1z, T1C));
1087 }
1088 {
1089 V T1G, T1I, T1F, T1H;
1090 T1G = LD(&(ri[WS(rs, 9)]), ms, &(ri[WS(rs, 1)]));
1091 T1I = LD(&(ii[WS(rs, 9)]), ms, &(ii[WS(rs, 1)]));
1092 T1F = LDW(&(W[TWVL * 16]));
1093 T1H = LDW(&(W[TWVL * 17]));
1094 T1J = VFMA(T1F, T1G, VMUL(T1H, T1I));
1095 T3Y = VFNMS(T1H, T1G, VMUL(T1F, T1I));
1096 }
1097 {
1098 V T1E, T1P, T5W, T5X;
1099 T1E = VADD(T1y, T1D);
1100 T1P = VADD(T1J, T1O);
1101 T1Q = VADD(T1E, T1P);
1102 T61 = VSUB(T1E, T1P);
1103 T5W = VADD(T3G, T3H);
1104 T5X = VADD(T3Y, T3Z);
1105 T5Y = VSUB(T5W, T5X);
1106 T6J = VADD(T5W, T5X);
1107 }
1108 {
1109 V T3I, T3J, T3X, T40;
1110 T3I = VSUB(T3G, T3H);
1111 T3J = VSUB(T1J, T1O);
1112 T3K = VADD(T3I, T3J);
1113 T59 = VSUB(T3I, T3J);
1114 T3X = VSUB(T1y, T1D);
1115 T40 = VSUB(T3Y, T3Z);
1116 T41 = VSUB(T3X, T40);
1117 T56 = VADD(T3X, T40);
1118 }
1119 }
/* Load block: inputs 31, 23, 15, 7. */
1120 {
1121 V T2j, T4o, T2z, T49, T2o, T4p, T2u, T48;
1122 {
1123 V T2g, T2i, T2f, T2h;
1124 T2g = LD(&(ri[WS(rs, 31)]), ms, &(ri[WS(rs, 1)]));
1125 T2i = LD(&(ii[WS(rs, 31)]), ms, &(ii[WS(rs, 1)]));
1126 T2f = LDW(&(W[TWVL * 60]));
1127 T2h = LDW(&(W[TWVL * 61]));
1128 T2j = VFMA(T2f, T2g, VMUL(T2h, T2i));
1129 T4o = VFNMS(T2h, T2g, VMUL(T2f, T2i));
1130 }
1131 {
1132 V T2w, T2y, T2v, T2x;
1133 T2w = LD(&(ri[WS(rs, 23)]), ms, &(ri[WS(rs, 1)]));
1134 T2y = LD(&(ii[WS(rs, 23)]), ms, &(ii[WS(rs, 1)]));
1135 T2v = LDW(&(W[TWVL * 44]));
1136 T2x = LDW(&(W[TWVL * 45]));
1137 T2z = VFMA(T2v, T2w, VMUL(T2x, T2y));
1138 T49 = VFNMS(T2x, T2w, VMUL(T2v, T2y));
1139 }
1140 {
1141 V T2l, T2n, T2k, T2m;
1142 T2l = LD(&(ri[WS(rs, 15)]), ms, &(ri[WS(rs, 1)]));
1143 T2n = LD(&(ii[WS(rs, 15)]), ms, &(ii[WS(rs, 1)]));
1144 T2k = LDW(&(W[TWVL * 28]));
1145 T2m = LDW(&(W[TWVL * 29]));
1146 T2o = VFMA(T2k, T2l, VMUL(T2m, T2n));
1147 T4p = VFNMS(T2m, T2l, VMUL(T2k, T2n));
1148 }
1149 {
1150 V T2r, T2t, T2q, T2s;
1151 T2r = LD(&(ri[WS(rs, 7)]), ms, &(ri[WS(rs, 1)]));
1152 T2t = LD(&(ii[WS(rs, 7)]), ms, &(ii[WS(rs, 1)]));
1153 T2q = LDW(&(W[TWVL * 12]));
1154 T2s = LDW(&(W[TWVL * 13]));
1155 T2u = VFMA(T2q, T2r, VMUL(T2s, T2t));
1156 T48 = VFNMS(T2s, T2r, VMUL(T2q, T2t));
1157 }
1158 {
1159 V T2p, T2A, T6c, T6d;
1160 T2p = VADD(T2j, T2o);
1161 T2A = VADD(T2u, T2z);
1162 T2B = VADD(T2p, T2A);
1163 T67 = VSUB(T2p, T2A);
1164 T6c = VADD(T4o, T4p);
1165 T6d = VADD(T48, T49);
1166 T6e = VSUB(T6c, T6d);
1167 T6O = VADD(T6c, T6d);
1168 }
1169 {
1170 V T47, T4a, T4q, T4r;
1171 T47 = VSUB(T2j, T2o);
1172 T4a = VSUB(T48, T49);
1173 T4b = VSUB(T47, T4a);
1174 T5d = VADD(T47, T4a);
1175 T4q = VSUB(T4o, T4p);
1176 T4r = VSUB(T2u, T2z);
1177 T4s = VADD(T4q, T4r);
1178 T5g = VSUB(T4q, T4r);
1179 }
1180 }
/* Load block: inputs 4, 12, 20, 28. */
1181 {
1182 V To, T36, TE, T3d, Tt, T37, Tz, T3c;
1183 {
1184 V Tl, Tn, Tk, Tm;
1185 Tl = LD(&(ri[WS(rs, 4)]), ms, &(ri[0]));
1186 Tn = LD(&(ii[WS(rs, 4)]), ms, &(ii[0]));
1187 Tk = LDW(&(W[TWVL * 6]));
1188 Tm = LDW(&(W[TWVL * 7]));
1189 To = VFMA(Tk, Tl, VMUL(Tm, Tn));
1190 T36 = VFNMS(Tm, Tl, VMUL(Tk, Tn));
1191 }
1192 {
1193 V TB, TD, TA, TC;
1194 TB = LD(&(ri[WS(rs, 12)]), ms, &(ri[0]));
1195 TD = LD(&(ii[WS(rs, 12)]), ms, &(ii[0]));
1196 TA = LDW(&(W[TWVL * 22]));
1197 TC = LDW(&(W[TWVL * 23]));
1198 TE = VFMA(TA, TB, VMUL(TC, TD));
1199 T3d = VFNMS(TC, TB, VMUL(TA, TD));
1200 }
1201 {
1202 V Tq, Ts, Tp, Tr;
1203 Tq = LD(&(ri[WS(rs, 20)]), ms, &(ri[0]));
1204 Ts = LD(&(ii[WS(rs, 20)]), ms, &(ii[0]));
1205 Tp = LDW(&(W[TWVL * 38]));
1206 Tr = LDW(&(W[TWVL * 39]));
1207 Tt = VFMA(Tp, Tq, VMUL(Tr, Ts));
1208 T37 = VFNMS(Tr, Tq, VMUL(Tp, Ts));
1209 }
1210 {
1211 V Tw, Ty, Tv, Tx;
1212 Tw = LD(&(ri[WS(rs, 28)]), ms, &(ri[0]));
1213 Ty = LD(&(ii[WS(rs, 28)]), ms, &(ii[0]));
1214 Tv = LDW(&(W[TWVL * 54]));
1215 Tx = LDW(&(W[TWVL * 55]));
1216 Tz = VFMA(Tv, Tw, VMUL(Tx, Ty));
1217 T3c = VFNMS(Tx, Tw, VMUL(Tv, Ty));
1218 }
1219 {
1220 V Tu, TF, T5G, T5H;
1221 Tu = VADD(To, Tt);
1222 TF = VADD(Tz, TE);
1223 TG = VADD(Tu, TF);
1224 T7l = VSUB(TF, Tu);
1225 T5G = VADD(T36, T37);
1226 T5H = VADD(T3c, T3d);
1227 T5I = VSUB(T5G, T5H);
1228 T73 = VADD(T5G, T5H);
1229 }
1230 {
1231 V T38, T39, T3b, T3e;
1232 T38 = VSUB(T36, T37);
1233 T39 = VSUB(To, Tt);
1234 T3a = VSUB(T38, T39);
1235 T4U = VADD(T39, T38);
1236 T3b = VSUB(Tz, TE);
1237 T3e = VSUB(T3c, T3d);
1238 T3f = VADD(T3b, T3e);
1239 T4V = VSUB(T3b, T3e);
1240 }
1241 }
/* Load block: inputs 2, 26, 18, 10. */
1242 {
1243 V TM, T3i, T12, T3p, TR, T3j, TX, T3o;
1244 {
1245 V TJ, TL, TI, TK;
1246 TJ = LD(&(ri[WS(rs, 2)]), ms, &(ri[0]));
1247 TL = LD(&(ii[WS(rs, 2)]), ms, &(ii[0]));
1248 TI = LDW(&(W[TWVL * 2]));
1249 TK = LDW(&(W[TWVL * 3]));
1250 TM = VFMA(TI, TJ, VMUL(TK, TL));
1251 T3i = VFNMS(TK, TJ, VMUL(TI, TL));
1252 }
1253 {
1254 V TZ, T11, TY, T10;
1255 TZ = LD(&(ri[WS(rs, 26)]), ms, &(ri[0]));
1256 T11 = LD(&(ii[WS(rs, 26)]), ms, &(ii[0]));
1257 TY = LDW(&(W[TWVL * 50]));
1258 T10 = LDW(&(W[TWVL * 51]));
1259 T12 = VFMA(TY, TZ, VMUL(T10, T11));
1260 T3p = VFNMS(T10, TZ, VMUL(TY, T11));
1261 }
1262 {
1263 V TO, TQ, TN, TP;
1264 TO = LD(&(ri[WS(rs, 18)]), ms, &(ri[0]));
1265 TQ = LD(&(ii[WS(rs, 18)]), ms, &(ii[0]));
1266 TN = LDW(&(W[TWVL * 34]));
1267 TP = LDW(&(W[TWVL * 35]));
1268 TR = VFMA(TN, TO, VMUL(TP, TQ));
1269 T3j = VFNMS(TP, TO, VMUL(TN, TQ));
1270 }
1271 {
1272 V TU, TW, TT, TV;
1273 TU = LD(&(ri[WS(rs, 10)]), ms, &(ri[0]));
1274 TW = LD(&(ii[WS(rs, 10)]), ms, &(ii[0]));
1275 TT = LDW(&(W[TWVL * 18]));
1276 TV = LDW(&(W[TWVL * 19]));
1277 TX = VFMA(TT, TU, VMUL(TV, TW));
1278 T3o = VFNMS(TV, TU, VMUL(TT, TW));
1279 }
1280 {
1281 V TS, T13, T5K, T5L;
1282 TS = VADD(TM, TR);
1283 T13 = VADD(TX, T12);
1284 T14 = VADD(TS, T13);
1285 T5N = VSUB(TS, T13);
1286 T5K = VADD(T3i, T3j);
1287 T5L = VADD(T3o, T3p);
1288 T5M = VSUB(T5K, T5L);
1289 T6E = VADD(T5K, T5L);
1290 }
1291 {
1292 V T3k, T3l, T3n, T3q;
1293 T3k = VSUB(T3i, T3j);
1294 T3l = VSUB(TX, T12);
1295 T3m = VADD(T3k, T3l);
1296 T4Y = VSUB(T3k, T3l);
1297 T3n = VSUB(TM, TR);
1298 T3q = VSUB(T3o, T3p);
1299 T3r = VSUB(T3n, T3q);
1300 T4Z = VADD(T3n, T3q);
1301 }
1302 }
/* Load block: inputs 30, 22, 14, 6. */
1303 {
1304 V T19, T3t, T1p, T3A, T1e, T3u, T1k, T3z;
1305 {
1306 V T16, T18, T15, T17;
1307 T16 = LD(&(ri[WS(rs, 30)]), ms, &(ri[0]));
1308 T18 = LD(&(ii[WS(rs, 30)]), ms, &(ii[0]));
1309 T15 = LDW(&(W[TWVL * 58]));
1310 T17 = LDW(&(W[TWVL * 59]));
1311 T19 = VFMA(T15, T16, VMUL(T17, T18));
1312 T3t = VFNMS(T17, T16, VMUL(T15, T18));
1313 }
1314 {
1315 V T1m, T1o, T1l, T1n;
1316 T1m = LD(&(ri[WS(rs, 22)]), ms, &(ri[0]));
1317 T1o = LD(&(ii[WS(rs, 22)]), ms, &(ii[0]));
1318 T1l = LDW(&(W[TWVL * 42]));
1319 T1n = LDW(&(W[TWVL * 43]));
1320 T1p = VFMA(T1l, T1m, VMUL(T1n, T1o));
1321 T3A = VFNMS(T1n, T1m, VMUL(T1l, T1o));
1322 }
1323 {
1324 V T1b, T1d, T1a, T1c;
1325 T1b = LD(&(ri[WS(rs, 14)]), ms, &(ri[0]));
1326 T1d = LD(&(ii[WS(rs, 14)]), ms, &(ii[0]));
1327 T1a = LDW(&(W[TWVL * 26]));
1328 T1c = LDW(&(W[TWVL * 27]));
1329 T1e = VFMA(T1a, T1b, VMUL(T1c, T1d));
1330 T3u = VFNMS(T1c, T1b, VMUL(T1a, T1d));
1331 }
1332 {
1333 V T1h, T1j, T1g, T1i;
1334 T1h = LD(&(ri[WS(rs, 6)]), ms, &(ri[0]));
1335 T1j = LD(&(ii[WS(rs, 6)]), ms, &(ii[0]));
1336 T1g = LDW(&(W[TWVL * 10]));
1337 T1i = LDW(&(W[TWVL * 11]));
1338 T1k = VFMA(T1g, T1h, VMUL(T1i, T1j));
1339 T3z = VFNMS(T1i, T1h, VMUL(T1g, T1j));
1340 }
1341 {
1342 V T1f, T1q, T5Q, T5R;
1343 T1f = VADD(T19, T1e);
1344 T1q = VADD(T1k, T1p);
1345 T1r = VADD(T1f, T1q);
1346 T5P = VSUB(T1f, T1q);
1347 T5Q = VADD(T3t, T3u);
1348 T5R = VADD(T3z, T3A);
1349 T5S = VSUB(T5Q, T5R);
1350 T6F = VADD(T5Q, T5R);
1351 }
1352 {
1353 V T3v, T3w, T3y, T3B;
1354 T3v = VSUB(T3t, T3u);
1355 T3w = VSUB(T1k, T1p);
1356 T3x = VADD(T3v, T3w);
1357 T51 = VSUB(T3v, T3w);
1358 T3y = VSUB(T19, T1e);
1359 T3B = VSUB(T3z, T3A);
1360 T3C = VSUB(T3y, T3B);
1361 T52 = VADD(T3y, T3B);
1362 }
1363 }
/* Load block: inputs 5, 21, 29, 13; the four combined results are scaled by KP707106781. */
1364 {
1365 V T1V, T3R, T20, T3S, T3Q, T3T, T26, T3M, T2b, T3N, T3L, T3O;
1366 {
1367 V T1S, T1U, T1R, T1T;
1368 T1S = LD(&(ri[WS(rs, 5)]), ms, &(ri[WS(rs, 1)]));
1369 T1U = LD(&(ii[WS(rs, 5)]), ms, &(ii[WS(rs, 1)]));
1370 T1R = LDW(&(W[TWVL * 8]));
1371 T1T = LDW(&(W[TWVL * 9]));
1372 T1V = VFMA(T1R, T1S, VMUL(T1T, T1U));
1373 T3R = VFNMS(T1T, T1S, VMUL(T1R, T1U));
1374 }
1375 {
1376 V T1X, T1Z, T1W, T1Y;
1377 T1X = LD(&(ri[WS(rs, 21)]), ms, &(ri[WS(rs, 1)]));
1378 T1Z = LD(&(ii[WS(rs, 21)]), ms, &(ii[WS(rs, 1)]));
1379 T1W = LDW(&(W[TWVL * 40]));
1380 T1Y = LDW(&(W[TWVL * 41]));
1381 T20 = VFMA(T1W, T1X, VMUL(T1Y, T1Z));
1382 T3S = VFNMS(T1Y, T1X, VMUL(T1W, T1Z));
1383 }
1384 T3Q = VSUB(T1V, T20);
1385 T3T = VSUB(T3R, T3S);
1386 {
1387 V T23, T25, T22, T24;
1388 T23 = LD(&(ri[WS(rs, 29)]), ms, &(ri[WS(rs, 1)]));
1389 T25 = LD(&(ii[WS(rs, 29)]), ms, &(ii[WS(rs, 1)]));
1390 T22 = LDW(&(W[TWVL * 56]));
1391 T24 = LDW(&(W[TWVL * 57]));
1392 T26 = VFMA(T22, T23, VMUL(T24, T25));
1393 T3M = VFNMS(T24, T23, VMUL(T22, T25));
1394 }
1395 {
1396 V T28, T2a, T27, T29;
1397 T28 = LD(&(ri[WS(rs, 13)]), ms, &(ri[WS(rs, 1)]));
1398 T2a = LD(&(ii[WS(rs, 13)]), ms, &(ii[WS(rs, 1)]));
1399 T27 = LDW(&(W[TWVL * 24]));
1400 T29 = LDW(&(W[TWVL * 25]));
1401 T2b = VFMA(T27, T28, VMUL(T29, T2a));
1402 T3N = VFNMS(T29, T28, VMUL(T27, T2a));
1403 }
1404 T3L = VSUB(T26, T2b);
1405 T3O = VSUB(T3M, T3N);
1406 {
1407 V T21, T2c, T62, T63;
1408 T21 = VADD(T1V, T20);
1409 T2c = VADD(T26, T2b);
1410 T2d = VADD(T21, T2c);
1411 T5Z = VSUB(T2c, T21);
1412 T62 = VADD(T3R, T3S);
1413 T63 = VADD(T3M, T3N);
1414 T64 = VSUB(T62, T63);
1415 T6K = VADD(T62, T63);
1416 }
1417 {
1418 V T3P, T3U, T42, T43;
1419 T3P = VSUB(T3L, T3O);
1420 T3U = VADD(T3Q, T3T);
1421 T3V = VMUL(LDK(KP707106781), VSUB(T3P, T3U));
1422 T57 = VMUL(LDK(KP707106781), VADD(T3U, T3P));
1423 T42 = VSUB(T3T, T3Q);
1424 T43 = VADD(T3L, T3O);
1425 T44 = VMUL(LDK(KP707106781), VSUB(T42, T43));
1426 T5a = VMUL(LDK(KP707106781), VADD(T42, T43));
1427 }
1428 }
/* Load block: inputs 3, 19, 27, 11; the four combined results are scaled by KP707106781. */
1429 {
1430 V T2G, T4c, T2L, T4d, T4e, T4f, T2R, T4i, T2W, T4j, T4h, T4k;
1431 {
1432 V T2D, T2F, T2C, T2E;
1433 T2D = LD(&(ri[WS(rs, 3)]), ms, &(ri[WS(rs, 1)]));
1434 T2F = LD(&(ii[WS(rs, 3)]), ms, &(ii[WS(rs, 1)]));
1435 T2C = LDW(&(W[TWVL * 4]));
1436 T2E = LDW(&(W[TWVL * 5]));
1437 T2G = VFMA(T2C, T2D, VMUL(T2E, T2F));
1438 T4c = VFNMS(T2E, T2D, VMUL(T2C, T2F));
1439 }
1440 {
1441 V T2I, T2K, T2H, T2J;
1442 T2I = LD(&(ri[WS(rs, 19)]), ms, &(ri[WS(rs, 1)]));
1443 T2K = LD(&(ii[WS(rs, 19)]), ms, &(ii[WS(rs, 1)]));
1444 T2H = LDW(&(W[TWVL * 36]));
1445 T2J = LDW(&(W[TWVL * 37]));
1446 T2L = VFMA(T2H, T2I, VMUL(T2J, T2K));
1447 T4d = VFNMS(T2J, T2I, VMUL(T2H, T2K));
1448 }
1449 T4e = VSUB(T4c, T4d);
1450 T4f = VSUB(T2G, T2L);
1451 {
1452 V T2O, T2Q, T2N, T2P;
1453 T2O = LD(&(ri[WS(rs, 27)]), ms, &(ri[WS(rs, 1)]));
1454 T2Q = LD(&(ii[WS(rs, 27)]), ms, &(ii[WS(rs, 1)]));
1455 T2N = LDW(&(W[TWVL * 52]));
1456 T2P = LDW(&(W[TWVL * 53]));
1457 T2R = VFMA(T2N, T2O, VMUL(T2P, T2Q));
1458 T4i = VFNMS(T2P, T2O, VMUL(T2N, T2Q));
1459 }
1460 {
1461 V T2T, T2V, T2S, T2U;
1462 T2T = LD(&(ri[WS(rs, 11)]), ms, &(ri[WS(rs, 1)]));
1463 T2V = LD(&(ii[WS(rs, 11)]), ms, &(ii[WS(rs, 1)]));
1464 T2S = LDW(&(W[TWVL * 20]));
1465 T2U = LDW(&(W[TWVL * 21]));
1466 T2W = VFMA(T2S, T2T, VMUL(T2U, T2V));
1467 T4j = VFNMS(T2U, T2T, VMUL(T2S, T2V));
1468 }
1469 T4h = VSUB(T2R, T2W);
1470 T4k = VSUB(T4i, T4j);
1471 {
1472 V T2M, T2X, T68, T69;
1473 T2M = VADD(T2G, T2L);
1474 T2X = VADD(T2R, T2W);
1475 T2Y = VADD(T2M, T2X);
1476 T6f = VSUB(T2X, T2M);
1477 T68 = VADD(T4c, T4d);
1478 T69 = VADD(T4i, T4j);
1479 T6a = VSUB(T68, T69);
1480 T6P = VADD(T68, T69);
1481 }
1482 {
1483 V T4g, T4l, T4t, T4u;
1484 T4g = VSUB(T4e, T4f);
1485 T4l = VADD(T4h, T4k);
1486 T4m = VMUL(LDK(KP707106781), VSUB(T4g, T4l));
1487 T5h = VMUL(LDK(KP707106781), VADD(T4g, T4l));
1488 T4t = VSUB(T4h, T4k);
1489 T4u = VADD(T4f, T4e);
1490 T4v = VMUL(LDK(KP707106781), VSUB(T4t, T4u));
1491 T5e = VMUL(LDK(KP707106781), VADD(T4u, T4t));
1492 }
1493 }
/* Store block: outputs 16, 0, 24, 8 (pure additions/subtractions, no constant multiplies). */
1494 {
1495 V T1t, T6X, T7a, T7c, T30, T7b, T70, T71;
1496 {
1497 V TH, T1s, T72, T79;
1498 TH = VADD(Tj, TG);
1499 T1s = VADD(T14, T1r);
1500 T1t = VADD(TH, T1s);
1501 T6X = VSUB(TH, T1s);
1502 T72 = VADD(T6E, T6F);
1503 T79 = VADD(T73, T78);
1504 T7a = VADD(T72, T79);
1505 T7c = VSUB(T79, T72);
1506 }
1507 {
1508 V T2e, T2Z, T6Y, T6Z;
1509 T2e = VADD(T1Q, T2d);
1510 T2Z = VADD(T2B, T2Y);
1511 T30 = VADD(T2e, T2Z);
1512 T7b = VSUB(T2Z, T2e);
1513 T6Y = VADD(T6J, T6K);
1514 T6Z = VADD(T6O, T6P);
1515 T70 = VSUB(T6Y, T6Z);
1516 T71 = VADD(T6Y, T6Z);
1517 }
1518 ST(&(ri[WS(rs, 16)]), VSUB(T1t, T30), ms, &(ri[0]));
1519 ST(&(ii[WS(rs, 16)]), VSUB(T7a, T71), ms, &(ii[0]));
1520 ST(&(ri[0]), VADD(T1t, T30), ms, &(ri[0]));
1521 ST(&(ii[0]), VADD(T71, T7a), ms, &(ii[0]));
1522 ST(&(ri[WS(rs, 24)]), VSUB(T6X, T70), ms, &(ri[0]));
1523 ST(&(ii[WS(rs, 24)]), VSUB(T7c, T7b), ms, &(ii[0]));
1524 ST(&(ri[WS(rs, 8)]), VADD(T6X, T70), ms, &(ri[0]));
1525 ST(&(ii[WS(rs, 8)]), VADD(T7b, T7c), ms, &(ii[0]));
1526 }
/* Store block: outputs 20, 4, 28, 12 (uses KP707106781 only). */
1527 {
1528 V T6H, T6T, T7g, T7i, T6M, T6U, T6R, T6V;
1529 {
1530 V T6D, T6G, T7e, T7f;
1531 T6D = VSUB(Tj, TG);
1532 T6G = VSUB(T6E, T6F);
1533 T6H = VADD(T6D, T6G);
1534 T6T = VSUB(T6D, T6G);
1535 T7e = VSUB(T1r, T14);
1536 T7f = VSUB(T78, T73);
1537 T7g = VADD(T7e, T7f);
1538 T7i = VSUB(T7f, T7e);
1539 }
1540 {
1541 V T6I, T6L, T6N, T6Q;
1542 T6I = VSUB(T1Q, T2d);
1543 T6L = VSUB(T6J, T6K);
1544 T6M = VADD(T6I, T6L);
1545 T6U = VSUB(T6L, T6I);
1546 T6N = VSUB(T2B, T2Y);
1547 T6Q = VSUB(T6O, T6P);
1548 T6R = VSUB(T6N, T6Q);
1549 T6V = VADD(T6N, T6Q);
1550 }
1551 {
1552 V T6S, T7d, T6W, T7h;
1553 T6S = VMUL(LDK(KP707106781), VADD(T6M, T6R));
1554 ST(&(ri[WS(rs, 20)]), VSUB(T6H, T6S), ms, &(ri[0]));
1555 ST(&(ri[WS(rs, 4)]), VADD(T6H, T6S), ms, &(ri[0]));
1556 T7d = VMUL(LDK(KP707106781), VADD(T6U, T6V));
1557 ST(&(ii[WS(rs, 4)]), VADD(T7d, T7g), ms, &(ii[0]));
1558 ST(&(ii[WS(rs, 20)]), VSUB(T7g, T7d), ms, &(ii[0]));
1559 T6W = VMUL(LDK(KP707106781), VSUB(T6U, T6V));
1560 ST(&(ri[WS(rs, 28)]), VSUB(T6T, T6W), ms, &(ri[0]));
1561 ST(&(ri[WS(rs, 12)]), VADD(T6T, T6W), ms, &(ri[0]));
1562 T7h = VMUL(LDK(KP707106781), VSUB(T6R, T6M));
1563 ST(&(ii[WS(rs, 12)]), VADD(T7h, T7i), ms, &(ii[0]));
1564 ST(&(ii[WS(rs, 28)]), VSUB(T7i, T7h), ms, &(ii[0]));
1565 }
1566 }
/* Store block: outputs 22, 6, 30, 14, 18, 2, 26, 10 (uses KP707106781, KP923879532, KP382683432). */
1567 {
1568 V T5J, T7n, T7t, T6n, T5U, T7k, T6x, T6B, T6q, T7s, T66, T6k, T6u, T6A, T6h;
1569 V T6l;
1570 {
1571 V T5O, T5T, T60, T65;
1572 T5J = VSUB(T5F, T5I);
1573 T7n = VADD(T7l, T7m);
1574 T7t = VSUB(T7m, T7l);
1575 T6n = VADD(T5F, T5I);
1576 T5O = VSUB(T5M, T5N);
1577 T5T = VADD(T5P, T5S);
1578 T5U = VMUL(LDK(KP707106781), VSUB(T5O, T5T));
1579 T7k = VMUL(LDK(KP707106781), VADD(T5O, T5T));
1580 {
1581 V T6v, T6w, T6o, T6p;
1582 T6v = VADD(T67, T6a);
1583 T6w = VADD(T6e, T6f);
1584 T6x = VFNMS(LDK(KP382683432), T6w, VMUL(LDK(KP923879532), T6v));
1585 T6B = VFMA(LDK(KP923879532), T6w, VMUL(LDK(KP382683432), T6v));
1586 T6o = VADD(T5N, T5M);
1587 T6p = VSUB(T5P, T5S);
1588 T6q = VMUL(LDK(KP707106781), VADD(T6o, T6p));
1589 T7s = VMUL(LDK(KP707106781), VSUB(T6p, T6o));
1590 }
1591 T60 = VSUB(T5Y, T5Z);
1592 T65 = VSUB(T61, T64);
1593 T66 = VFMA(LDK(KP923879532), T60, VMUL(LDK(KP382683432), T65));
1594 T6k = VFNMS(LDK(KP923879532), T65, VMUL(LDK(KP382683432), T60));
1595 {
1596 V T6s, T6t, T6b, T6g;
1597 T6s = VADD(T5Y, T5Z);
1598 T6t = VADD(T61, T64);
1599 T6u = VFMA(LDK(KP382683432), T6s, VMUL(LDK(KP923879532), T6t));
1600 T6A = VFNMS(LDK(KP382683432), T6t, VMUL(LDK(KP923879532), T6s));
1601 T6b = VSUB(T67, T6a);
1602 T6g = VSUB(T6e, T6f);
1603 T6h = VFNMS(LDK(KP923879532), T6g, VMUL(LDK(KP382683432), T6b));
1604 T6l = VFMA(LDK(KP382683432), T6g, VMUL(LDK(KP923879532), T6b));
1605 }
1606 }
1607 {
1608 V T5V, T6i, T7r, T7u;
1609 T5V = VADD(T5J, T5U);
1610 T6i = VADD(T66, T6h);
1611 ST(&(ri[WS(rs, 22)]), VSUB(T5V, T6i), ms, &(ri[0]));
1612 ST(&(ri[WS(rs, 6)]), VADD(T5V, T6i), ms, &(ri[0]));
1613 T7r = VADD(T6k, T6l);
1614 T7u = VADD(T7s, T7t);
1615 ST(&(ii[WS(rs, 6)]), VADD(T7r, T7u), ms, &(ii[0]));
1616 ST(&(ii[WS(rs, 22)]), VSUB(T7u, T7r), ms, &(ii[0]));
1617 }
1618 {
1619 V T6j, T6m, T7v, T7w;
1620 T6j = VSUB(T5J, T5U);
1621 T6m = VSUB(T6k, T6l);
1622 ST(&(ri[WS(rs, 30)]), VSUB(T6j, T6m), ms, &(ri[0]));
1623 ST(&(ri[WS(rs, 14)]), VADD(T6j, T6m), ms, &(ri[0]));
1624 T7v = VSUB(T6h, T66);
1625 T7w = VSUB(T7t, T7s);
1626 ST(&(ii[WS(rs, 14)]), VADD(T7v, T7w), ms, &(ii[0]));
1627 ST(&(ii[WS(rs, 30)]), VSUB(T7w, T7v), ms, &(ii[0]));
1628 }
1629 {
1630 V T6r, T6y, T7j, T7o;
1631 T6r = VADD(T6n, T6q);
1632 T6y = VADD(T6u, T6x);
1633 ST(&(ri[WS(rs, 18)]), VSUB(T6r, T6y), ms, &(ri[0]));
1634 ST(&(ri[WS(rs, 2)]), VADD(T6r, T6y), ms, &(ri[0]));
1635 T7j = VADD(T6A, T6B);
1636 T7o = VADD(T7k, T7n);
1637 ST(&(ii[WS(rs, 2)]), VADD(T7j, T7o), ms, &(ii[0]));
1638 ST(&(ii[WS(rs, 18)]), VSUB(T7o, T7j), ms, &(ii[0]));
1639 }
1640 {
1641 V T6z, T6C, T7p, T7q;
1642 T6z = VSUB(T6n, T6q);
1643 T6C = VSUB(T6A, T6B);
1644 ST(&(ri[WS(rs, 26)]), VSUB(T6z, T6C), ms, &(ri[0]));
1645 ST(&(ri[WS(rs, 10)]), VADD(T6z, T6C), ms, &(ri[0]));
1646 T7p = VSUB(T6x, T6u);
1647 T7q = VSUB(T7n, T7k);
1648 ST(&(ii[WS(rs, 10)]), VADD(T7p, T7q), ms, &(ii[0]));
1649 ST(&(ii[WS(rs, 26)]), VSUB(T7q, T7p), ms, &(ii[0]));
1650 }
1651 }
/* Store block: outputs 23, 7, 31, 15, 19, 3, 27, 11 (uses all seven constants). */
1652 {
1653 V T3h, T4D, T7R, T7X, T3E, T7O, T4N, T4R, T46, T4A, T4G, T7W, T4K, T4Q, T4x;
1654 V T4B, T3g, T7P;
1655 T3g = VMUL(LDK(KP707106781), VSUB(T3a, T3f));
1656 T3h = VSUB(T35, T3g);
1657 T4D = VADD(T35, T3g);
1658 T7P = VMUL(LDK(KP707106781), VSUB(T4V, T4U));
1659 T7R = VADD(T7P, T7Q);
1660 T7X = VSUB(T7Q, T7P);
1661 {
1662 V T3s, T3D, T4L, T4M;
1663 T3s = VFNMS(LDK(KP923879532), T3r, VMUL(LDK(KP382683432), T3m));
1664 T3D = VFMA(LDK(KP382683432), T3x, VMUL(LDK(KP923879532), T3C));
1665 T3E = VSUB(T3s, T3D);
1666 T7O = VADD(T3s, T3D);
1667 T4L = VADD(T4b, T4m);
1668 T4M = VADD(T4s, T4v);
1669 T4N = VFNMS(LDK(KP555570233), T4M, VMUL(LDK(KP831469612), T4L));
1670 T4R = VFMA(LDK(KP831469612), T4M, VMUL(LDK(KP555570233), T4L));
1671 }
1672 {
1673 V T3W, T45, T4E, T4F;
1674 T3W = VSUB(T3K, T3V);
1675 T45 = VSUB(T41, T44);
1676 T46 = VFMA(LDK(KP980785280), T3W, VMUL(LDK(KP195090322), T45));
1677 T4A = VFNMS(LDK(KP980785280), T45, VMUL(LDK(KP195090322), T3W));
1678 T4E = VFMA(LDK(KP923879532), T3m, VMUL(LDK(KP382683432), T3r));
1679 T4F = VFNMS(LDK(KP923879532), T3x, VMUL(LDK(KP382683432), T3C));
1680 T4G = VADD(T4E, T4F);
1681 T7W = VSUB(T4F, T4E);
1682 }
1683 {
1684 V T4I, T4J, T4n, T4w;
1685 T4I = VADD(T3K, T3V);
1686 T4J = VADD(T41, T44);
1687 T4K = VFMA(LDK(KP555570233), T4I, VMUL(LDK(KP831469612), T4J));
1688 T4Q = VFNMS(LDK(KP555570233), T4J, VMUL(LDK(KP831469612), T4I));
1689 T4n = VSUB(T4b, T4m);
1690 T4w = VSUB(T4s, T4v);
1691 T4x = VFNMS(LDK(KP980785280), T4w, VMUL(LDK(KP195090322), T4n));
1692 T4B = VFMA(LDK(KP195090322), T4w, VMUL(LDK(KP980785280), T4n));
1693 }
1694 {
1695 V T3F, T4y, T7V, T7Y;
1696 T3F = VADD(T3h, T3E);
1697 T4y = VADD(T46, T4x);
1698 ST(&(ri[WS(rs, 23)]), VSUB(T3F, T4y), ms, &(ri[WS(rs, 1)]));
1699 ST(&(ri[WS(rs, 7)]), VADD(T3F, T4y), ms, &(ri[WS(rs, 1)]));
1700 T7V = VADD(T4A, T4B);
1701 T7Y = VADD(T7W, T7X);
1702 ST(&(ii[WS(rs, 7)]), VADD(T7V, T7Y), ms, &(ii[WS(rs, 1)]));
1703 ST(&(ii[WS(rs, 23)]), VSUB(T7Y, T7V), ms, &(ii[WS(rs, 1)]));
1704 }
1705 {
1706 V T4z, T4C, T7Z, T80;
1707 T4z = VSUB(T3h, T3E);
1708 T4C = VSUB(T4A, T4B);
1709 ST(&(ri[WS(rs, 31)]), VSUB(T4z, T4C), ms, &(ri[WS(rs, 1)]));
1710 ST(&(ri[WS(rs, 15)]), VADD(T4z, T4C), ms, &(ri[WS(rs, 1)]));
1711 T7Z = VSUB(T4x, T46);
1712 T80 = VSUB(T7X, T7W);
1713 ST(&(ii[WS(rs, 15)]), VADD(T7Z, T80), ms, &(ii[WS(rs, 1)]));
1714 ST(&(ii[WS(rs, 31)]), VSUB(T80, T7Z), ms, &(ii[WS(rs, 1)]));
1715 }
1716 {
1717 V T4H, T4O, T7N, T7S;
1718 T4H = VADD(T4D, T4G);
1719 T4O = VADD(T4K, T4N);
1720 ST(&(ri[WS(rs, 19)]), VSUB(T4H, T4O), ms, &(ri[WS(rs, 1)]));
1721 ST(&(ri[WS(rs, 3)]), VADD(T4H, T4O), ms, &(ri[WS(rs, 1)]));
1722 T7N = VADD(T4Q, T4R);
1723 T7S = VADD(T7O, T7R);
1724 ST(&(ii[WS(rs, 3)]), VADD(T7N, T7S), ms, &(ii[WS(rs, 1)]));
1725 ST(&(ii[WS(rs, 19)]), VSUB(T7S, T7N), ms, &(ii[WS(rs, 1)]));
1726 }
1727 {
1728 V T4P, T4S, T7T, T7U;
1729 T4P = VSUB(T4D, T4G);
1730 T4S = VSUB(T4Q, T4R);
1731 ST(&(ri[WS(rs, 27)]), VSUB(T4P, T4S), ms, &(ri[WS(rs, 1)]));
1732 ST(&(ri[WS(rs, 11)]), VADD(T4P, T4S), ms, &(ri[WS(rs, 1)]));
1733 T7T = VSUB(T4N, T4K);
1734 T7U = VSUB(T7R, T7O);
1735 ST(&(ii[WS(rs, 11)]), VADD(T7T, T7U), ms, &(ii[WS(rs, 1)]));
1736 ST(&(ii[WS(rs, 27)]), VSUB(T7U, T7T), ms, &(ii[WS(rs, 1)]));
1737 }
1738 }
/* Store block: outputs 21, 5, 29, 13, 17, 1, 25, 9 (uses all seven constants). */
1739 {
1740 V T4X, T5p, T7D, T7J, T54, T7y, T5z, T5D, T5c, T5m, T5s, T7I, T5w, T5C, T5j;
1741 V T5n, T4W, T7z;
1742 T4W = VMUL(LDK(KP707106781), VADD(T4U, T4V));
1743 T4X = VSUB(T4T, T4W);
1744 T5p = VADD(T4T, T4W);
1745 T7z = VMUL(LDK(KP707106781), VADD(T3a, T3f));
1746 T7D = VADD(T7z, T7C);
1747 T7J = VSUB(T7C, T7z);
1748 {
1749 V T50, T53, T5x, T5y;
1750 T50 = VFNMS(LDK(KP382683432), T4Z, VMUL(LDK(KP923879532), T4Y));
1751 T53 = VFMA(LDK(KP923879532), T51, VMUL(LDK(KP382683432), T52));
1752 T54 = VSUB(T50, T53);
1753 T7y = VADD(T50, T53);
1754 T5x = VADD(T5d, T5e);
1755 T5y = VADD(T5g, T5h);
1756 T5z = VFNMS(LDK(KP195090322), T5y, VMUL(LDK(KP980785280), T5x));
1757 T5D = VFMA(LDK(KP195090322), T5x, VMUL(LDK(KP980785280), T5y));
1758 }
1759 {
1760 V T58, T5b, T5q, T5r;
1761 T58 = VSUB(T56, T57);
1762 T5b = VSUB(T59, T5a);
1763 T5c = VFMA(LDK(KP555570233), T58, VMUL(LDK(KP831469612), T5b));
1764 T5m = VFNMS(LDK(KP831469612), T58, VMUL(LDK(KP555570233), T5b));
1765 T5q = VFMA(LDK(KP382683432), T4Y, VMUL(LDK(KP923879532), T4Z));
1766 T5r = VFNMS(LDK(KP382683432), T51, VMUL(LDK(KP923879532), T52));
1767 T5s = VADD(T5q, T5r);
1768 T7I = VSUB(T5r, T5q);
1769 }
1770 {
1771 V T5u, T5v, T5f, T5i;
1772 T5u = VADD(T56, T57);
1773 T5v = VADD(T59, T5a);
1774 T5w = VFMA(LDK(KP980785280), T5u, VMUL(LDK(KP195090322), T5v));
1775 T5C = VFNMS(LDK(KP195090322), T5u, VMUL(LDK(KP980785280), T5v));
1776 T5f = VSUB(T5d, T5e);
1777 T5i = VSUB(T5g, T5h);
1778 T5j = VFNMS(LDK(KP831469612), T5i, VMUL(LDK(KP555570233), T5f));
1779 T5n = VFMA(LDK(KP831469612), T5f, VMUL(LDK(KP555570233), T5i));
1780 }
1781 {
1782 V T55, T5k, T7H, T7K;
1783 T55 = VADD(T4X, T54);
1784 T5k = VADD(T5c, T5j);
1785 ST(&(ri[WS(rs, 21)]), VSUB(T55, T5k), ms, &(ri[WS(rs, 1)]));
1786 ST(&(ri[WS(rs, 5)]), VADD(T55, T5k), ms, &(ri[WS(rs, 1)]));
1787 T7H = VADD(T5m, T5n);
1788 T7K = VADD(T7I, T7J);
1789 ST(&(ii[WS(rs, 5)]), VADD(T7H, T7K), ms, &(ii[WS(rs, 1)]));
1790 ST(&(ii[WS(rs, 21)]), VSUB(T7K, T7H), ms, &(ii[WS(rs, 1)]));
1791 }
1792 {
1793 V T5l, T5o, T7L, T7M;
1794 T5l = VSUB(T4X, T54);
1795 T5o = VSUB(T5m, T5n);
1796 ST(&(ri[WS(rs, 29)]), VSUB(T5l, T5o), ms, &(ri[WS(rs, 1)]));
1797 ST(&(ri[WS(rs, 13)]), VADD(T5l, T5o), ms, &(ri[WS(rs, 1)]));
1798 T7L = VSUB(T5j, T5c);
1799 T7M = VSUB(T7J, T7I);
1800 ST(&(ii[WS(rs, 13)]), VADD(T7L, T7M), ms, &(ii[WS(rs, 1)]));
1801 ST(&(ii[WS(rs, 29)]), VSUB(T7M, T7L), ms, &(ii[WS(rs, 1)]));
1802 }
1803 {
1804 V T5t, T5A, T7x, T7E;
1805 T5t = VADD(T5p, T5s);
1806 T5A = VADD(T5w, T5z);
1807 ST(&(ri[WS(rs, 17)]), VSUB(T5t, T5A), ms, &(ri[WS(rs, 1)]));
1808 ST(&(ri[WS(rs, 1)]), VADD(T5t, T5A), ms, &(ri[WS(rs, 1)]));
1809 T7x = VADD(T5C, T5D);
1810 T7E = VADD(T7y, T7D);
1811 ST(&(ii[WS(rs, 1)]), VADD(T7x, T7E), ms, &(ii[WS(rs, 1)]));
1812 ST(&(ii[WS(rs, 17)]), VSUB(T7E, T7x), ms, &(ii[WS(rs, 1)]));
1813 }
1814 {
1815 V T5B, T5E, T7F, T7G;
1816 T5B = VSUB(T5p, T5s);
1817 T5E = VSUB(T5C, T5D);
1818 ST(&(ri[WS(rs, 25)]), VSUB(T5B, T5E), ms, &(ri[WS(rs, 1)]));
1819 ST(&(ri[WS(rs, 9)]), VADD(T5B, T5E), ms, &(ri[WS(rs, 1)]));
1820 T7F = VSUB(T5z, T5w);
1821 T7G = VSUB(T7D, T7y);
1822 ST(&(ii[WS(rs, 9)]), VADD(T7F, T7G), ms, &(ii[WS(rs, 1)]));
1823 ST(&(ii[WS(rs, 25)]), VSUB(T7G, T7F), ms, &(ii[WS(rs, 1)]));
1824 }
1825 }
1826 }
1827 }
/* NOTE(review): VLEAVE() is a macro defined outside this file; presumably SIMD epilogue/cleanup -- confirm. */
1828 VLEAVE();
1829 }
1830
/*
 * Twiddle instruction table referenced by the descriptor for this variant.
 * It lists VTW(0, k) for k = 1..31 -- one entry per twiddled input, which
 * lines up with the 31 twiddle pairs (62 W values) the kernel reads per
 * unit of m -- and ends with a {TW_NEXT, (2 * VL), 0} record.
 * NOTE(review): VTW and TW_NEXT are defined outside this file; presumably
 * VTW(0, k) requests the k-th twiddle factor and TW_NEXT advances to the
 * next group of 2*VL iterations -- confirm against their definitions.
 */
1831 static const tw_instr twinstr[] = {
1832 VTW(0, 1),
1833 VTW(0, 2),
1834 VTW(0, 3),
1835 VTW(0, 4),
1836 VTW(0, 5),
1837 VTW(0, 6),
1838 VTW(0, 7),
1839 VTW(0, 8),
1840 VTW(0, 9),
1841 VTW(0, 10),
1842 VTW(0, 11),
1843 VTW(0, 12),
1844 VTW(0, 13),
1845 VTW(0, 14),
1846 VTW(0, 15),
1847 VTW(0, 16),
1848 VTW(0, 17),
1849 VTW(0, 18),
1850 VTW(0, 19),
1851 VTW(0, 20),
1852 VTW(0, 21),
1853 VTW(0, 22),
1854 VTW(0, 23),
1855 VTW(0, 24),
1856 VTW(0, 25),
1857 VTW(0, 26),
1858 VTW(0, 27),
1859 VTW(0, 28),
1860 VTW(0, 29),
1861 VTW(0, 30),
1862 VTW(0, 31),
1863 {TW_NEXT, (2 * VL), 0}
1864 };
1865
/*
 * Codelet descriptor handed to X(kdft_dit_register) below.  In positional
 * order it holds: 32 (matches the generator's "-n 32"), the codelet name
 * string, the twiddle instruction table, the genus, and the operation
 * counts {340, 114, 94, 0}, which match the "340 additions, 114
 * multiplications, 94 fused multiply/add" figures quoted in the generator
 * comment for this non-FMA variant.
 * NOTE(review): the exact field names and the meaning of the fourth count
 * and the three trailing zeros come from ct_desc, which is declared outside
 * this file -- confirm there.
 */
1866 static const ct_desc desc = { 32, XSIMD_STRING("t1sv_32"), twinstr, &GENUS, {340, 114, 94, 0}, 0, 0, 0 };
1867
/*
 * Registration entry point for the non-FMA variant: passes the t1sv_32
 * kernel and its descriptor to X(kdft_dit_register) for planner p.
 */
1868 void XSIMD(codelet_t1sv_32) (planner *p) {
1869 X(kdft_dit_register) (p, t1sv_32, &desc);
1870 }
1871 #endif