comparison src/fftw-3.3.8/dft/simd/common/t2fv_32.c @ 167:bd3cc4d1df30

Add FFTW 3.3.8 source, and a Linux build
author Chris Cannam <cannam@all-day-breakfast.com>
date Tue, 19 Nov 2019 14:52:55 +0000
parents
children
comparison
equal deleted inserted replaced
166:cbd6d7e562c7 167:bd3cc4d1df30
1 /*
2 * Copyright (c) 2003, 2007-14 Matteo Frigo
3 * Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
4 *
5 * This program is free software; you can redistribute it and/or modify
6 * it under the terms of the GNU General Public License as published by
7 * the Free Software Foundation; either version 2 of the License, or
8 * (at your option) any later version.
9 *
10 * This program is distributed in the hope that it will be useful,
11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 * GNU General Public License for more details.
14 *
15 * You should have received a copy of the GNU General Public License
16 * along with this program; if not, write to the Free Software
17 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
18 *
19 */
20
21 /* This file was automatically generated --- DO NOT EDIT */
22 /* Generated on Thu May 24 08:05:43 EDT 2018 */
23
24 #include "dft/codelet-dft.h"
25
26 #if defined(ARCH_PREFERS_FMA) || defined(ISA_EXTENSION_PREFERS_FMA)
27
28 /* Generated by: ../../../genfft/gen_twiddle_c.native -fma -simd -compact -variables 4 -pipeline-latency 8 -n 32 -name t2fv_32 -include dft/simd/t2f.h */
29
30 /*
31 * This function contains 217 FP additions, 160 FP multiplications,
32 * (or, 119 additions, 62 multiplications, 98 fused multiply/add),
33 * 59 stack variables, 7 constants, and 64 memory accesses
34 */
35 #include "dft/simd/t2f.h"
36
/*
 * t2fv_32 (FMA variant): generated 32-point SIMD twiddle codelet (genfft was
 * run with "-fma -simd -n 32 -name t2fv_32", see the generator line above).
 *
 * The transform is done in place: every value is loaded from, and every
 * result stored back to, the array reached through ri (aliased as x).
 *
 *   ri      base pointer of the data; read with LD and written with ST.
 *   ii      not referenced anywhere in this body.
 *   W       twiddle table; offset by mb * ((TWVL / VL) * 62) before the
 *           first iteration and advanced by TWVL * 62 per iteration.
 *   rs      stride handed to WS(rs, k) to locate point k (k = 0..31).
 *   mb, me  loop bounds; m runs from mb while m < me, in steps of VL.
 *   ms      passed to every LD/ST; x advances by VL * ms per iteration.
 *
 * Point 0 is used as loaded.  Every other point k is first passed through
 * BYTWJ together with W[TWVL * 2 * (k - 1)].
 *
 * NOTE(review): R, V, INT, stride and all LD/ST/BYTWJ/V.../DVK/LDK macros
 * come from the included headers and are not visible here; BYTWJ presumably
 * applies the twiddle factor to the loaded vector -- confirm in the SIMD
 * headers.
 */
37 static void t2fv_32(R *ri, R *ii, const R *W, stride rs, INT mb, INT me, INT ms)
38 {
/*
 * Constants declared with DVK and read back with LDK.  Numerically they are
 * cos(3pi/16), tan(3pi/16), cos(pi/16), tan(pi/16), cos(pi/8), sqrt(1/2)
 * and tan(pi/8), in declaration order.
 */
39 DVK(KP831469612, +0.831469612302545237078788377617905756738560812);
40 DVK(KP668178637, +0.668178637919298919997757686523080761552472251);
41 DVK(KP980785280, +0.980785280403230449126182236134239036973933731);
42 DVK(KP198912367, +0.198912367379658006911597622644676228597850501);
43 DVK(KP923879532, +0.923879532511286756128183189396788286822416626);
44 DVK(KP707106781, +0.707106781186547524400844362104849039284835938);
45 DVK(KP414213562, +0.414213562373095048801688724209698078569671875);
46 {
47 INT m;
48 R *x;
49 x = ri;
/* One pass of the loop body handles one group of VL transforms. */
50 for (m = mb, W = W + (mb * ((TWVL / VL) * 62)); m < me; m = m + VL, x = x + (VL * ms), W = W + (TWVL * 62), MAKE_VOLATILE_STRIDE(32, rs)) {
/* Intermediate values that survive from the load stages to the store stages. */
51 V T4, T1z, T2o, T32, Tf, T1A, T2r, T3f, TC, T1D, T2L, T34, Tr, T1C, T2O;
52 V T33, T1k, T20, T2F, T3b, T1r, T21, T2C, T3a, TV, T1X, T2y, T38, T12, T1Y;
53 V T2v, T37;
/* Load stage: points 0, 24, 16, 8. */
54 {
55 V T1, T1y, T3, T1w, T1x, T2, T1v, T2m, T2n;
56 T1 = LD(&(x[0]), ms, &(x[0]));
57 T1x = LD(&(x[WS(rs, 24)]), ms, &(x[0]));
58 T1y = BYTWJ(&(W[TWVL * 46]), T1x);
59 T2 = LD(&(x[WS(rs, 16)]), ms, &(x[0]));
60 T3 = BYTWJ(&(W[TWVL * 30]), T2);
61 T1v = LD(&(x[WS(rs, 8)]), ms, &(x[0]));
62 T1w = BYTWJ(&(W[TWVL * 14]), T1v);
63 T4 = VSUB(T1, T3);
64 T1z = VSUB(T1w, T1y);
65 T2m = VADD(T1, T3);
66 T2n = VADD(T1w, T1y);
67 T2o = VADD(T2m, T2n);
68 T32 = VSUB(T2m, T2n);
69 }
/* Load stage: points 4, 12, 20, 28. */
70 {
71 V T6, Td, T8, Tb;
72 {
73 V T5, Tc, T7, Ta;
74 T5 = LD(&(x[WS(rs, 4)]), ms, &(x[0]));
75 T6 = BYTWJ(&(W[TWVL * 6]), T5);
76 Tc = LD(&(x[WS(rs, 12)]), ms, &(x[0]));
77 Td = BYTWJ(&(W[TWVL * 22]), Tc);
78 T7 = LD(&(x[WS(rs, 20)]), ms, &(x[0]));
79 T8 = BYTWJ(&(W[TWVL * 38]), T7);
80 Ta = LD(&(x[WS(rs, 28)]), ms, &(x[0]));
81 Tb = BYTWJ(&(W[TWVL * 54]), Ta);
82 }
83 {
84 V T9, Te, T2p, T2q;
85 T9 = VSUB(T6, T8);
86 Te = VSUB(Tb, Td);
87 Tf = VADD(T9, Te);
88 T1A = VSUB(Te, T9);
89 T2p = VADD(T6, T8);
90 T2q = VADD(Tb, Td);
91 T2r = VADD(T2p, T2q);
92 T3f = VSUB(T2q, T2p);
93 }
94 }
/* Load stage: points 30, 6, 14, 22. */
95 {
96 V Tt, TA, Tv, Ty;
97 {
98 V Ts, Tz, Tu, Tx;
99 Ts = LD(&(x[WS(rs, 30)]), ms, &(x[0]));
100 Tt = BYTWJ(&(W[TWVL * 58]), Ts);
101 Tz = LD(&(x[WS(rs, 6)]), ms, &(x[0]));
102 TA = BYTWJ(&(W[TWVL * 10]), Tz);
103 Tu = LD(&(x[WS(rs, 14)]), ms, &(x[0]));
104 Tv = BYTWJ(&(W[TWVL * 26]), Tu);
105 Tx = LD(&(x[WS(rs, 22)]), ms, &(x[0]));
106 Ty = BYTWJ(&(W[TWVL * 42]), Tx);
107 }
108 {
109 V Tw, TB, T2J, T2K;
110 Tw = VSUB(Tt, Tv);
111 TB = VSUB(Ty, TA);
112 TC = VFNMS(LDK(KP414213562), TB, Tw);
113 T1D = VFMA(LDK(KP414213562), Tw, TB);
114 T2J = VADD(Tt, Tv);
115 T2K = VADD(TA, Ty);
116 T2L = VADD(T2J, T2K);
117 T34 = VSUB(T2J, T2K);
118 }
119 }
/* Load stage: points 2, 26, 18, 10. */
120 {
121 V Ti, Tp, Tk, Tn;
122 {
123 V Th, To, Tj, Tm;
124 Th = LD(&(x[WS(rs, 2)]), ms, &(x[0]));
125 Ti = BYTWJ(&(W[TWVL * 2]), Th);
126 To = LD(&(x[WS(rs, 26)]), ms, &(x[0]));
127 Tp = BYTWJ(&(W[TWVL * 50]), To);
128 Tj = LD(&(x[WS(rs, 18)]), ms, &(x[0]));
129 Tk = BYTWJ(&(W[TWVL * 34]), Tj);
130 Tm = LD(&(x[WS(rs, 10)]), ms, &(x[0]));
131 Tn = BYTWJ(&(W[TWVL * 18]), Tm);
132 }
133 {
134 V Tl, Tq, T2M, T2N;
135 Tl = VSUB(Ti, Tk);
136 Tq = VSUB(Tn, Tp);
137 Tr = VFNMS(LDK(KP414213562), Tq, Tl);
138 T1C = VFMA(LDK(KP414213562), Tl, Tq);
139 T2M = VADD(Ti, Tk);
140 T2N = VADD(Tn, Tp);
141 T2O = VADD(T2M, T2N);
142 T33 = VSUB(T2M, T2N);
143 }
144 }
/* Load stage: odd points 31, 15, 7, 23, 27, 11, 3, 19. */
145 {
146 V T15, T17, T1o, T1m, T1f, T1h, T1i, T1a, T1c, T1d;
147 {
148 V T14, T16, T1n, T1l;
149 T14 = LD(&(x[WS(rs, 31)]), ms, &(x[WS(rs, 1)]));
150 T15 = BYTWJ(&(W[TWVL * 60]), T14);
151 T16 = LD(&(x[WS(rs, 15)]), ms, &(x[WS(rs, 1)]));
152 T17 = BYTWJ(&(W[TWVL * 28]), T16);
153 T1n = LD(&(x[WS(rs, 7)]), ms, &(x[WS(rs, 1)]));
154 T1o = BYTWJ(&(W[TWVL * 12]), T1n);
155 T1l = LD(&(x[WS(rs, 23)]), ms, &(x[WS(rs, 1)]));
156 T1m = BYTWJ(&(W[TWVL * 44]), T1l);
157 {
158 V T1e, T1g, T19, T1b;
159 T1e = LD(&(x[WS(rs, 27)]), ms, &(x[WS(rs, 1)]));
160 T1f = BYTWJ(&(W[TWVL * 52]), T1e);
161 T1g = LD(&(x[WS(rs, 11)]), ms, &(x[WS(rs, 1)]));
162 T1h = BYTWJ(&(W[TWVL * 20]), T1g);
163 T1i = VSUB(T1f, T1h);
164 T19 = LD(&(x[WS(rs, 3)]), ms, &(x[WS(rs, 1)]));
165 T1a = BYTWJ(&(W[TWVL * 4]), T19);
166 T1b = LD(&(x[WS(rs, 19)]), ms, &(x[WS(rs, 1)]));
167 T1c = BYTWJ(&(W[TWVL * 36]), T1b);
168 T1d = VSUB(T1a, T1c);
169 }
170 }
171 {
172 V T18, T1j, T2D, T2E;
173 T18 = VSUB(T15, T17);
174 T1j = VADD(T1d, T1i);
175 T1k = VFMA(LDK(KP707106781), T1j, T18);
176 T20 = VFNMS(LDK(KP707106781), T1j, T18);
177 T2D = VADD(T1a, T1c);
178 T2E = VADD(T1f, T1h);
179 T2F = VADD(T2D, T2E);
180 T3b = VSUB(T2E, T2D);
181 }
182 {
183 V T1p, T1q, T2A, T2B;
184 T1p = VSUB(T1m, T1o);
185 T1q = VSUB(T1i, T1d);
186 T1r = VFMA(LDK(KP707106781), T1q, T1p);
187 T21 = VFNMS(LDK(KP707106781), T1q, T1p);
188 T2A = VADD(T15, T17);
189 T2B = VADD(T1o, T1m);
190 T2C = VADD(T2A, T2B);
191 T3a = VSUB(T2A, T2B);
192 }
193 }
/* Load stage: odd points 1, 17, 25, 9, 29, 13, 5, 21. */
194 {
195 V TG, TI, TZ, TX, TQ, TS, TT, TL, TN, TO;
196 {
197 V TF, TH, TY, TW;
198 TF = LD(&(x[WS(rs, 1)]), ms, &(x[WS(rs, 1)]));
199 TG = BYTWJ(&(W[0]), TF);
200 TH = LD(&(x[WS(rs, 17)]), ms, &(x[WS(rs, 1)]));
201 TI = BYTWJ(&(W[TWVL * 32]), TH);
202 TY = LD(&(x[WS(rs, 25)]), ms, &(x[WS(rs, 1)]));
203 TZ = BYTWJ(&(W[TWVL * 48]), TY);
204 TW = LD(&(x[WS(rs, 9)]), ms, &(x[WS(rs, 1)]));
205 TX = BYTWJ(&(W[TWVL * 16]), TW);
206 {
207 V TP, TR, TK, TM;
208 TP = LD(&(x[WS(rs, 29)]), ms, &(x[WS(rs, 1)]));
209 TQ = BYTWJ(&(W[TWVL * 56]), TP);
210 TR = LD(&(x[WS(rs, 13)]), ms, &(x[WS(rs, 1)]));
211 TS = BYTWJ(&(W[TWVL * 24]), TR);
212 TT = VSUB(TQ, TS);
213 TK = LD(&(x[WS(rs, 5)]), ms, &(x[WS(rs, 1)]));
214 TL = BYTWJ(&(W[TWVL * 8]), TK);
215 TM = LD(&(x[WS(rs, 21)]), ms, &(x[WS(rs, 1)]));
216 TN = BYTWJ(&(W[TWVL * 40]), TM);
217 TO = VSUB(TL, TN);
218 }
219 }
220 {
221 V TJ, TU, T2w, T2x;
222 TJ = VSUB(TG, TI);
223 TU = VADD(TO, TT);
224 TV = VFMA(LDK(KP707106781), TU, TJ);
225 T1X = VFNMS(LDK(KP707106781), TU, TJ);
226 T2w = VADD(TL, TN);
227 T2x = VADD(TQ, TS);
228 T2y = VADD(T2w, T2x);
229 T38 = VSUB(T2w, T2x);
230 }
231 {
232 V T10, T11, T2t, T2u;
233 T10 = VSUB(TX, TZ);
234 T11 = VSUB(TO, TT);
235 T12 = VFMA(LDK(KP707106781), T11, T10);
236 T1Y = VFNMS(LDK(KP707106781), T11, T10);
237 T2t = VADD(TG, TI);
238 T2u = VADD(TX, TZ);
239 T2v = VADD(T2t, T2u);
240 T37 = VSUB(T2t, T2u);
241 }
242 }
/* Store stage: outputs 16, 8, 0, 24 (built from the all-sum terms). */
243 {
244 V T2W, T30, T2Z, T31;
245 {
246 V T2U, T2V, T2X, T2Y;
247 T2U = VADD(T2o, T2r);
248 T2V = VADD(T2O, T2L);
249 T2W = VADD(T2U, T2V);
250 T30 = VSUB(T2U, T2V);
251 T2X = VADD(T2v, T2y);
252 T2Y = VADD(T2C, T2F);
253 T2Z = VADD(T2X, T2Y);
254 T31 = VSUB(T2Y, T2X);
255 }
256 ST(&(x[WS(rs, 16)]), VSUB(T2W, T2Z), ms, &(x[0]));
257 ST(&(x[WS(rs, 8)]), VFMAI(T31, T30), ms, &(x[0]));
258 ST(&(x[0]), VADD(T2W, T2Z), ms, &(x[0]));
259 ST(&(x[WS(rs, 24)]), VFNMSI(T31, T30), ms, &(x[0]));
260 }
/* Store stage: outputs 12, 20, 28, 4. */
261 {
262 V T2s, T2P, T2H, T2Q, T2z, T2G;
263 T2s = VSUB(T2o, T2r);
264 T2P = VSUB(T2L, T2O);
265 T2z = VSUB(T2v, T2y);
266 T2G = VSUB(T2C, T2F);
267 T2H = VADD(T2z, T2G);
268 T2Q = VSUB(T2G, T2z);
269 {
270 V T2I, T2R, T2S, T2T;
271 T2I = VFNMS(LDK(KP707106781), T2H, T2s);
272 T2R = VFNMS(LDK(KP707106781), T2Q, T2P);
273 ST(&(x[WS(rs, 12)]), VFNMSI(T2R, T2I), ms, &(x[0]));
274 ST(&(x[WS(rs, 20)]), VFMAI(T2R, T2I), ms, &(x[0]));
275 T2S = VFMA(LDK(KP707106781), T2H, T2s);
276 T2T = VFMA(LDK(KP707106781), T2Q, T2P);
277 ST(&(x[WS(rs, 28)]), VFNMSI(T2T, T2S), ms, &(x[0]));
278 ST(&(x[WS(rs, 4)]), VFMAI(T2T, T2S), ms, &(x[0]));
279 }
280 }
/* Store stage: outputs 14, 18, 6, 26, 30, 2, 10, 22. */
281 {
282 V T36, T3o, T3h, T3r, T3d, T3s, T3k, T3p, T35, T3g;
283 T35 = VADD(T33, T34);
284 T36 = VFMA(LDK(KP707106781), T35, T32);
285 T3o = VFNMS(LDK(KP707106781), T35, T32);
286 T3g = VSUB(T34, T33);
287 T3h = VFMA(LDK(KP707106781), T3g, T3f);
288 T3r = VFNMS(LDK(KP707106781), T3g, T3f);
289 {
290 V T39, T3c, T3i, T3j;
291 T39 = VFNMS(LDK(KP414213562), T38, T37);
292 T3c = VFNMS(LDK(KP414213562), T3b, T3a);
293 T3d = VADD(T39, T3c);
294 T3s = VSUB(T3c, T39);
295 T3i = VFMA(LDK(KP414213562), T3a, T3b);
296 T3j = VFMA(LDK(KP414213562), T37, T38);
297 T3k = VSUB(T3i, T3j);
298 T3p = VADD(T3j, T3i);
299 }
300 {
301 V T3e, T3l, T3u, T3v;
302 T3e = VFNMS(LDK(KP923879532), T3d, T36);
303 T3l = VFNMS(LDK(KP923879532), T3k, T3h);
304 ST(&(x[WS(rs, 14)]), VFNMSI(T3l, T3e), ms, &(x[0]));
305 ST(&(x[WS(rs, 18)]), VFMAI(T3l, T3e), ms, &(x[0]));
306 T3u = VFMA(LDK(KP923879532), T3p, T3o);
307 T3v = VFNMS(LDK(KP923879532), T3s, T3r);
308 ST(&(x[WS(rs, 6)]), VFNMSI(T3v, T3u), ms, &(x[0]));
309 ST(&(x[WS(rs, 26)]), VFMAI(T3v, T3u), ms, &(x[0]));
310 }
311 {
312 V T3m, T3n, T3q, T3t;
313 T3m = VFMA(LDK(KP923879532), T3d, T36);
314 T3n = VFMA(LDK(KP923879532), T3k, T3h);
315 ST(&(x[WS(rs, 30)]), VFNMSI(T3n, T3m), ms, &(x[0]));
316 ST(&(x[WS(rs, 2)]), VFMAI(T3n, T3m), ms, &(x[0]));
317 T3q = VFNMS(LDK(KP923879532), T3p, T3o);
318 T3t = VFMA(LDK(KP923879532), T3s, T3r);
319 ST(&(x[WS(rs, 10)]), VFMAI(T3t, T3q), ms, &(x[0]));
320 ST(&(x[WS(rs, 22)]), VFNMSI(T3t, T3q), ms, &(x[0]));
321 }
322 }
/* Store stage: odd outputs 17, 15, 7, 25, 1, 31, 9, 23. */
323 {
324 V TE, T1M, T1I, T1N, T1t, T1Q, T1F, T1P;
325 {
326 V Tg, TD, T1G, T1H;
327 Tg = VFMA(LDK(KP707106781), Tf, T4);
328 TD = VADD(Tr, TC);
329 TE = VFMA(LDK(KP923879532), TD, Tg);
330 T1M = VFNMS(LDK(KP923879532), TD, Tg);
331 T1G = VFMA(LDK(KP198912367), TV, T12);
332 T1H = VFMA(LDK(KP198912367), T1k, T1r);
333 T1I = VSUB(T1G, T1H);
334 T1N = VADD(T1G, T1H);
335 }
336 {
337 V T13, T1s, T1B, T1E;
338 T13 = VFNMS(LDK(KP198912367), T12, TV);
339 T1s = VFNMS(LDK(KP198912367), T1r, T1k);
340 T1t = VADD(T13, T1s);
341 T1Q = VSUB(T1s, T13);
342 T1B = VFNMS(LDK(KP707106781), T1A, T1z);
343 T1E = VSUB(T1C, T1D);
344 T1F = VFMA(LDK(KP923879532), T1E, T1B);
345 T1P = VFNMS(LDK(KP923879532), T1E, T1B);
346 }
347 {
348 V T1u, T1J, T1S, T1T;
349 T1u = VFNMS(LDK(KP980785280), T1t, TE);
350 T1J = VFNMS(LDK(KP980785280), T1I, T1F);
351 ST(&(x[WS(rs, 17)]), VFNMSI(T1J, T1u), ms, &(x[WS(rs, 1)]));
352 ST(&(x[WS(rs, 15)]), VFMAI(T1J, T1u), ms, &(x[WS(rs, 1)]));
353 T1S = VFMA(LDK(KP980785280), T1N, T1M);
354 T1T = VFMA(LDK(KP980785280), T1Q, T1P);
355 ST(&(x[WS(rs, 7)]), VFMAI(T1T, T1S), ms, &(x[WS(rs, 1)]));
356 ST(&(x[WS(rs, 25)]), VFNMSI(T1T, T1S), ms, &(x[WS(rs, 1)]));
357 }
358 {
359 V T1K, T1L, T1O, T1R;
360 T1K = VFMA(LDK(KP980785280), T1t, TE);
361 T1L = VFMA(LDK(KP980785280), T1I, T1F);
362 ST(&(x[WS(rs, 1)]), VFNMSI(T1L, T1K), ms, &(x[WS(rs, 1)]));
363 ST(&(x[WS(rs, 31)]), VFMAI(T1L, T1K), ms, &(x[WS(rs, 1)]));
364 T1O = VFNMS(LDK(KP980785280), T1N, T1M);
365 T1R = VFNMS(LDK(KP980785280), T1Q, T1P);
366 ST(&(x[WS(rs, 9)]), VFNMSI(T1R, T1O), ms, &(x[WS(rs, 1)]));
367 ST(&(x[WS(rs, 23)]), VFMAI(T1R, T1O), ms, &(x[WS(rs, 1)]));
368 }
369 }
/* Store stage: odd outputs 13, 19, 5, 27, 29, 3, 11, 21. */
370 {
371 V T1W, T2e, T2a, T2f, T23, T2i, T27, T2h;
372 {
373 V T1U, T1V, T28, T29;
374 T1U = VFNMS(LDK(KP707106781), Tf, T4);
375 T1V = VADD(T1C, T1D);
376 T1W = VFMA(LDK(KP923879532), T1V, T1U);
377 T2e = VFNMS(LDK(KP923879532), T1V, T1U);
378 T28 = VFNMS(LDK(KP668178637), T1X, T1Y);
379 T29 = VFNMS(LDK(KP668178637), T20, T21);
380 T2a = VSUB(T28, T29);
381 T2f = VADD(T28, T29);
382 }
383 {
384 V T1Z, T22, T25, T26;
385 T1Z = VFMA(LDK(KP668178637), T1Y, T1X);
386 T22 = VFMA(LDK(KP668178637), T21, T20);
387 T23 = VADD(T1Z, T22);
388 T2i = VSUB(T22, T1Z);
389 T25 = VFMA(LDK(KP707106781), T1A, T1z);
390 T26 = VSUB(TC, Tr);
391 T27 = VFMA(LDK(KP923879532), T26, T25);
392 T2h = VFNMS(LDK(KP923879532), T26, T25);
393 }
394 {
395 V T24, T2b, T2k, T2l;
396 T24 = VFNMS(LDK(KP831469612), T23, T1W);
397 T2b = VFNMS(LDK(KP831469612), T2a, T27);
398 ST(&(x[WS(rs, 13)]), VFNMSI(T2b, T24), ms, &(x[WS(rs, 1)]));
399 ST(&(x[WS(rs, 19)]), VFMAI(T2b, T24), ms, &(x[WS(rs, 1)]));
400 T2k = VFNMS(LDK(KP831469612), T2f, T2e);
401 T2l = VFNMS(LDK(KP831469612), T2i, T2h);
402 ST(&(x[WS(rs, 5)]), VFNMSI(T2l, T2k), ms, &(x[WS(rs, 1)]));
403 ST(&(x[WS(rs, 27)]), VFMAI(T2l, T2k), ms, &(x[WS(rs, 1)]));
404 }
405 {
406 V T2c, T2d, T2g, T2j;
407 T2c = VFMA(LDK(KP831469612), T23, T1W);
408 T2d = VFMA(LDK(KP831469612), T2a, T27);
409 ST(&(x[WS(rs, 29)]), VFNMSI(T2d, T2c), ms, &(x[WS(rs, 1)]));
410 ST(&(x[WS(rs, 3)]), VFMAI(T2d, T2c), ms, &(x[WS(rs, 1)]));
411 T2g = VFMA(LDK(KP831469612), T2f, T2e);
412 T2j = VFMA(LDK(KP831469612), T2i, T2h);
413 ST(&(x[WS(rs, 11)]), VFMAI(T2j, T2g), ms, &(x[WS(rs, 1)]));
414 ST(&(x[WS(rs, 21)]), VFNMSI(T2j, T2g), ms, &(x[WS(rs, 1)]));
415 }
416 }
417 }
418 }
/* VLEAVE is a macro from the SIMD headers (not visible here); it is invoked once after the loop. */
419 VLEAVE();
420 }
421
/*
 * Twiddle instruction table for the FMA build of t2fv_32: one VTW(0, k)
 * entry for each k = 1..31, followed by a {TW_NEXT, VL, 0} entry that ends
 * the list.  It is referenced only from the ct_desc initializer below.
 * NOTE(review): VTW, TW_NEXT and tw_instr are defined in headers not visible
 * here -- confirm the exact layout VTW expands to.
 */
422 static const tw_instr twinstr[] = {
423 VTW(0, 1),
424 VTW(0, 2),
425 VTW(0, 3),
426 VTW(0, 4),
427 VTW(0, 5),
428 VTW(0, 6),
429 VTW(0, 7),
430 VTW(0, 8),
431 VTW(0, 9),
432 VTW(0, 10),
433 VTW(0, 11),
434 VTW(0, 12),
435 VTW(0, 13),
436 VTW(0, 14),
437 VTW(0, 15),
438 VTW(0, 16),
439 VTW(0, 17),
440 VTW(0, 18),
441 VTW(0, 19),
442 VTW(0, 20),
443 VTW(0, 21),
444 VTW(0, 22),
445 VTW(0, 23),
446 VTW(0, 24),
447 VTW(0, 25),
448 VTW(0, 26),
449 VTW(0, 27),
450 VTW(0, 28),
451 VTW(0, 29),
452 VTW(0, 30),
453 VTW(0, 31),
454 {TW_NEXT, VL, 0}
455 };
456
/*
 * Descriptor for the FMA build: size 32, the codelet name string, the
 * twiddle table above and &GENUS.  {119, 62, 98, 0} repeats the operation
 * counts from the generated header comment (119 additions, 62
 * multiplications, 98 fused multiply/add).
 * NOTE(review): the meaning of the trailing zero fields is set by ct_desc,
 * which is declared elsewhere -- confirm before changing them.
 */
457 static const ct_desc desc = { 32, XSIMD_STRING("t2fv_32"), twinstr, &GENUS, {119, 62, 98, 0}, 0, 0, 0 };
458
/*
 * Registration entry point (FMA build): hands the t2fv_32 function and its
 * descriptor to the planner p through X(kdft_dit_register).  The exported
 * symbol name is produced by the XSIMD macro, defined elsewhere.
 */
459 void XSIMD(codelet_t2fv_32) (planner *p) {
460 X(kdft_dit_register) (p, t2fv_32, &desc);
461 }
462 #else
463
464 /* Generated by: ../../../genfft/gen_twiddle_c.native -simd -compact -variables 4 -pipeline-latency 8 -n 32 -name t2fv_32 -include dft/simd/t2f.h */
465
466 /*
467 * This function contains 217 FP additions, 104 FP multiplications,
468 * (or, 201 additions, 88 multiplications, 16 fused multiply/add),
469 * 59 stack variables, 7 constants, and 64 memory accesses
470 */
471 #include "dft/simd/t2f.h"
472
/*
 * t2fv_32 (non-FMA variant): generated 32-point SIMD twiddle codelet, built
 * when neither ARCH_PREFERS_FMA nor ISA_EXTENSION_PREFERS_FMA is defined
 * (genfft was run with "-simd -n 32 -name t2fv_32", without "-fma").
 *
 * The transform is done in place: every value is loaded from, and every
 * result stored back to, the array reached through ri (aliased as x).
 *
 *   ri      base pointer of the data; read with LD and written with ST.
 *   ii      not referenced anywhere in this body.
 *   W       twiddle table; offset by mb * ((TWVL / VL) * 62) before the
 *           first iteration and advanced by TWVL * 62 per iteration.
 *   rs      stride handed to WS(rs, k) to locate point k (k = 0..31).
 *   mb, me  loop bounds; m runs from mb while m < me, in steps of VL.
 *   ms      passed to every LD/ST; x advances by VL * ms per iteration.
 *
 * Point 0 is used as loaded.  Every other point k is first passed through
 * BYTWJ together with W[TWVL * 2 * (k - 1)].  Compared with the FMA branch
 * of this file, results are mostly combined with VMUL + VADD/VSUB and VBYI
 * rather than VFMAI/VFNMSI.
 *
 * NOTE(review): R, V, INT, stride and all LD/ST/BYTWJ/V.../DVK/LDK macros
 * come from the included headers and are not visible here; BYTWJ presumably
 * applies the twiddle factor to the loaded vector -- confirm in the SIMD
 * headers.
 */
473 static void t2fv_32(R *ri, R *ii, const R *W, stride rs, INT mb, INT me, INT ms)
474 {
/*
 * Constants declared with DVK and read back with LDK.  Numerically they are
 * sin(3pi/16), cos(3pi/16), sin(pi/16), cos(pi/16), sin(pi/8), cos(pi/8)
 * and sqrt(1/2), in declaration order.
 */
475 DVK(KP555570233, +0.555570233019602224742830813948532874374937191);
476 DVK(KP831469612, +0.831469612302545237078788377617905756738560812);
477 DVK(KP195090322, +0.195090322016128267848284868477022240927691618);
478 DVK(KP980785280, +0.980785280403230449126182236134239036973933731);
479 DVK(KP382683432, +0.382683432365089771728459984030398866761344562);
480 DVK(KP923879532, +0.923879532511286756128183189396788286822416626);
481 DVK(KP707106781, +0.707106781186547524400844362104849039284835938);
482 {
483 INT m;
484 R *x;
485 x = ri;
/* One pass of the loop body handles one group of VL transforms. */
486 for (m = mb, W = W + (mb * ((TWVL / VL) * 62)); m < me; m = m + VL, x = x + (VL * ms), W = W + (TWVL * 62), MAKE_VOLATILE_STRIDE(32, rs)) {
/* Intermediate values that survive from the load stages to the store stages. */
487 V T4, T1A, T2o, T32, Tf, T1v, T2r, T3f, TC, T1C, T2L, T34, Tr, T1D, T2O;
488 V T33, T1k, T20, T2F, T3b, T1r, T21, T2C, T3a, TV, T1X, T2y, T38, T12, T1Y;
489 V T2v, T37;
/* Load stage: points 0, 24, 16, 8. */
490 {
491 V T1, T1z, T3, T1x, T1y, T2, T1w, T2m, T2n;
492 T1 = LD(&(x[0]), ms, &(x[0]));
493 T1y = LD(&(x[WS(rs, 24)]), ms, &(x[0]));
494 T1z = BYTWJ(&(W[TWVL * 46]), T1y);
495 T2 = LD(&(x[WS(rs, 16)]), ms, &(x[0]));
496 T3 = BYTWJ(&(W[TWVL * 30]), T2);
497 T1w = LD(&(x[WS(rs, 8)]), ms, &(x[0]));
498 T1x = BYTWJ(&(W[TWVL * 14]), T1w);
499 T4 = VSUB(T1, T3);
500 T1A = VSUB(T1x, T1z);
501 T2m = VADD(T1, T3);
502 T2n = VADD(T1x, T1z);
503 T2o = VADD(T2m, T2n);
504 T32 = VSUB(T2m, T2n);
505 }
/* Load stage: points 4, 12, 20, 28. */
506 {
507 V T6, Td, T8, Tb;
508 {
509 V T5, Tc, T7, Ta;
510 T5 = LD(&(x[WS(rs, 4)]), ms, &(x[0]));
511 T6 = BYTWJ(&(W[TWVL * 6]), T5);
512 Tc = LD(&(x[WS(rs, 12)]), ms, &(x[0]));
513 Td = BYTWJ(&(W[TWVL * 22]), Tc);
514 T7 = LD(&(x[WS(rs, 20)]), ms, &(x[0]));
515 T8 = BYTWJ(&(W[TWVL * 38]), T7);
516 Ta = LD(&(x[WS(rs, 28)]), ms, &(x[0]));
517 Tb = BYTWJ(&(W[TWVL * 54]), Ta);
518 }
519 {
520 V T9, Te, T2p, T2q;
521 T9 = VSUB(T6, T8);
522 Te = VSUB(Tb, Td);
523 Tf = VMUL(LDK(KP707106781), VADD(T9, Te));
524 T1v = VMUL(LDK(KP707106781), VSUB(Te, T9));
525 T2p = VADD(T6, T8);
526 T2q = VADD(Tb, Td);
527 T2r = VADD(T2p, T2q);
528 T3f = VSUB(T2q, T2p);
529 }
530 }
/* Load stage: points 30, 22, 14, 6. */
531 {
532 V Tt, TA, Tv, Ty;
533 {
534 V Ts, Tz, Tu, Tx;
535 Ts = LD(&(x[WS(rs, 30)]), ms, &(x[0]));
536 Tt = BYTWJ(&(W[TWVL * 58]), Ts);
537 Tz = LD(&(x[WS(rs, 22)]), ms, &(x[0]));
538 TA = BYTWJ(&(W[TWVL * 42]), Tz);
539 Tu = LD(&(x[WS(rs, 14)]), ms, &(x[0]));
540 Tv = BYTWJ(&(W[TWVL * 26]), Tu);
541 Tx = LD(&(x[WS(rs, 6)]), ms, &(x[0]));
542 Ty = BYTWJ(&(W[TWVL * 10]), Tx);
543 }
544 {
545 V Tw, TB, T2J, T2K;
546 Tw = VSUB(Tt, Tv);
547 TB = VSUB(Ty, TA);
548 TC = VFMA(LDK(KP923879532), Tw, VMUL(LDK(KP382683432), TB));
549 T1C = VFNMS(LDK(KP923879532), TB, VMUL(LDK(KP382683432), Tw));
550 T2J = VADD(Tt, Tv);
551 T2K = VADD(Ty, TA);
552 T2L = VADD(T2J, T2K);
553 T34 = VSUB(T2J, T2K);
554 }
555 }
/* Load stage: points 2, 26, 18, 10. */
556 {
557 V Ti, Tp, Tk, Tn;
558 {
559 V Th, To, Tj, Tm;
560 Th = LD(&(x[WS(rs, 2)]), ms, &(x[0]));
561 Ti = BYTWJ(&(W[TWVL * 2]), Th);
562 To = LD(&(x[WS(rs, 26)]), ms, &(x[0]));
563 Tp = BYTWJ(&(W[TWVL * 50]), To);
564 Tj = LD(&(x[WS(rs, 18)]), ms, &(x[0]));
565 Tk = BYTWJ(&(W[TWVL * 34]), Tj);
566 Tm = LD(&(x[WS(rs, 10)]), ms, &(x[0]));
567 Tn = BYTWJ(&(W[TWVL * 18]), Tm);
568 }
569 {
570 V Tl, Tq, T2M, T2N;
571 Tl = VSUB(Ti, Tk);
572 Tq = VSUB(Tn, Tp);
573 Tr = VFNMS(LDK(KP382683432), Tq, VMUL(LDK(KP923879532), Tl));
574 T1D = VFMA(LDK(KP382683432), Tl, VMUL(LDK(KP923879532), Tq));
575 T2M = VADD(Ti, Tk);
576 T2N = VADD(Tn, Tp);
577 T2O = VADD(T2M, T2N);
578 T33 = VSUB(T2M, T2N);
579 }
580 }
/* Load stage: odd points 31, 15, 23, 7, 27, 11, 3, 19. */
581 {
582 V T15, T17, T1p, T1n, T1f, T1h, T1i, T1a, T1c, T1d;
583 {
584 V T14, T16, T1o, T1m;
585 T14 = LD(&(x[WS(rs, 31)]), ms, &(x[WS(rs, 1)]));
586 T15 = BYTWJ(&(W[TWVL * 60]), T14);
587 T16 = LD(&(x[WS(rs, 15)]), ms, &(x[WS(rs, 1)]));
588 T17 = BYTWJ(&(W[TWVL * 28]), T16);
589 T1o = LD(&(x[WS(rs, 23)]), ms, &(x[WS(rs, 1)]));
590 T1p = BYTWJ(&(W[TWVL * 44]), T1o);
591 T1m = LD(&(x[WS(rs, 7)]), ms, &(x[WS(rs, 1)]));
592 T1n = BYTWJ(&(W[TWVL * 12]), T1m);
593 {
594 V T1e, T1g, T19, T1b;
595 T1e = LD(&(x[WS(rs, 27)]), ms, &(x[WS(rs, 1)]));
596 T1f = BYTWJ(&(W[TWVL * 52]), T1e);
597 T1g = LD(&(x[WS(rs, 11)]), ms, &(x[WS(rs, 1)]));
598 T1h = BYTWJ(&(W[TWVL * 20]), T1g);
599 T1i = VSUB(T1f, T1h);
600 T19 = LD(&(x[WS(rs, 3)]), ms, &(x[WS(rs, 1)]));
601 T1a = BYTWJ(&(W[TWVL * 4]), T19);
602 T1b = LD(&(x[WS(rs, 19)]), ms, &(x[WS(rs, 1)]));
603 T1c = BYTWJ(&(W[TWVL * 36]), T1b);
604 T1d = VSUB(T1a, T1c);
605 }
606 }
607 {
608 V T18, T1j, T2D, T2E;
609 T18 = VSUB(T15, T17);
610 T1j = VMUL(LDK(KP707106781), VADD(T1d, T1i));
611 T1k = VADD(T18, T1j);
612 T20 = VSUB(T18, T1j);
613 T2D = VADD(T1a, T1c);
614 T2E = VADD(T1f, T1h);
615 T2F = VADD(T2D, T2E);
616 T3b = VSUB(T2E, T2D);
617 }
618 {
619 V T1l, T1q, T2A, T2B;
620 T1l = VMUL(LDK(KP707106781), VSUB(T1i, T1d));
621 T1q = VSUB(T1n, T1p);
622 T1r = VSUB(T1l, T1q);
623 T21 = VADD(T1q, T1l);
624 T2A = VADD(T15, T17);
625 T2B = VADD(T1n, T1p);
626 T2C = VADD(T2A, T2B);
627 T3a = VSUB(T2A, T2B);
628 }
629 }
/* Load stage: odd points 1, 17, 25, 9, 29, 13, 5, 21. */
630 {
631 V TG, TI, T10, TY, TQ, TS, TT, TL, TN, TO;
632 {
633 V TF, TH, TZ, TX;
634 TF = LD(&(x[WS(rs, 1)]), ms, &(x[WS(rs, 1)]));
635 TG = BYTWJ(&(W[0]), TF);
636 TH = LD(&(x[WS(rs, 17)]), ms, &(x[WS(rs, 1)]));
637 TI = BYTWJ(&(W[TWVL * 32]), TH);
638 TZ = LD(&(x[WS(rs, 25)]), ms, &(x[WS(rs, 1)]));
639 T10 = BYTWJ(&(W[TWVL * 48]), TZ);
640 TX = LD(&(x[WS(rs, 9)]), ms, &(x[WS(rs, 1)]));
641 TY = BYTWJ(&(W[TWVL * 16]), TX);
642 {
643 V TP, TR, TK, TM;
644 TP = LD(&(x[WS(rs, 29)]), ms, &(x[WS(rs, 1)]));
645 TQ = BYTWJ(&(W[TWVL * 56]), TP);
646 TR = LD(&(x[WS(rs, 13)]), ms, &(x[WS(rs, 1)]));
647 TS = BYTWJ(&(W[TWVL * 24]), TR);
648 TT = VSUB(TQ, TS);
649 TK = LD(&(x[WS(rs, 5)]), ms, &(x[WS(rs, 1)]));
650 TL = BYTWJ(&(W[TWVL * 8]), TK);
651 TM = LD(&(x[WS(rs, 21)]), ms, &(x[WS(rs, 1)]));
652 TN = BYTWJ(&(W[TWVL * 40]), TM);
653 TO = VSUB(TL, TN);
654 }
655 }
656 {
657 V TJ, TU, T2w, T2x;
658 TJ = VSUB(TG, TI);
659 TU = VMUL(LDK(KP707106781), VADD(TO, TT));
660 TV = VADD(TJ, TU);
661 T1X = VSUB(TJ, TU);
662 T2w = VADD(TL, TN);
663 T2x = VADD(TQ, TS);
664 T2y = VADD(T2w, T2x);
665 T38 = VSUB(T2x, T2w);
666 }
667 {
668 V TW, T11, T2t, T2u;
669 TW = VMUL(LDK(KP707106781), VSUB(TT, TO));
670 T11 = VSUB(TY, T10);
671 T12 = VSUB(TW, T11);
672 T1Y = VADD(T11, TW);
673 T2t = VADD(TG, TI);
674 T2u = VADD(TY, T10);
675 T2v = VADD(T2t, T2u);
676 T37 = VSUB(T2t, T2u);
677 }
678 }
/* Store stage: outputs 16, 8, 0, 24 (built from the all-sum terms). */
679 {
680 V T2W, T30, T2Z, T31;
681 {
682 V T2U, T2V, T2X, T2Y;
683 T2U = VADD(T2o, T2r);
684 T2V = VADD(T2O, T2L);
685 T2W = VADD(T2U, T2V);
686 T30 = VSUB(T2U, T2V);
687 T2X = VADD(T2v, T2y);
688 T2Y = VADD(T2C, T2F);
689 T2Z = VADD(T2X, T2Y);
690 T31 = VBYI(VSUB(T2Y, T2X));
691 }
692 ST(&(x[WS(rs, 16)]), VSUB(T2W, T2Z), ms, &(x[0]));
693 ST(&(x[WS(rs, 8)]), VADD(T30, T31), ms, &(x[0]));
694 ST(&(x[0]), VADD(T2W, T2Z), ms, &(x[0]));
695 ST(&(x[WS(rs, 24)]), VSUB(T30, T31), ms, &(x[0]));
696 }
/* Store stage: outputs 28, 4, 20, 12. */
697 {
698 V T2s, T2P, T2H, T2Q, T2z, T2G;
699 T2s = VSUB(T2o, T2r);
700 T2P = VSUB(T2L, T2O);
701 T2z = VSUB(T2v, T2y);
702 T2G = VSUB(T2C, T2F);
703 T2H = VMUL(LDK(KP707106781), VADD(T2z, T2G));
704 T2Q = VMUL(LDK(KP707106781), VSUB(T2G, T2z));
705 {
706 V T2I, T2R, T2S, T2T;
707 T2I = VADD(T2s, T2H);
708 T2R = VBYI(VADD(T2P, T2Q));
709 ST(&(x[WS(rs, 28)]), VSUB(T2I, T2R), ms, &(x[0]));
710 ST(&(x[WS(rs, 4)]), VADD(T2I, T2R), ms, &(x[0]));
711 T2S = VSUB(T2s, T2H);
712 T2T = VBYI(VSUB(T2Q, T2P));
713 ST(&(x[WS(rs, 20)]), VSUB(T2S, T2T), ms, &(x[0]));
714 ST(&(x[WS(rs, 12)]), VADD(T2S, T2T), ms, &(x[0]));
715 }
716 }
/* Store stage: outputs 30, 2, 6, 26, 18, 14, 10, 22. */
717 {
718 V T36, T3r, T3h, T3p, T3d, T3o, T3k, T3s, T35, T3g;
719 T35 = VMUL(LDK(KP707106781), VADD(T33, T34));
720 T36 = VADD(T32, T35);
721 T3r = VSUB(T32, T35);
722 T3g = VMUL(LDK(KP707106781), VSUB(T34, T33));
723 T3h = VADD(T3f, T3g);
724 T3p = VSUB(T3g, T3f);
725 {
726 V T39, T3c, T3i, T3j;
727 T39 = VFMA(LDK(KP923879532), T37, VMUL(LDK(KP382683432), T38));
728 T3c = VFNMS(LDK(KP382683432), T3b, VMUL(LDK(KP923879532), T3a));
729 T3d = VADD(T39, T3c);
730 T3o = VSUB(T3c, T39);
731 T3i = VFNMS(LDK(KP382683432), T37, VMUL(LDK(KP923879532), T38));
732 T3j = VFMA(LDK(KP382683432), T3a, VMUL(LDK(KP923879532), T3b));
733 T3k = VADD(T3i, T3j);
734 T3s = VSUB(T3j, T3i);
735 }
736 {
737 V T3e, T3l, T3u, T3v;
738 T3e = VADD(T36, T3d);
739 T3l = VBYI(VADD(T3h, T3k));
740 ST(&(x[WS(rs, 30)]), VSUB(T3e, T3l), ms, &(x[0]));
741 ST(&(x[WS(rs, 2)]), VADD(T3e, T3l), ms, &(x[0]));
742 T3u = VBYI(VADD(T3p, T3o));
743 T3v = VADD(T3r, T3s);
744 ST(&(x[WS(rs, 6)]), VADD(T3u, T3v), ms, &(x[0]));
745 ST(&(x[WS(rs, 26)]), VSUB(T3v, T3u), ms, &(x[0]));
746 }
747 {
748 V T3m, T3n, T3q, T3t;
749 T3m = VSUB(T36, T3d);
750 T3n = VBYI(VSUB(T3k, T3h));
751 ST(&(x[WS(rs, 18)]), VSUB(T3m, T3n), ms, &(x[0]));
752 ST(&(x[WS(rs, 14)]), VADD(T3m, T3n), ms, &(x[0]));
753 T3q = VBYI(VSUB(T3o, T3p));
754 T3t = VSUB(T3r, T3s);
755 ST(&(x[WS(rs, 10)]), VADD(T3q, T3t), ms, &(x[0]));
756 ST(&(x[WS(rs, 22)]), VSUB(T3t, T3q), ms, &(x[0]));
757 }
758 }
/* Store stage: odd outputs 31, 1, 7, 25, 17, 15, 9, 23. */
759 {
760 V TE, T1P, T1I, T1Q, T1t, T1M, T1F, T1N;
761 {
762 V Tg, TD, T1G, T1H;
763 Tg = VADD(T4, Tf);
764 TD = VADD(Tr, TC);
765 TE = VADD(Tg, TD);
766 T1P = VSUB(Tg, TD);
767 T1G = VFNMS(LDK(KP195090322), TV, VMUL(LDK(KP980785280), T12));
768 T1H = VFMA(LDK(KP195090322), T1k, VMUL(LDK(KP980785280), T1r));
769 T1I = VADD(T1G, T1H);
770 T1Q = VSUB(T1H, T1G);
771 }
772 {
773 V T13, T1s, T1B, T1E;
774 T13 = VFMA(LDK(KP980785280), TV, VMUL(LDK(KP195090322), T12));
775 T1s = VFNMS(LDK(KP195090322), T1r, VMUL(LDK(KP980785280), T1k));
776 T1t = VADD(T13, T1s);
777 T1M = VSUB(T1s, T13);
778 T1B = VSUB(T1v, T1A);
779 T1E = VSUB(T1C, T1D);
780 T1F = VADD(T1B, T1E);
781 T1N = VSUB(T1E, T1B);
782 }
783 {
784 V T1u, T1J, T1S, T1T;
785 T1u = VADD(TE, T1t);
786 T1J = VBYI(VADD(T1F, T1I));
787 ST(&(x[WS(rs, 31)]), VSUB(T1u, T1J), ms, &(x[WS(rs, 1)]));
788 ST(&(x[WS(rs, 1)]), VADD(T1u, T1J), ms, &(x[WS(rs, 1)]));
789 T1S = VBYI(VADD(T1N, T1M));
790 T1T = VADD(T1P, T1Q);
791 ST(&(x[WS(rs, 7)]), VADD(T1S, T1T), ms, &(x[WS(rs, 1)]));
792 ST(&(x[WS(rs, 25)]), VSUB(T1T, T1S), ms, &(x[WS(rs, 1)]));
793 }
794 {
795 V T1K, T1L, T1O, T1R;
796 T1K = VSUB(TE, T1t);
797 T1L = VBYI(VSUB(T1I, T1F));
798 ST(&(x[WS(rs, 17)]), VSUB(T1K, T1L), ms, &(x[WS(rs, 1)]));
799 ST(&(x[WS(rs, 15)]), VADD(T1K, T1L), ms, &(x[WS(rs, 1)]));
800 T1O = VBYI(VSUB(T1M, T1N));
801 T1R = VSUB(T1P, T1Q);
802 ST(&(x[WS(rs, 9)]), VADD(T1O, T1R), ms, &(x[WS(rs, 1)]));
803 ST(&(x[WS(rs, 23)]), VSUB(T1R, T1O), ms, &(x[WS(rs, 1)]));
804 }
805 }
/* Store stage: odd outputs 29, 3, 5, 27, 19, 13, 11, 21. */
806 {
807 V T1W, T2h, T2a, T2i, T23, T2e, T27, T2f;
808 {
809 V T1U, T1V, T28, T29;
810 T1U = VSUB(T4, Tf);
811 T1V = VADD(T1D, T1C);
812 T1W = VADD(T1U, T1V);
813 T2h = VSUB(T1U, T1V);
814 T28 = VFNMS(LDK(KP555570233), T1X, VMUL(LDK(KP831469612), T1Y));
815 T29 = VFMA(LDK(KP555570233), T20, VMUL(LDK(KP831469612), T21));
816 T2a = VADD(T28, T29);
817 T2i = VSUB(T29, T28);
818 }
819 {
820 V T1Z, T22, T25, T26;
821 T1Z = VFMA(LDK(KP831469612), T1X, VMUL(LDK(KP555570233), T1Y));
822 T22 = VFNMS(LDK(KP555570233), T21, VMUL(LDK(KP831469612), T20));
823 T23 = VADD(T1Z, T22);
824 T2e = VSUB(T22, T1Z);
825 T25 = VADD(T1A, T1v);
826 T26 = VSUB(TC, Tr);
827 T27 = VADD(T25, T26);
828 T2f = VSUB(T26, T25);
829 }
830 {
831 V T24, T2b, T2k, T2l;
832 T24 = VADD(T1W, T23);
833 T2b = VBYI(VADD(T27, T2a));
834 ST(&(x[WS(rs, 29)]), VSUB(T24, T2b), ms, &(x[WS(rs, 1)]));
835 ST(&(x[WS(rs, 3)]), VADD(T24, T2b), ms, &(x[WS(rs, 1)]));
836 T2k = VBYI(VADD(T2f, T2e));
837 T2l = VADD(T2h, T2i);
838 ST(&(x[WS(rs, 5)]), VADD(T2k, T2l), ms, &(x[WS(rs, 1)]));
839 ST(&(x[WS(rs, 27)]), VSUB(T2l, T2k), ms, &(x[WS(rs, 1)]));
840 }
841 {
842 V T2c, T2d, T2g, T2j;
843 T2c = VSUB(T1W, T23);
844 T2d = VBYI(VSUB(T2a, T27));
845 ST(&(x[WS(rs, 19)]), VSUB(T2c, T2d), ms, &(x[WS(rs, 1)]));
846 ST(&(x[WS(rs, 13)]), VADD(T2c, T2d), ms, &(x[WS(rs, 1)]));
847 T2g = VBYI(VSUB(T2e, T2f));
848 T2j = VSUB(T2h, T2i);
849 ST(&(x[WS(rs, 11)]), VADD(T2g, T2j), ms, &(x[WS(rs, 1)]));
850 ST(&(x[WS(rs, 21)]), VSUB(T2j, T2g), ms, &(x[WS(rs, 1)]));
851 }
852 }
853 }
854 }
/* VLEAVE is a macro from the SIMD headers (not visible here); it is invoked once after the loop. */
855 VLEAVE();
856 }
857
/*
 * Twiddle instruction table for the non-FMA build of t2fv_32: one VTW(0, k)
 * entry for each k = 1..31, followed by a {TW_NEXT, VL, 0} entry that ends
 * the list.  It is referenced only from the ct_desc initializer below.
 * NOTE(review): VTW, TW_NEXT and tw_instr are defined in headers not visible
 * here -- confirm the exact layout VTW expands to.
 */
858 static const tw_instr twinstr[] = {
859 VTW(0, 1),
860 VTW(0, 2),
861 VTW(0, 3),
862 VTW(0, 4),
863 VTW(0, 5),
864 VTW(0, 6),
865 VTW(0, 7),
866 VTW(0, 8),
867 VTW(0, 9),
868 VTW(0, 10),
869 VTW(0, 11),
870 VTW(0, 12),
871 VTW(0, 13),
872 VTW(0, 14),
873 VTW(0, 15),
874 VTW(0, 16),
875 VTW(0, 17),
876 VTW(0, 18),
877 VTW(0, 19),
878 VTW(0, 20),
879 VTW(0, 21),
880 VTW(0, 22),
881 VTW(0, 23),
882 VTW(0, 24),
883 VTW(0, 25),
884 VTW(0, 26),
885 VTW(0, 27),
886 VTW(0, 28),
887 VTW(0, 29),
888 VTW(0, 30),
889 VTW(0, 31),
890 {TW_NEXT, VL, 0}
891 };
892
/*
 * Descriptor for the non-FMA build: size 32, the codelet name string, the
 * twiddle table above and &GENUS.  {201, 88, 16, 0} repeats the operation
 * counts from the generated header comment (201 additions, 88
 * multiplications, 16 fused multiply/add).
 * NOTE(review): the meaning of the trailing zero fields is set by ct_desc,
 * which is declared elsewhere -- confirm before changing them.
 */
893 static const ct_desc desc = { 32, XSIMD_STRING("t2fv_32"), twinstr, &GENUS, {201, 88, 16, 0}, 0, 0, 0 };
894
/*
 * Registration entry point (non-FMA build): hands the t2fv_32 function and
 * its descriptor to the planner p through X(kdft_dit_register).  The
 * exported symbol name is produced by the XSIMD macro, defined elsewhere.
 */
895 void XSIMD(codelet_t2fv_32) (planner *p) {
896 X(kdft_dit_register) (p, t2fv_32, &desc);
897 }
898 #endif