comparison src/fftw-3.3.3/dft/simd/common/t2fv_32.c @ 10:37bf6b4a2645

Add FFTW3
author Chris Cannam
date Wed, 20 Mar 2013 15:35:50 +0000
parents
children
comparison
equal deleted inserted replaced
9:c0fb53affa76 10:37bf6b4a2645
1 /*
2 * Copyright (c) 2003, 2007-11 Matteo Frigo
3 * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
4 *
5 * This program is free software; you can redistribute it and/or modify
6 * it under the terms of the GNU General Public License as published by
7 * the Free Software Foundation; either version 2 of the License, or
8 * (at your option) any later version.
9 *
10 * This program is distributed in the hope that it will be useful,
11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 * GNU General Public License for more details.
14 *
15 * You should have received a copy of the GNU General Public License
16 * along with this program; if not, write to the Free Software
17 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
18 *
19 */
20
21 /* This file was automatically generated --- DO NOT EDIT */
22 /* Generated on Sun Nov 25 07:38:36 EST 2012 */
23
24 #include "codelet-dft.h"
25
26 #ifdef HAVE_FMA
27
28 /* Generated by: ../../../genfft/gen_twiddle_c.native -fma -reorder-insns -schedule-for-pipeline -simd -compact -variables 4 -pipeline-latency 8 -n 32 -name t2fv_32 -include t2f.h */
29
30 /*
31 * This function contains 217 FP additions, 160 FP multiplications,
32 * (or, 119 additions, 62 multiplications, 98 fused multiply/add),
33 * 112 stack variables, 7 constants, and 64 memory accesses
34 */
35 #include "t2f.h"
36
/*
 * t2fv_32 (HAVE_FMA variant): genfft-generated twiddle codelet operating on
 * the 32 elements x[WS(rs, 0)] .. x[WS(rs, 31)], with x starting at ri.
 *
 * Parameters, as used by the body:
 *   ri      base pointer; every load and store goes through x = ri.
 *   ii      not referenced anywhere in the body.
 *   W       twiddle table; advanced by mb * ((TWVL / VL) * 62) before the
 *           first pass and by TWVL * 62 after each pass.
 *   rs      stride handed to WS() to address the 32 elements.
 *   mb, me  the loop starts at m = mb and runs while m < me, in steps of VL.
 *   ms      x advances by VL * ms per pass; also forwarded to LD()/ST().
 *
 * Each pass loads the 32 inputs, multiplies input k (k = 1..31) by the
 * twiddle at W[TWVL * 2 * (k - 1)] through BYTWJ (x[0] is used as loaded),
 * combines them with VADD/VSUB/VFMA/VFNMS, and stores 32 results back to
 * the same x locations.  VLEAVE() is invoked once after the loop.
 */
37 static void t2fv_32(R *ri, R *ii, const R *W, stride rs, INT mb, INT me, INT ms)
38 {
/* Multiplier constants; referenced below through LDK(). */
39 DVK(KP831469612, +0.831469612302545237078788377617905756738560812);
40 DVK(KP980785280, +0.980785280403230449126182236134239036973933731);
41 DVK(KP668178637, +0.668178637919298919997757686523080761552472251);
42 DVK(KP198912367, +0.198912367379658006911597622644676228597850501);
43 DVK(KP923879532, +0.923879532511286756128183189396788286822416626);
44 DVK(KP707106781, +0.707106781186547524400844362104849039284835938);
45 DVK(KP414213562, +0.414213562373095048801688724209698078569671875);
46 {
47 INT m;
48 R *x;
49 x = ri;
/* m advances by VL per pass; x and W are advanced in the same loop header. */
50 for (m = mb, W = W + (mb * ((TWVL / VL) * 62)); m < me; m = m + VL, x = x + (VL * ms), W = W + (TWVL * 62), MAKE_VOLATILE_STRIDE(32, rs)) {
/* Declared at loop-body scope because the last store group (outputs 21, 11,
 * 27, 5, 3, 29, 19, 13) still needs them after the inner scopes close. */
51 V T26, T25, T1Z, T22, T1W, T2a, T2k, T2g;
52 {
53 V T4, T1z, T2o, T32, T2r, T3f, Tf, T1A, T34, T2L, T1D, TC, T33, T2O, T1C;
54 V Tr, T2C, T3a, T2F, T3b, T1r, T21, T1k, T20, TQ, TM, TS, TL, T2t, TJ;
55 V T10, T2u;
56 {
57 V Tt, T9, T2p, Te, T2q, TA, Tu, Tx;
58 {
59 V T1, T1x, T2, T1v;
/* Loads of the even-indexed inputs start here; their BYTWJ twiddle
 * multiplies are interleaved with later loads. */
60 T1 = LD(&(x[0]), ms, &(x[0]));
61 T1x = LD(&(x[WS(rs, 24)]), ms, &(x[0]));
62 T2 = LD(&(x[WS(rs, 16)]), ms, &(x[0]));
63 T1v = LD(&(x[WS(rs, 8)]), ms, &(x[0]));
64 {
65 V T5, Tc, T7, Ta, T2m, T2n;
66 T5 = LD(&(x[WS(rs, 4)]), ms, &(x[0]));
67 Tc = LD(&(x[WS(rs, 12)]), ms, &(x[0]));
68 T7 = LD(&(x[WS(rs, 20)]), ms, &(x[0]));
69 Ta = LD(&(x[WS(rs, 28)]), ms, &(x[0]));
70 {
71 V T1y, T3, T1w, T6, Td, T8, Tb, Ts, Tz;
72 Ts = LD(&(x[WS(rs, 30)]), ms, &(x[0]));
73 T1y = BYTWJ(&(W[TWVL * 46]), T1x);
74 T3 = BYTWJ(&(W[TWVL * 30]), T2);
75 T1w = BYTWJ(&(W[TWVL * 14]), T1v);
76 T6 = BYTWJ(&(W[TWVL * 6]), T5);
77 Td = BYTWJ(&(W[TWVL * 22]), Tc);
78 T8 = BYTWJ(&(W[TWVL * 38]), T7);
79 Tb = BYTWJ(&(W[TWVL * 54]), Ta);
80 Tt = BYTWJ(&(W[TWVL * 58]), Ts);
81 Tz = LD(&(x[WS(rs, 6)]), ms, &(x[0]));
82 T4 = VSUB(T1, T3);
83 T2m = VADD(T1, T3);
84 T1z = VSUB(T1w, T1y);
85 T2n = VADD(T1w, T1y);
86 T9 = VSUB(T6, T8);
87 T2p = VADD(T6, T8);
88 Te = VSUB(Tb, Td);
89 T2q = VADD(Tb, Td);
90 TA = BYTWJ(&(W[TWVL * 10]), Tz);
91 }
92 Tu = LD(&(x[WS(rs, 14)]), ms, &(x[0]));
93 T2o = VADD(T2m, T2n);
94 T32 = VSUB(T2m, T2n);
95 Tx = LD(&(x[WS(rs, 22)]), ms, &(x[0]));
96 }
97 }
98 {
99 V Tv, To, Ty, Ti, Tj, Tm, Th;
100 Th = LD(&(x[WS(rs, 2)]), ms, &(x[0]));
101 T2r = VADD(T2p, T2q);
102 T3f = VSUB(T2q, T2p);
103 Tf = VADD(T9, Te);
104 T1A = VSUB(Te, T9);
105 Tv = BYTWJ(&(W[TWVL * 26]), Tu);
106 To = LD(&(x[WS(rs, 26)]), ms, &(x[0]));
107 Ty = BYTWJ(&(W[TWVL * 42]), Tx);
108 Ti = BYTWJ(&(W[TWVL * 2]), Th);
109 Tj = LD(&(x[WS(rs, 18)]), ms, &(x[0]));
110 Tm = LD(&(x[WS(rs, 10)]), ms, &(x[0]));
111 {
112 V T1f, T1h, T1a, T1c, T18, T2A, T2B, T1p;
113 {
114 V T15, T17, T1o, T1m;
115 {
116 V Tw, T2J, Tp, T2K, TB, Tk, Tn, T1n, T14, T16;
/* Loads of the odd-indexed inputs start here. */
117 T14 = LD(&(x[WS(rs, 31)]), ms, &(x[WS(rs, 1)]));
118 T16 = LD(&(x[WS(rs, 15)]), ms, &(x[WS(rs, 1)]));
119 Tw = VSUB(Tt, Tv);
120 T2J = VADD(Tt, Tv);
121 Tp = BYTWJ(&(W[TWVL * 50]), To);
122 T2K = VADD(TA, Ty);
123 TB = VSUB(Ty, TA);
124 Tk = BYTWJ(&(W[TWVL * 34]), Tj);
125 Tn = BYTWJ(&(W[TWVL * 18]), Tm);
126 T15 = BYTWJ(&(W[TWVL * 60]), T14);
127 T17 = BYTWJ(&(W[TWVL * 28]), T16);
128 T1n = LD(&(x[WS(rs, 7)]), ms, &(x[WS(rs, 1)]));
129 {
130 V T2M, Tl, T2N, Tq, T1l;
131 T1l = LD(&(x[WS(rs, 23)]), ms, &(x[WS(rs, 1)]));
132 T34 = VSUB(T2J, T2K);
133 T2L = VADD(T2J, T2K);
134 T1D = VFMA(LDK(KP414213562), Tw, TB);
135 TC = VFNMS(LDK(KP414213562), TB, Tw);
136 T2M = VADD(Ti, Tk);
137 Tl = VSUB(Ti, Tk);
138 T2N = VADD(Tn, Tp);
139 Tq = VSUB(Tn, Tp);
140 T1o = BYTWJ(&(W[TWVL * 12]), T1n);
141 T1m = BYTWJ(&(W[TWVL * 44]), T1l);
142 {
143 V T1e, T1g, T19, T1b;
144 T1e = LD(&(x[WS(rs, 27)]), ms, &(x[WS(rs, 1)]));
145 T1g = LD(&(x[WS(rs, 11)]), ms, &(x[WS(rs, 1)]));
146 T19 = LD(&(x[WS(rs, 3)]), ms, &(x[WS(rs, 1)]));
147 T1b = LD(&(x[WS(rs, 19)]), ms, &(x[WS(rs, 1)]));
148 T33 = VSUB(T2M, T2N);
149 T2O = VADD(T2M, T2N);
150 T1C = VFMA(LDK(KP414213562), Tl, Tq);
151 Tr = VFNMS(LDK(KP414213562), Tq, Tl);
152 T1f = BYTWJ(&(W[TWVL * 52]), T1e);
153 T1h = BYTWJ(&(W[TWVL * 20]), T1g);
154 T1a = BYTWJ(&(W[TWVL * 4]), T19);
155 T1c = BYTWJ(&(W[TWVL * 36]), T1b);
156 }
157 }
158 }
159 T18 = VSUB(T15, T17);
160 T2A = VADD(T15, T17);
161 T2B = VADD(T1o, T1m);
162 T1p = VSUB(T1m, T1o);
163 }
164 {
165 V TG, TI, TZ, TX;
166 {
167 V T1i, T2E, T1d, T2D, TH, TY, TF;
168 TF = LD(&(x[WS(rs, 1)]), ms, &(x[WS(rs, 1)]));
169 T1i = VSUB(T1f, T1h);
170 T2E = VADD(T1f, T1h);
171 T1d = VSUB(T1a, T1c);
172 T2D = VADD(T1a, T1c);
173 TH = LD(&(x[WS(rs, 17)]), ms, &(x[WS(rs, 1)]));
174 TY = LD(&(x[WS(rs, 25)]), ms, &(x[WS(rs, 1)]));
175 T2C = VADD(T2A, T2B);
176 T3a = VSUB(T2A, T2B);
177 TG = BYTWJ(&(W[0]), TF);
178 {
179 V TW, T1j, T1q, TP, TR, TK;
180 TW = LD(&(x[WS(rs, 9)]), ms, &(x[WS(rs, 1)]));
181 T2F = VADD(T2D, T2E);
182 T3b = VSUB(T2E, T2D);
183 T1j = VADD(T1d, T1i);
184 T1q = VSUB(T1i, T1d);
185 TI = BYTWJ(&(W[TWVL * 32]), TH);
186 TZ = BYTWJ(&(W[TWVL * 48]), TY);
187 TP = LD(&(x[WS(rs, 29)]), ms, &(x[WS(rs, 1)]));
188 TX = BYTWJ(&(W[TWVL * 16]), TW);
189 TR = LD(&(x[WS(rs, 13)]), ms, &(x[WS(rs, 1)]));
190 TK = LD(&(x[WS(rs, 5)]), ms, &(x[WS(rs, 1)]));
191 T1r = VFMA(LDK(KP707106781), T1q, T1p);
192 T21 = VFNMS(LDK(KP707106781), T1q, T1p);
193 T1k = VFMA(LDK(KP707106781), T1j, T18);
194 T20 = VFNMS(LDK(KP707106781), T1j, T18);
195 TQ = BYTWJ(&(W[TWVL * 56]), TP);
196 TM = LD(&(x[WS(rs, 21)]), ms, &(x[WS(rs, 1)]));
197 TS = BYTWJ(&(W[TWVL * 24]), TR);
198 TL = BYTWJ(&(W[TWVL * 8]), TK);
199 }
200 }
201 T2t = VADD(TG, TI);
202 TJ = VSUB(TG, TI);
203 T10 = VSUB(TX, TZ);
204 T2u = VADD(TX, TZ);
205 }
206 }
207 }
208 }
209 {
210 V T2s, TT, T2x, T2P, T2Y, T2G, T37, T2v, T2w, TO, T2W, T30, T2U, TN, T2V;
211 T2s = VSUB(T2o, T2r);
212 T2U = VADD(T2o, T2r);
/* Last twiddle multiply: input 21, loaded into TM above. */
213 TN = BYTWJ(&(W[TWVL * 40]), TM);
214 TT = VSUB(TQ, TS);
215 T2x = VADD(TQ, TS);
216 T2P = VSUB(T2L, T2O);
217 T2V = VADD(T2O, T2L);
218 T2Y = VADD(T2C, T2F);
219 T2G = VSUB(T2C, T2F);
220 T37 = VSUB(T2t, T2u);
221 T2v = VADD(T2t, T2u);
222 T2w = VADD(TL, TN);
223 TO = VSUB(TL, TN);
224 T2W = VADD(T2U, T2V);
225 T30 = VSUB(T2U, T2V);
226 {
227 V T3i, T3o, T36, T3r, T3h, T3j, T12, T1Y, TV, T1X, T3s, T3d, T2Q, T2H, T31;
228 V T2Z;
229 {
230 V T35, T3g, T38, T2y, T11, TU;
231 T35 = VADD(T33, T34);
232 T3g = VSUB(T34, T33);
233 T38 = VSUB(T2w, T2x);
234 T2y = VADD(T2w, T2x);
235 T11 = VSUB(TO, TT);
236 TU = VADD(TO, TT);
237 {
238 V T3c, T39, T2X, T2z;
239 T3c = VFNMS(LDK(KP414213562), T3b, T3a);
240 T3i = VFMA(LDK(KP414213562), T3a, T3b);
241 T3o = VFNMS(LDK(KP707106781), T35, T32);
242 T36 = VFMA(LDK(KP707106781), T35, T32);
243 T3r = VFNMS(LDK(KP707106781), T3g, T3f);
244 T3h = VFMA(LDK(KP707106781), T3g, T3f);
245 T39 = VFNMS(LDK(KP414213562), T38, T37);
246 T3j = VFMA(LDK(KP414213562), T37, T38);
247 T2X = VADD(T2v, T2y);
248 T2z = VSUB(T2v, T2y);
249 T12 = VFMA(LDK(KP707106781), T11, T10);
250 T1Y = VFNMS(LDK(KP707106781), T11, T10);
251 TV = VFMA(LDK(KP707106781), TU, TJ);
252 T1X = VFNMS(LDK(KP707106781), TU, TJ);
253 T3s = VSUB(T3c, T39);
254 T3d = VADD(T39, T3c);
255 T2Q = VSUB(T2G, T2z);
256 T2H = VADD(T2z, T2G);
257 T31 = VSUB(T2Y, T2X);
258 T2Z = VADD(T2X, T2Y);
259 }
260 }
261 {
262 V Tg, T1U, TD, T1G, T13, T1s, T1H, T1B, T1V, T1E, T3k, T3p, T2e, T2f;
263 Tg = VFMA(LDK(KP707106781), Tf, T4);
264 T1U = VFNMS(LDK(KP707106781), Tf, T4);
265 T3k = VSUB(T3i, T3j);
266 T3p = VADD(T3j, T3i);
267 {
268 V T3v, T3t, T3e, T3m;
269 T3v = VFNMS(LDK(KP923879532), T3s, T3r);
270 T3t = VFMA(LDK(KP923879532), T3s, T3r);
271 T3e = VFNMS(LDK(KP923879532), T3d, T36);
272 T3m = VFMA(LDK(KP923879532), T3d, T36);
273 {
274 V T2R, T2T, T2I, T2S;
275 T2R = VFNMS(LDK(KP707106781), T2Q, T2P);
276 T2T = VFMA(LDK(KP707106781), T2Q, T2P);
277 T2I = VFNMS(LDK(KP707106781), T2H, T2s);
278 T2S = VFMA(LDK(KP707106781), T2H, T2s);
/* Stores of the even-indexed outputs start here (24, 8, 0, 16 first). */
279 ST(&(x[WS(rs, 24)]), VFNMSI(T31, T30), ms, &(x[0]));
280 ST(&(x[WS(rs, 8)]), VFMAI(T31, T30), ms, &(x[0]));
281 ST(&(x[0]), VADD(T2W, T2Z), ms, &(x[0]));
282 ST(&(x[WS(rs, 16)]), VSUB(T2W, T2Z), ms, &(x[0]));
283 {
284 V T3u, T3q, T3l, T3n;
285 T3u = VFMA(LDK(KP923879532), T3p, T3o);
286 T3q = VFNMS(LDK(KP923879532), T3p, T3o);
287 T3l = VFNMS(LDK(KP923879532), T3k, T3h);
288 T3n = VFMA(LDK(KP923879532), T3k, T3h);
/* Remaining even-indexed outputs. */
289 ST(&(x[WS(rs, 4)]), VFMAI(T2T, T2S), ms, &(x[0]));
290 ST(&(x[WS(rs, 28)]), VFNMSI(T2T, T2S), ms, &(x[0]));
291 ST(&(x[WS(rs, 20)]), VFMAI(T2R, T2I), ms, &(x[0]));
292 ST(&(x[WS(rs, 12)]), VFNMSI(T2R, T2I), ms, &(x[0]));
293 ST(&(x[WS(rs, 22)]), VFNMSI(T3t, T3q), ms, &(x[0]));
294 ST(&(x[WS(rs, 10)]), VFMAI(T3t, T3q), ms, &(x[0]));
295 ST(&(x[WS(rs, 26)]), VFMAI(T3v, T3u), ms, &(x[0]));
296 ST(&(x[WS(rs, 6)]), VFNMSI(T3v, T3u), ms, &(x[0]));
297 ST(&(x[WS(rs, 2)]), VFMAI(T3n, T3m), ms, &(x[0]));
298 ST(&(x[WS(rs, 30)]), VFNMSI(T3n, T3m), ms, &(x[0]));
299 ST(&(x[WS(rs, 18)]), VFMAI(T3l, T3e), ms, &(x[0]));
300 ST(&(x[WS(rs, 14)]), VFNMSI(T3l, T3e), ms, &(x[0]));
301 T26 = VSUB(TC, Tr);
302 TD = VADD(Tr, TC);
303 }
304 }
305 }
306 T1G = VFMA(LDK(KP198912367), TV, T12);
307 T13 = VFNMS(LDK(KP198912367), T12, TV);
308 T1s = VFNMS(LDK(KP198912367), T1r, T1k);
309 T1H = VFMA(LDK(KP198912367), T1k, T1r);
310 T1B = VFNMS(LDK(KP707106781), T1A, T1z);
311 T25 = VFMA(LDK(KP707106781), T1A, T1z);
312 T1V = VADD(T1C, T1D);
313 T1E = VSUB(T1C, T1D);
314 {
315 V T1S, T1O, T1K, T1u, T1R, T1T, T1L, T1J;
316 {
317 V TE, T1M, T1I, T1N, T1t, T1Q, T1F, T1P, T28, T29;
318 TE = VFMA(LDK(KP923879532), TD, Tg);
319 T1M = VFNMS(LDK(KP923879532), TD, Tg);
320 T1I = VSUB(T1G, T1H);
321 T1N = VADD(T1G, T1H);
322 T1t = VADD(T13, T1s);
323 T1Q = VSUB(T1s, T13);
324 T1F = VFMA(LDK(KP923879532), T1E, T1B);
325 T1P = VFNMS(LDK(KP923879532), T1E, T1B);
326 T28 = VFNMS(LDK(KP668178637), T1X, T1Y);
327 T1Z = VFMA(LDK(KP668178637), T1Y, T1X);
328 T1S = VFMA(LDK(KP980785280), T1N, T1M);
329 T1O = VFNMS(LDK(KP980785280), T1N, T1M);
330 T22 = VFMA(LDK(KP668178637), T21, T20);
331 T29 = VFNMS(LDK(KP668178637), T20, T21);
332 T1K = VFMA(LDK(KP980785280), T1t, TE);
333 T1u = VFNMS(LDK(KP980785280), T1t, TE);
334 T1R = VFNMS(LDK(KP980785280), T1Q, T1P);
335 T1T = VFMA(LDK(KP980785280), T1Q, T1P);
336 T1L = VFMA(LDK(KP980785280), T1I, T1F);
337 T1J = VFNMS(LDK(KP980785280), T1I, T1F);
338 T2e = VFNMS(LDK(KP923879532), T1V, T1U);
339 T1W = VFMA(LDK(KP923879532), T1V, T1U);
340 T2a = VSUB(T28, T29);
341 T2f = VADD(T28, T29);
342 }
/* Odd-indexed outputs 23, 9, 25, 7, 31, 1, 15, 17. */
343 ST(&(x[WS(rs, 23)]), VFMAI(T1R, T1O), ms, &(x[WS(rs, 1)]));
344 ST(&(x[WS(rs, 9)]), VFNMSI(T1R, T1O), ms, &(x[WS(rs, 1)]));
345 ST(&(x[WS(rs, 25)]), VFNMSI(T1T, T1S), ms, &(x[WS(rs, 1)]));
346 ST(&(x[WS(rs, 7)]), VFMAI(T1T, T1S), ms, &(x[WS(rs, 1)]));
347 ST(&(x[WS(rs, 31)]), VFMAI(T1L, T1K), ms, &(x[WS(rs, 1)]));
348 ST(&(x[WS(rs, 1)]), VFNMSI(T1L, T1K), ms, &(x[WS(rs, 1)]));
349 ST(&(x[WS(rs, 15)]), VFMAI(T1J, T1u), ms, &(x[WS(rs, 1)]));
350 ST(&(x[WS(rs, 17)]), VFNMSI(T1J, T1u), ms, &(x[WS(rs, 1)]));
351 }
352 T2k = VFNMS(LDK(KP831469612), T2f, T2e);
353 T2g = VFMA(LDK(KP831469612), T2f, T2e);
354 }
355 }
356 }
357 }
358 {
359 V T2i, T23, T2h, T27;
360 T2i = VSUB(T22, T1Z);
361 T23 = VADD(T1Z, T22);
362 T2h = VFNMS(LDK(KP923879532), T26, T25);
363 T27 = VFMA(LDK(KP923879532), T26, T25);
364 {
365 V T2c, T24, T2j, T2l, T2d, T2b;
366 T2c = VFMA(LDK(KP831469612), T23, T1W);
367 T24 = VFNMS(LDK(KP831469612), T23, T1W);
368 T2j = VFMA(LDK(KP831469612), T2i, T2h);
369 T2l = VFNMS(LDK(KP831469612), T2i, T2h);
370 T2d = VFMA(LDK(KP831469612), T2a, T27);
371 T2b = VFNMS(LDK(KP831469612), T2a, T27);
/* Remaining odd-indexed outputs: 21, 11, 27, 5, 3, 29, 19, 13. */
372 ST(&(x[WS(rs, 21)]), VFNMSI(T2j, T2g), ms, &(x[WS(rs, 1)]));
373 ST(&(x[WS(rs, 11)]), VFMAI(T2j, T2g), ms, &(x[WS(rs, 1)]));
374 ST(&(x[WS(rs, 27)]), VFMAI(T2l, T2k), ms, &(x[WS(rs, 1)]));
375 ST(&(x[WS(rs, 5)]), VFNMSI(T2l, T2k), ms, &(x[WS(rs, 1)]));
376 ST(&(x[WS(rs, 3)]), VFMAI(T2d, T2c), ms, &(x[WS(rs, 1)]));
377 ST(&(x[WS(rs, 29)]), VFNMSI(T2d, T2c), ms, &(x[WS(rs, 1)]));
378 ST(&(x[WS(rs, 19)]), VFMAI(T2b, T24), ms, &(x[WS(rs, 1)]));
379 ST(&(x[WS(rs, 13)]), VFNMSI(T2b, T24), ms, &(x[WS(rs, 1)]));
380 }
381 }
382 }
383 }
384 VLEAVE();
385 }
386
/*
 * Twiddle instruction table for the HAVE_FMA t2fv_32: one VTW(0, k) entry
 * for each k = 1..31, terminated by a {TW_NEXT, VL, 0} entry.
 */
387 static const tw_instr twinstr[] = {
388 VTW(0, 1),
389 VTW(0, 2),
390 VTW(0, 3),
391 VTW(0, 4),
392 VTW(0, 5),
393 VTW(0, 6),
394 VTW(0, 7),
395 VTW(0, 8),
396 VTW(0, 9),
397 VTW(0, 10),
398 VTW(0, 11),
399 VTW(0, 12),
400 VTW(0, 13),
401 VTW(0, 14),
402 VTW(0, 15),
403 VTW(0, 16),
404 VTW(0, 17),
405 VTW(0, 18),
406 VTW(0, 19),
407 VTW(0, 20),
408 VTW(0, 21),
409 VTW(0, 22),
410 VTW(0, 23),
411 VTW(0, 24),
412 VTW(0, 25),
413 VTW(0, 26),
414 VTW(0, 27),
415 VTW(0, 28),
416 VTW(0, 29),
417 VTW(0, 30),
418 VTW(0, 31),
419 {TW_NEXT, VL, 0}
420 };
421
/*
 * Codelet descriptor passed to the registration call: 32, the name string
 * "t2fv_32", the twinstr table, &GENUS, and the counts {119, 62, 98, 0},
 * whose first three values match the additions / multiplications / fused
 * multiply-adds quoted in this variant's generated header comment.  The
 * three trailing fields are zero.
 */
422 static const ct_desc desc = { 32, XSIMD_STRING("t2fv_32"), twinstr, &GENUS, {119, 62, 98, 0}, 0, 0, 0 };
423
/*
 * Registration entry point (HAVE_FMA build): hands t2fv_32 and its
 * descriptor desc to planner p through X(kdft_dit_register).
 */
424 void XSIMD(codelet_t2fv_32) (planner *p) {
425 X(kdft_dit_register) (p, t2fv_32, &desc);
426 }
427 #else /* HAVE_FMA */
428
429 /* Generated by: ../../../genfft/gen_twiddle_c.native -simd -compact -variables 4 -pipeline-latency 8 -n 32 -name t2fv_32 -include t2f.h */
430
431 /*
432 * This function contains 217 FP additions, 104 FP multiplications,
433 * (or, 201 additions, 88 multiplications, 16 fused multiply/add),
434 * 59 stack variables, 7 constants, and 64 memory accesses
435 */
436 #include "t2f.h"
437
/*
 * t2fv_32 (variant compiled when HAVE_FMA is not defined): genfft-generated
 * twiddle codelet operating on the 32 elements x[WS(rs, 0)] .. x[WS(rs, 31)],
 * with x starting at ri.
 *
 * Parameters, as used by the body:
 *   ri      base pointer; every load and store goes through x = ri.
 *   ii      not referenced anywhere in the body.
 *   W       twiddle table; advanced by mb * ((TWVL / VL) * 62) before the
 *           first pass and by TWVL * 62 after each pass.
 *   rs      stride handed to WS() to address the 32 elements.
 *   mb, me  the loop starts at m = mb and runs while m < me, in steps of VL.
 *   ms      x advances by VL * ms per pass; also forwarded to LD()/ST().
 *
 * Each pass loads the 32 inputs, multiplies input k (k = 1..31) by the
 * twiddle at W[TWVL * 2 * (k - 1)] through BYTWJ (x[0] is used as loaded),
 * combines them with VADD/VSUB/VMUL/VFMA/VFNMS/VBYI, and stores 32 results
 * back to the same x locations.  VLEAVE() is invoked once after the loop.
 */
438 static void t2fv_32(R *ri, R *ii, const R *W, stride rs, INT mb, INT me, INT ms)
439 {
/* Multiplier constants; referenced below through LDK(). */
440 DVK(KP555570233, +0.555570233019602224742830813948532874374937191);
441 DVK(KP831469612, +0.831469612302545237078788377617905756738560812);
442 DVK(KP195090322, +0.195090322016128267848284868477022240927691618);
443 DVK(KP980785280, +0.980785280403230449126182236134239036973933731);
444 DVK(KP382683432, +0.382683432365089771728459984030398866761344562);
445 DVK(KP923879532, +0.923879532511286756128183189396788286822416626);
446 DVK(KP707106781, +0.707106781186547524400844362104849039284835938);
447 {
448 INT m;
449 R *x;
450 x = ri;
/* m advances by VL per pass; x and W are advanced in the same loop header. */
451 for (m = mb, W = W + (mb * ((TWVL / VL) * 62)); m < me; m = m + VL, x = x + (VL * ms), W = W + (TWVL * 62), MAKE_VOLATILE_STRIDE(32, rs)) {
/* Results of the load/twiddle stage, consumed by the store groups below. */
452 V T4, T1A, T2o, T32, Tf, T1v, T2r, T3f, TC, T1C, T2L, T34, Tr, T1D, T2O;
453 V T33, T1k, T20, T2F, T3b, T1r, T21, T2C, T3a, TV, T1X, T2y, T38, T12, T1Y;
454 V T2v, T37;
/* Inputs 0, 24, 16, 8. */
455 {
456 V T1, T1z, T3, T1x, T1y, T2, T1w, T2m, T2n;
457 T1 = LD(&(x[0]), ms, &(x[0]));
458 T1y = LD(&(x[WS(rs, 24)]), ms, &(x[0]));
459 T1z = BYTWJ(&(W[TWVL * 46]), T1y);
460 T2 = LD(&(x[WS(rs, 16)]), ms, &(x[0]));
461 T3 = BYTWJ(&(W[TWVL * 30]), T2);
462 T1w = LD(&(x[WS(rs, 8)]), ms, &(x[0]));
463 T1x = BYTWJ(&(W[TWVL * 14]), T1w);
464 T4 = VSUB(T1, T3);
465 T1A = VSUB(T1x, T1z);
466 T2m = VADD(T1, T3);
467 T2n = VADD(T1x, T1z);
468 T2o = VADD(T2m, T2n);
469 T32 = VSUB(T2m, T2n);
470 }
/* Inputs 4, 12, 20, 28. */
471 {
472 V T6, Td, T8, Tb;
473 {
474 V T5, Tc, T7, Ta;
475 T5 = LD(&(x[WS(rs, 4)]), ms, &(x[0]));
476 T6 = BYTWJ(&(W[TWVL * 6]), T5);
477 Tc = LD(&(x[WS(rs, 12)]), ms, &(x[0]));
478 Td = BYTWJ(&(W[TWVL * 22]), Tc);
479 T7 = LD(&(x[WS(rs, 20)]), ms, &(x[0]));
480 T8 = BYTWJ(&(W[TWVL * 38]), T7);
481 Ta = LD(&(x[WS(rs, 28)]), ms, &(x[0]));
482 Tb = BYTWJ(&(W[TWVL * 54]), Ta);
483 }
484 {
485 V T9, Te, T2p, T2q;
486 T9 = VSUB(T6, T8);
487 Te = VSUB(Tb, Td);
488 Tf = VMUL(LDK(KP707106781), VADD(T9, Te));
489 T1v = VMUL(LDK(KP707106781), VSUB(Te, T9));
490 T2p = VADD(T6, T8);
491 T2q = VADD(Tb, Td);
492 T2r = VADD(T2p, T2q);
493 T3f = VSUB(T2q, T2p);
494 }
495 }
/* Inputs 30, 22, 14, 6. */
496 {
497 V Tt, TA, Tv, Ty;
498 {
499 V Ts, Tz, Tu, Tx;
500 Ts = LD(&(x[WS(rs, 30)]), ms, &(x[0]));
501 Tt = BYTWJ(&(W[TWVL * 58]), Ts);
502 Tz = LD(&(x[WS(rs, 22)]), ms, &(x[0]));
503 TA = BYTWJ(&(W[TWVL * 42]), Tz);
504 Tu = LD(&(x[WS(rs, 14)]), ms, &(x[0]));
505 Tv = BYTWJ(&(W[TWVL * 26]), Tu);
506 Tx = LD(&(x[WS(rs, 6)]), ms, &(x[0]));
507 Ty = BYTWJ(&(W[TWVL * 10]), Tx);
508 }
509 {
510 V Tw, TB, T2J, T2K;
511 Tw = VSUB(Tt, Tv);
512 TB = VSUB(Ty, TA);
513 TC = VFMA(LDK(KP923879532), Tw, VMUL(LDK(KP382683432), TB));
514 T1C = VFNMS(LDK(KP923879532), TB, VMUL(LDK(KP382683432), Tw));
515 T2J = VADD(Tt, Tv);
516 T2K = VADD(Ty, TA);
517 T2L = VADD(T2J, T2K);
518 T34 = VSUB(T2J, T2K);
519 }
520 }
/* Inputs 2, 26, 18, 10. */
521 {
522 V Ti, Tp, Tk, Tn;
523 {
524 V Th, To, Tj, Tm;
525 Th = LD(&(x[WS(rs, 2)]), ms, &(x[0]));
526 Ti = BYTWJ(&(W[TWVL * 2]), Th);
527 To = LD(&(x[WS(rs, 26)]), ms, &(x[0]));
528 Tp = BYTWJ(&(W[TWVL * 50]), To);
529 Tj = LD(&(x[WS(rs, 18)]), ms, &(x[0]));
530 Tk = BYTWJ(&(W[TWVL * 34]), Tj);
531 Tm = LD(&(x[WS(rs, 10)]), ms, &(x[0]));
532 Tn = BYTWJ(&(W[TWVL * 18]), Tm);
533 }
534 {
535 V Tl, Tq, T2M, T2N;
536 Tl = VSUB(Ti, Tk);
537 Tq = VSUB(Tn, Tp);
538 Tr = VFNMS(LDK(KP382683432), Tq, VMUL(LDK(KP923879532), Tl));
539 T1D = VFMA(LDK(KP382683432), Tl, VMUL(LDK(KP923879532), Tq));
540 T2M = VADD(Ti, Tk);
541 T2N = VADD(Tn, Tp);
542 T2O = VADD(T2M, T2N);
543 T33 = VSUB(T2M, T2N);
544 }
545 }
/* Inputs 31, 15, 23, 7, 27, 11, 3, 19. */
546 {
547 V T15, T17, T1p, T1n, T1f, T1h, T1i, T1a, T1c, T1d;
548 {
549 V T14, T16, T1o, T1m;
550 T14 = LD(&(x[WS(rs, 31)]), ms, &(x[WS(rs, 1)]));
551 T15 = BYTWJ(&(W[TWVL * 60]), T14);
552 T16 = LD(&(x[WS(rs, 15)]), ms, &(x[WS(rs, 1)]));
553 T17 = BYTWJ(&(W[TWVL * 28]), T16);
554 T1o = LD(&(x[WS(rs, 23)]), ms, &(x[WS(rs, 1)]));
555 T1p = BYTWJ(&(W[TWVL * 44]), T1o);
556 T1m = LD(&(x[WS(rs, 7)]), ms, &(x[WS(rs, 1)]));
557 T1n = BYTWJ(&(W[TWVL * 12]), T1m);
558 {
559 V T1e, T1g, T19, T1b;
560 T1e = LD(&(x[WS(rs, 27)]), ms, &(x[WS(rs, 1)]));
561 T1f = BYTWJ(&(W[TWVL * 52]), T1e);
562 T1g = LD(&(x[WS(rs, 11)]), ms, &(x[WS(rs, 1)]));
563 T1h = BYTWJ(&(W[TWVL * 20]), T1g);
564 T1i = VSUB(T1f, T1h);
565 T19 = LD(&(x[WS(rs, 3)]), ms, &(x[WS(rs, 1)]));
566 T1a = BYTWJ(&(W[TWVL * 4]), T19);
567 T1b = LD(&(x[WS(rs, 19)]), ms, &(x[WS(rs, 1)]));
568 T1c = BYTWJ(&(W[TWVL * 36]), T1b);
569 T1d = VSUB(T1a, T1c);
570 }
571 }
572 {
573 V T18, T1j, T2D, T2E;
574 T18 = VSUB(T15, T17);
575 T1j = VMUL(LDK(KP707106781), VADD(T1d, T1i));
576 T1k = VADD(T18, T1j);
577 T20 = VSUB(T18, T1j);
578 T2D = VADD(T1a, T1c);
579 T2E = VADD(T1f, T1h);
580 T2F = VADD(T2D, T2E);
581 T3b = VSUB(T2E, T2D);
582 }
583 {
584 V T1l, T1q, T2A, T2B;
585 T1l = VMUL(LDK(KP707106781), VSUB(T1i, T1d));
586 T1q = VSUB(T1n, T1p);
587 T1r = VSUB(T1l, T1q);
588 T21 = VADD(T1q, T1l);
589 T2A = VADD(T15, T17);
590 T2B = VADD(T1n, T1p);
591 T2C = VADD(T2A, T2B);
592 T3a = VSUB(T2A, T2B);
593 }
594 }
/* Inputs 1, 17, 25, 9, 29, 13, 5, 21. */
595 {
596 V TG, TI, T10, TY, TQ, TS, TT, TL, TN, TO;
597 {
598 V TF, TH, TZ, TX;
599 TF = LD(&(x[WS(rs, 1)]), ms, &(x[WS(rs, 1)]));
600 TG = BYTWJ(&(W[0]), TF);
601 TH = LD(&(x[WS(rs, 17)]), ms, &(x[WS(rs, 1)]));
602 TI = BYTWJ(&(W[TWVL * 32]), TH);
603 TZ = LD(&(x[WS(rs, 25)]), ms, &(x[WS(rs, 1)]));
604 T10 = BYTWJ(&(W[TWVL * 48]), TZ);
605 TX = LD(&(x[WS(rs, 9)]), ms, &(x[WS(rs, 1)]));
606 TY = BYTWJ(&(W[TWVL * 16]), TX);
607 {
608 V TP, TR, TK, TM;
609 TP = LD(&(x[WS(rs, 29)]), ms, &(x[WS(rs, 1)]));
610 TQ = BYTWJ(&(W[TWVL * 56]), TP);
611 TR = LD(&(x[WS(rs, 13)]), ms, &(x[WS(rs, 1)]));
612 TS = BYTWJ(&(W[TWVL * 24]), TR);
613 TT = VSUB(TQ, TS);
614 TK = LD(&(x[WS(rs, 5)]), ms, &(x[WS(rs, 1)]));
615 TL = BYTWJ(&(W[TWVL * 8]), TK);
616 TM = LD(&(x[WS(rs, 21)]), ms, &(x[WS(rs, 1)]));
617 TN = BYTWJ(&(W[TWVL * 40]), TM);
618 TO = VSUB(TL, TN);
619 }
620 }
621 {
622 V TJ, TU, T2w, T2x;
623 TJ = VSUB(TG, TI);
624 TU = VMUL(LDK(KP707106781), VADD(TO, TT));
625 TV = VADD(TJ, TU);
626 T1X = VSUB(TJ, TU);
627 T2w = VADD(TL, TN);
628 T2x = VADD(TQ, TS);
629 T2y = VADD(T2w, T2x);
630 T38 = VSUB(T2x, T2w);
631 }
632 {
633 V TW, T11, T2t, T2u;
634 TW = VMUL(LDK(KP707106781), VSUB(TT, TO));
635 T11 = VSUB(TY, T10);
636 T12 = VSUB(TW, T11);
637 T1Y = VADD(T11, TW);
638 T2t = VADD(TG, TI);
639 T2u = VADD(TY, T10);
640 T2v = VADD(T2t, T2u);
641 T37 = VSUB(T2t, T2u);
642 }
643 }
/* Outputs 16, 8, 0, 24. */
644 {
645 V T2W, T30, T2Z, T31;
646 {
647 V T2U, T2V, T2X, T2Y;
648 T2U = VADD(T2o, T2r);
649 T2V = VADD(T2O, T2L);
650 T2W = VADD(T2U, T2V);
651 T30 = VSUB(T2U, T2V);
652 T2X = VADD(T2v, T2y);
653 T2Y = VADD(T2C, T2F);
654 T2Z = VADD(T2X, T2Y);
655 T31 = VBYI(VSUB(T2Y, T2X));
656 }
657 ST(&(x[WS(rs, 16)]), VSUB(T2W, T2Z), ms, &(x[0]));
658 ST(&(x[WS(rs, 8)]), VADD(T30, T31), ms, &(x[0]));
659 ST(&(x[0]), VADD(T2W, T2Z), ms, &(x[0]));
660 ST(&(x[WS(rs, 24)]), VSUB(T30, T31), ms, &(x[0]));
661 }
/* Outputs 28, 4, 20, 12. */
662 {
663 V T2s, T2P, T2H, T2Q, T2z, T2G;
664 T2s = VSUB(T2o, T2r);
665 T2P = VSUB(T2L, T2O);
666 T2z = VSUB(T2v, T2y);
667 T2G = VSUB(T2C, T2F);
668 T2H = VMUL(LDK(KP707106781), VADD(T2z, T2G));
669 T2Q = VMUL(LDK(KP707106781), VSUB(T2G, T2z));
670 {
671 V T2I, T2R, T2S, T2T;
672 T2I = VADD(T2s, T2H);
673 T2R = VBYI(VADD(T2P, T2Q));
674 ST(&(x[WS(rs, 28)]), VSUB(T2I, T2R), ms, &(x[0]));
675 ST(&(x[WS(rs, 4)]), VADD(T2I, T2R), ms, &(x[0]));
676 T2S = VSUB(T2s, T2H);
677 T2T = VBYI(VSUB(T2Q, T2P));
678 ST(&(x[WS(rs, 20)]), VSUB(T2S, T2T), ms, &(x[0]));
679 ST(&(x[WS(rs, 12)]), VADD(T2S, T2T), ms, &(x[0]));
680 }
681 }
/* Outputs 30, 2, 6, 26, 18, 14, 10, 22. */
682 {
683 V T36, T3r, T3h, T3p, T3d, T3o, T3k, T3s, T35, T3g;
684 T35 = VMUL(LDK(KP707106781), VADD(T33, T34));
685 T36 = VADD(T32, T35);
686 T3r = VSUB(T32, T35);
687 T3g = VMUL(LDK(KP707106781), VSUB(T34, T33));
688 T3h = VADD(T3f, T3g);
689 T3p = VSUB(T3g, T3f);
690 {
691 V T39, T3c, T3i, T3j;
692 T39 = VFMA(LDK(KP923879532), T37, VMUL(LDK(KP382683432), T38));
693 T3c = VFNMS(LDK(KP382683432), T3b, VMUL(LDK(KP923879532), T3a));
694 T3d = VADD(T39, T3c);
695 T3o = VSUB(T3c, T39);
696 T3i = VFNMS(LDK(KP382683432), T37, VMUL(LDK(KP923879532), T38));
697 T3j = VFMA(LDK(KP382683432), T3a, VMUL(LDK(KP923879532), T3b));
698 T3k = VADD(T3i, T3j);
699 T3s = VSUB(T3j, T3i);
700 }
701 {
702 V T3e, T3l, T3u, T3v;
703 T3e = VADD(T36, T3d);
704 T3l = VBYI(VADD(T3h, T3k));
705 ST(&(x[WS(rs, 30)]), VSUB(T3e, T3l), ms, &(x[0]));
706 ST(&(x[WS(rs, 2)]), VADD(T3e, T3l), ms, &(x[0]));
707 T3u = VBYI(VADD(T3p, T3o));
708 T3v = VADD(T3r, T3s);
709 ST(&(x[WS(rs, 6)]), VADD(T3u, T3v), ms, &(x[0]));
710 ST(&(x[WS(rs, 26)]), VSUB(T3v, T3u), ms, &(x[0]));
711 }
712 {
713 V T3m, T3n, T3q, T3t;
714 T3m = VSUB(T36, T3d);
715 T3n = VBYI(VSUB(T3k, T3h));
716 ST(&(x[WS(rs, 18)]), VSUB(T3m, T3n), ms, &(x[0]));
717 ST(&(x[WS(rs, 14)]), VADD(T3m, T3n), ms, &(x[0]));
718 T3q = VBYI(VSUB(T3o, T3p));
719 T3t = VSUB(T3r, T3s);
720 ST(&(x[WS(rs, 10)]), VADD(T3q, T3t), ms, &(x[0]));
721 ST(&(x[WS(rs, 22)]), VSUB(T3t, T3q), ms, &(x[0]));
722 }
723 }
/* Outputs 31, 1, 7, 25, 17, 15, 9, 23. */
724 {
725 V TE, T1P, T1I, T1Q, T1t, T1M, T1F, T1N;
726 {
727 V Tg, TD, T1G, T1H;
728 Tg = VADD(T4, Tf);
729 TD = VADD(Tr, TC);
730 TE = VADD(Tg, TD);
731 T1P = VSUB(Tg, TD);
732 T1G = VFNMS(LDK(KP195090322), TV, VMUL(LDK(KP980785280), T12));
733 T1H = VFMA(LDK(KP195090322), T1k, VMUL(LDK(KP980785280), T1r));
734 T1I = VADD(T1G, T1H);
735 T1Q = VSUB(T1H, T1G);
736 }
737 {
738 V T13, T1s, T1B, T1E;
739 T13 = VFMA(LDK(KP980785280), TV, VMUL(LDK(KP195090322), T12));
740 T1s = VFNMS(LDK(KP195090322), T1r, VMUL(LDK(KP980785280), T1k));
741 T1t = VADD(T13, T1s);
742 T1M = VSUB(T1s, T13);
743 T1B = VSUB(T1v, T1A);
744 T1E = VSUB(T1C, T1D);
745 T1F = VADD(T1B, T1E);
746 T1N = VSUB(T1E, T1B);
747 }
748 {
749 V T1u, T1J, T1S, T1T;
750 T1u = VADD(TE, T1t);
751 T1J = VBYI(VADD(T1F, T1I));
752 ST(&(x[WS(rs, 31)]), VSUB(T1u, T1J), ms, &(x[WS(rs, 1)]));
753 ST(&(x[WS(rs, 1)]), VADD(T1u, T1J), ms, &(x[WS(rs, 1)]));
754 T1S = VBYI(VADD(T1N, T1M));
755 T1T = VADD(T1P, T1Q);
756 ST(&(x[WS(rs, 7)]), VADD(T1S, T1T), ms, &(x[WS(rs, 1)]));
757 ST(&(x[WS(rs, 25)]), VSUB(T1T, T1S), ms, &(x[WS(rs, 1)]));
758 }
759 {
760 V T1K, T1L, T1O, T1R;
761 T1K = VSUB(TE, T1t);
762 T1L = VBYI(VSUB(T1I, T1F));
763 ST(&(x[WS(rs, 17)]), VSUB(T1K, T1L), ms, &(x[WS(rs, 1)]));
764 ST(&(x[WS(rs, 15)]), VADD(T1K, T1L), ms, &(x[WS(rs, 1)]));
765 T1O = VBYI(VSUB(T1M, T1N));
766 T1R = VSUB(T1P, T1Q);
767 ST(&(x[WS(rs, 9)]), VADD(T1O, T1R), ms, &(x[WS(rs, 1)]));
768 ST(&(x[WS(rs, 23)]), VSUB(T1R, T1O), ms, &(x[WS(rs, 1)]));
769 }
770 }
/* Outputs 29, 3, 5, 27, 19, 13, 11, 21. */
771 {
772 V T1W, T2h, T2a, T2i, T23, T2e, T27, T2f;
773 {
774 V T1U, T1V, T28, T29;
775 T1U = VSUB(T4, Tf);
776 T1V = VADD(T1D, T1C);
777 T1W = VADD(T1U, T1V);
778 T2h = VSUB(T1U, T1V);
779 T28 = VFNMS(LDK(KP555570233), T1X, VMUL(LDK(KP831469612), T1Y));
780 T29 = VFMA(LDK(KP555570233), T20, VMUL(LDK(KP831469612), T21));
781 T2a = VADD(T28, T29);
782 T2i = VSUB(T29, T28);
783 }
784 {
785 V T1Z, T22, T25, T26;
786 T1Z = VFMA(LDK(KP831469612), T1X, VMUL(LDK(KP555570233), T1Y));
787 T22 = VFNMS(LDK(KP555570233), T21, VMUL(LDK(KP831469612), T20));
788 T23 = VADD(T1Z, T22);
789 T2e = VSUB(T22, T1Z);
790 T25 = VADD(T1A, T1v);
791 T26 = VSUB(TC, Tr);
792 T27 = VADD(T25, T26);
793 T2f = VSUB(T26, T25);
794 }
795 {
796 V T24, T2b, T2k, T2l;
797 T24 = VADD(T1W, T23);
798 T2b = VBYI(VADD(T27, T2a));
799 ST(&(x[WS(rs, 29)]), VSUB(T24, T2b), ms, &(x[WS(rs, 1)]));
800 ST(&(x[WS(rs, 3)]), VADD(T24, T2b), ms, &(x[WS(rs, 1)]));
801 T2k = VBYI(VADD(T2f, T2e));
802 T2l = VADD(T2h, T2i);
803 ST(&(x[WS(rs, 5)]), VADD(T2k, T2l), ms, &(x[WS(rs, 1)]));
804 ST(&(x[WS(rs, 27)]), VSUB(T2l, T2k), ms, &(x[WS(rs, 1)]));
805 }
806 {
807 V T2c, T2d, T2g, T2j;
808 T2c = VSUB(T1W, T23);
809 T2d = VBYI(VSUB(T2a, T27));
810 ST(&(x[WS(rs, 19)]), VSUB(T2c, T2d), ms, &(x[WS(rs, 1)]));
811 ST(&(x[WS(rs, 13)]), VADD(T2c, T2d), ms, &(x[WS(rs, 1)]));
812 T2g = VBYI(VSUB(T2e, T2f));
813 T2j = VSUB(T2h, T2i);
814 ST(&(x[WS(rs, 11)]), VADD(T2g, T2j), ms, &(x[WS(rs, 1)]));
815 ST(&(x[WS(rs, 21)]), VSUB(T2j, T2g), ms, &(x[WS(rs, 1)]));
816 }
817 }
818 }
819 }
820 VLEAVE();
821 }
822
/*
 * Twiddle instruction table for the non-FMA t2fv_32: one VTW(0, k) entry
 * for each k = 1..31, terminated by a {TW_NEXT, VL, 0} entry.
 */
823 static const tw_instr twinstr[] = {
824 VTW(0, 1),
825 VTW(0, 2),
826 VTW(0, 3),
827 VTW(0, 4),
828 VTW(0, 5),
829 VTW(0, 6),
830 VTW(0, 7),
831 VTW(0, 8),
832 VTW(0, 9),
833 VTW(0, 10),
834 VTW(0, 11),
835 VTW(0, 12),
836 VTW(0, 13),
837 VTW(0, 14),
838 VTW(0, 15),
839 VTW(0, 16),
840 VTW(0, 17),
841 VTW(0, 18),
842 VTW(0, 19),
843 VTW(0, 20),
844 VTW(0, 21),
845 VTW(0, 22),
846 VTW(0, 23),
847 VTW(0, 24),
848 VTW(0, 25),
849 VTW(0, 26),
850 VTW(0, 27),
851 VTW(0, 28),
852 VTW(0, 29),
853 VTW(0, 30),
854 VTW(0, 31),
855 {TW_NEXT, VL, 0}
856 };
857
/*
 * Codelet descriptor passed to the registration call: 32, the name string
 * "t2fv_32", the twinstr table, &GENUS, and the counts {201, 88, 16, 0},
 * whose first three values match the additions / multiplications / fused
 * multiply-adds quoted in this variant's generated header comment.  The
 * three trailing fields are zero.
 */
858 static const ct_desc desc = { 32, XSIMD_STRING("t2fv_32"), twinstr, &GENUS, {201, 88, 16, 0}, 0, 0, 0 };
859
/*
 * Registration entry point (non-FMA build): hands t2fv_32 and its
 * descriptor desc to planner p through X(kdft_dit_register).
 */
860 void XSIMD(codelet_t2fv_32) (planner *p) {
861 X(kdft_dit_register) (p, t2fv_32, &desc);
862 }
863 #endif /* HAVE_FMA */