comparison src/fftw-3.3.3/dft/simd/common/t1sv_16.c @ 10:37bf6b4a2645

Add FFTW3
author Chris Cannam
date Wed, 20 Mar 2013 15:35:50 +0000
parents
children
comparison
equal deleted inserted replaced
9:c0fb53affa76 10:37bf6b4a2645
1 /*
2 * Copyright (c) 2003, 2007-11 Matteo Frigo
3 * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
4 *
5 * This program is free software; you can redistribute it and/or modify
6 * it under the terms of the GNU General Public License as published by
7 * the Free Software Foundation; either version 2 of the License, or
8 * (at your option) any later version.
9 *
10 * This program is distributed in the hope that it will be useful,
11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 * GNU General Public License for more details.
14 *
15 * You should have received a copy of the GNU General Public License
16 * along with this program; if not, write to the Free Software
17 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
18 *
19 */
20
21 /* This file was automatically generated --- DO NOT EDIT */
22 /* Generated on Sun Nov 25 07:39:24 EST 2012 */
23
24 #include "codelet-dft.h"
25
26 #ifdef HAVE_FMA
27
28 /* Generated by: ../../../genfft/gen_twiddle.native -fma -reorder-insns -schedule-for-pipeline -simd -compact -variables 4 -pipeline-latency 8 -n 16 -name t1sv_16 -include ts.h */
29
30 /*
31 * This function contains 174 FP additions, 100 FP multiplications,
32 * (or, 104 additions, 30 multiplications, 70 fused multiply/add),
33 * 113 stack variables, 3 constants, and 64 memory accesses
34 */
35 #include "ts.h"
36
/*
 * t1sv_16, HAVE_FMA variant (the generator banner above shows gen_twiddle
 * invoked with -fma, -simd and -n 16).
 *
 * Each loop pass loads the 16 input pairs ri[0]/ii[0] and
 * ri[WS(rs, k)]/ii[WS(rs, k)] for k = 1..15, multiplies pairs 1..15 by
 * twiddle values read from W, combines everything through
 * VADD/VSUB/VFMA/VFNMS stages and stores 16 pairs back into the same ri/ii
 * slots, i.e. the arrays are updated in place.
 *
 * ri, ii : arrays that are read and then overwritten (presumably the real
 *          and imaginary parts, going by the names -- confirm with callers).
 * W      : twiddle table; the loop starts at W + mb * 30 and moves forward
 *          by (2 * VL) * 30 per pass.
 * rs     : stride handed to WS() to locate element k inside one transform.
 * mb, me : m runs from mb while m < me, in steps of 2 * VL.
 * ms     : stride by which ri/ii advance per pass ((2 * VL) * ms); it is
 *          also forwarded to every LD/ST.
 */
37 static void t1sv_16(R *ri, R *ii, const R *W, stride rs, INT mb, INT me, INT ms)
38 {
/* Constants: numerically cos(pi/8), sqrt(2) - 1 (= tan(pi/8)) and sqrt(1/2). */
39 DVK(KP923879532, +0.923879532511286756128183189396788286822416626);
40 DVK(KP414213562, +0.414213562373095048801688724209698078569671875);
41 DVK(KP707106781, +0.707106781186547524400844362104849039284835938);
42 {
43 INT m;
/* Per pass: m += 2 * VL, ri and ii += (2 * VL) * ms, W += (2 * VL) * 30. */
44 for (m = mb, W = W + (mb * 30); m < me; m = m + (2 * VL), ri = ri + ((2 * VL) * ms), ii = ii + ((2 * VL) * ms), W = W + ((2 * VL) * 30), MAKE_VOLATILE_STRIDE(32, rs)) {
/* These outer temporaries carry the odd-output intermediates down to the
 * final store group at the bottom of the loop body. */
45 V T2S, T2O, T2B, T2j, T2A, T24, T3J, T3L, T2Q, T2I, T2R, T2L, T2C, T2y, T3D;
46 V T3F;
47 {
48 V T3o, T3z, T1I, T8, T35, T2o, T1s, T2r, T36, T2w, T1F, T2p, T1N, T3k, Tl;
49 V T3A, T2V, T1T, Tz, T1U, T30, T29, T11, T2c, TH, TK, TJ, T31, T2h, T1e;
50 V T2a, T1Z, TI, T1Y, TF;
51 {
52 V Ta, Td, Tg, Tj, T2t, T1y, Tf, T1J, Tb, Tc, T2v, T1E, Ti;
53 {
54 V T1, T3n, T3, T6, T5, T1h, T1k, T1n, T1q, T1m, T3l, T4, T1j, T1p, T2k;
55 V T1i, T2, T1g;
/* Input/twiddle loads, interleaved with the twiddle multiplies.
 * Input k (k = 1..15) is paired with W[TWVL * (2k - 2)] and
 * W[TWVL * (2k - 1)]; input 0 is used without a twiddle.
 * Calling the two twiddle values w0, w1 and the input (r, i), the code forms
 * (w0*r + w1*i, w0*i - w1*r) -- assuming VFMA(a, b, c) = a*b + c and
 * VFNMS(a, b, c) = c - a*b (macros defined outside this file; confirm). */
56 T1 = LD(&(ri[0]), ms, &(ri[0]));
57 T3n = LD(&(ii[0]), ms, &(ii[0]));
58 T3 = LD(&(ri[WS(rs, 8)]), ms, &(ri[0]));
59 T6 = LD(&(ii[WS(rs, 8)]), ms, &(ii[0]));
60 T2 = LDW(&(W[TWVL * 14]));
61 T5 = LDW(&(W[TWVL * 15]));
62 T1h = LD(&(ri[WS(rs, 15)]), ms, &(ri[WS(rs, 1)]));
63 T1k = LD(&(ii[WS(rs, 15)]), ms, &(ii[WS(rs, 1)]));
64 T1g = LDW(&(W[TWVL * 28]));
65 T1n = LD(&(ri[WS(rs, 7)]), ms, &(ri[WS(rs, 1)]));
66 T1q = LD(&(ii[WS(rs, 7)]), ms, &(ii[WS(rs, 1)]));
67 T1m = LDW(&(W[TWVL * 12]));
68 T3l = VMUL(T2, T6);
69 T4 = VMUL(T2, T3);
70 T1j = LDW(&(W[TWVL * 29]));
71 T1p = LDW(&(W[TWVL * 13]));
72 T2k = VMUL(T1g, T1k);
73 T1i = VMUL(T1g, T1h);
74 {
75 V T1u, T1x, T1A, T2s, T1v, T1D, T1z, T1w, T1C, T2u, T1B, T9;
76 {
77 V T2l, T1l, T1t, T2n, T1r;
78 {
79 V T2m, T1o, T3m, T7;
80 T1u = LD(&(ri[WS(rs, 3)]), ms, &(ri[WS(rs, 1)]));
81 T2m = VMUL(T1m, T1q);
82 T1o = VMUL(T1m, T1n);
83 T3m = VFNMS(T5, T3, T3l);
84 T7 = VFMA(T5, T6, T4);
85 T1x = LD(&(ii[WS(rs, 3)]), ms, &(ii[WS(rs, 1)]));
86 T2l = VFNMS(T1j, T1h, T2k);
87 T1l = VFMA(T1j, T1k, T1i);
88 T1t = LDW(&(W[TWVL * 4]));
89 T2n = VFNMS(T1p, T1n, T2m);
90 T1r = VFMA(T1p, T1q, T1o);
91 T3o = VADD(T3m, T3n);
92 T3z = VSUB(T3n, T3m);
93 T1I = VSUB(T1, T7);
94 T8 = VADD(T1, T7);
95 }
96 T1A = LD(&(ri[WS(rs, 11)]), ms, &(ri[WS(rs, 1)]));
97 T2s = VMUL(T1t, T1x);
98 T1v = VMUL(T1t, T1u);
99 T35 = VADD(T2l, T2n);
100 T2o = VSUB(T2l, T2n);
101 T1s = VADD(T1l, T1r);
102 T2r = VSUB(T1l, T1r);
103 T1D = LD(&(ii[WS(rs, 11)]), ms, &(ii[WS(rs, 1)]));
104 T1z = LDW(&(W[TWVL * 20]));
105 }
106 T1w = LDW(&(W[TWVL * 5]));
107 T1C = LDW(&(W[TWVL * 21]));
108 Ta = LD(&(ri[WS(rs, 4)]), ms, &(ri[0]));
109 Td = LD(&(ii[WS(rs, 4)]), ms, &(ii[0]));
110 T9 = LDW(&(W[TWVL * 6]));
111 Tg = LD(&(ri[WS(rs, 12)]), ms, &(ri[0]));
112 Tj = LD(&(ii[WS(rs, 12)]), ms, &(ii[0]));
113 T2u = VMUL(T1z, T1D);
114 T1B = VMUL(T1z, T1A);
115 T2t = VFNMS(T1w, T1u, T2s);
116 T1y = VFMA(T1w, T1x, T1v);
117 Tf = LDW(&(W[TWVL * 22]));
118 T1J = VMUL(T9, Td);
119 Tb = VMUL(T9, Ta);
120 Tc = LDW(&(W[TWVL * 7]));
121 T2v = VFNMS(T1C, T1A, T2u);
122 T1E = VFMA(T1C, T1D, T1B);
123 Ti = LDW(&(W[TWVL * 23]));
124 }
125 }
126 {
127 V TW, TZ, TY, T27, TX, T26, TU;
128 {
129 V To, Tr, Tu, Tx, Tq, Tw, T1P, Tp, T1R, Tv;
130 {
131 V T1K, Te, T1M, Tk, Tn, Tt, T1L, Th;
132 To = LD(&(ri[WS(rs, 2)]), ms, &(ri[0]));
133 T1L = VMUL(Tf, Tj);
134 Th = VMUL(Tf, Tg);
135 Tr = LD(&(ii[WS(rs, 2)]), ms, &(ii[0]));
136 T1K = VFNMS(Tc, Ta, T1J);
137 Te = VFMA(Tc, Td, Tb);
138 T36 = VADD(T2t, T2v);
139 T2w = VSUB(T2t, T2v);
140 T1F = VADD(T1y, T1E);
141 T2p = VSUB(T1y, T1E);
142 T1M = VFNMS(Ti, Tg, T1L);
143 Tk = VFMA(Ti, Tj, Th);
144 Tn = LDW(&(W[TWVL * 2]));
145 Tu = LD(&(ri[WS(rs, 10)]), ms, &(ri[0]));
146 Tx = LD(&(ii[WS(rs, 10)]), ms, &(ii[0]));
147 Tt = LDW(&(W[TWVL * 18]));
148 Tq = LDW(&(W[TWVL * 3]));
149 Tw = LDW(&(W[TWVL * 19]));
150 T1N = VSUB(T1K, T1M);
151 T3k = VADD(T1K, T1M);
152 Tl = VADD(Te, Tk);
153 T3A = VSUB(Te, Tk);
154 T1P = VMUL(Tn, Tr);
155 Tp = VMUL(Tn, To);
156 T1R = VMUL(Tt, Tx);
157 Tv = VMUL(Tt, Tu);
158 }
159 {
160 V TQ, TT, T1Q, Ts, T1S, Ty, TV, T25, TR, TP, TS;
161 TQ = LD(&(ri[WS(rs, 1)]), ms, &(ri[WS(rs, 1)]));
162 TT = LD(&(ii[WS(rs, 1)]), ms, &(ii[WS(rs, 1)]));
163 TP = LDW(&(W[0]));
164 TW = LD(&(ri[WS(rs, 9)]), ms, &(ri[WS(rs, 1)]));
165 T1Q = VFNMS(Tq, To, T1P);
166 Ts = VFMA(Tq, Tr, Tp);
167 T1S = VFNMS(Tw, Tu, T1R);
168 Ty = VFMA(Tw, Tx, Tv);
169 TZ = LD(&(ii[WS(rs, 9)]), ms, &(ii[WS(rs, 1)]));
170 TV = LDW(&(W[TWVL * 16]));
171 T25 = VMUL(TP, TT);
172 TR = VMUL(TP, TQ);
173 TS = LDW(&(W[TWVL * 1]));
174 TY = LDW(&(W[TWVL * 17]));
175 T2V = VADD(T1Q, T1S);
176 T1T = VSUB(T1Q, T1S);
177 Tz = VADD(Ts, Ty);
178 T1U = VSUB(Ts, Ty);
179 T27 = VMUL(TV, TZ);
180 TX = VMUL(TV, TW);
181 T26 = VFNMS(TS, TQ, T25);
182 TU = VFMA(TS, TT, TR);
183 }
184 }
185 {
186 V T19, T1c, T1b, T2f, T1a, T2e, T17;
187 {
188 V T13, T16, T12, T28, T10, T18, T15, T2d, T14;
189 T13 = LD(&(ri[WS(rs, 5)]), ms, &(ri[WS(rs, 1)]));
190 T16 = LD(&(ii[WS(rs, 5)]), ms, &(ii[WS(rs, 1)]));
191 T12 = LDW(&(W[TWVL * 8]));
192 T19 = LD(&(ri[WS(rs, 13)]), ms, &(ri[WS(rs, 1)]));
193 T28 = VFNMS(TY, TW, T27);
194 T10 = VFMA(TY, TZ, TX);
195 T1c = LD(&(ii[WS(rs, 13)]), ms, &(ii[WS(rs, 1)]));
196 T18 = LDW(&(W[TWVL * 24]));
197 T15 = LDW(&(W[TWVL * 9]));
198 T1b = LDW(&(W[TWVL * 25]));
199 T2d = VMUL(T12, T16);
200 T14 = VMUL(T12, T13);
201 T30 = VADD(T26, T28);
202 T29 = VSUB(T26, T28);
203 T11 = VADD(TU, T10);
204 T2c = VSUB(TU, T10);
205 T2f = VMUL(T18, T1c);
206 T1a = VMUL(T18, T19);
207 T2e = VFNMS(T15, T13, T2d);
208 T17 = VFMA(T15, T16, T14);
209 }
210 {
211 V TB, TE, TA, T2g, T1d, TG, TD, T1X, TC;
212 TB = LD(&(ri[WS(rs, 14)]), ms, &(ri[0]));
213 TE = LD(&(ii[WS(rs, 14)]), ms, &(ii[0]));
214 TA = LDW(&(W[TWVL * 26]));
215 TH = LD(&(ri[WS(rs, 6)]), ms, &(ri[0]));
216 T2g = VFNMS(T1b, T19, T2f);
217 T1d = VFMA(T1b, T1c, T1a);
218 TK = LD(&(ii[WS(rs, 6)]), ms, &(ii[0]));
219 TG = LDW(&(W[TWVL * 10]));
220 TD = LDW(&(W[TWVL * 27]));
221 TJ = LDW(&(W[TWVL * 11]));
222 T1X = VMUL(TA, TE);
223 TC = VMUL(TA, TB);
224 T31 = VADD(T2e, T2g);
225 T2h = VSUB(T2e, T2g);
226 T1e = VADD(T17, T1d);
227 T2a = VSUB(T17, T1d);
228 T1Z = VMUL(TG, TK);
229 TI = VMUL(TG, TH);
230 T1Y = VFNMS(TD, TB, T1X);
231 TF = VFMA(TD, TE, TC);
232 }
233 }
234 }
235 }
/* All 32 input loads are done by this point except that input 6's twiddle
 * multiply (T20, TL) is finished just below; the rest is sum/difference
 * combination of the twiddled values. */
236 {
237 V T2U, Tm, T3p, T3u, T34, T1G, T1f, T2Z, T20, TL, T32, T3f, T3g, T37;
238 T2U = VSUB(T8, Tl);
239 Tm = VADD(T8, Tl);
240 T3p = VADD(T3k, T3o);
241 T3u = VSUB(T3o, T3k);
242 T34 = VSUB(T1s, T1F);
243 T1G = VADD(T1s, T1F);
244 T1f = VADD(T11, T1e);
245 T2Z = VSUB(T11, T1e);
246 T20 = VFNMS(TJ, TH, T1Z);
247 TL = VFMA(TJ, TK, TI);
248 T32 = VSUB(T30, T31);
249 T3f = VADD(T30, T31);
250 T3g = VADD(T35, T36);
251 T37 = VSUB(T35, T36);
252 {
253 V T3r, T1H, T21, T1W, T3i, T3h, T3j, T2X, TN, T3t, T2W, TM;
254 T3r = VSUB(T1G, T1f);
255 T1H = VADD(T1f, T1G);
256 T21 = VSUB(T1Y, T20);
257 T2W = VADD(T1Y, T20);
258 T1W = VSUB(TF, TL);
259 TM = VADD(TF, TL);
260 T3i = VADD(T3f, T3g);
261 T3h = VSUB(T3f, T3g);
262 T3j = VADD(T2V, T2W);
263 T2X = VSUB(T2V, T2W);
264 TN = VADD(Tz, TM);
265 T3t = VSUB(TM, Tz);
266 {
267 V T2E, T1O, T3B, T3H, T2x, T2q, T2K, T2J, T3C, T23, T3I, T2H;
268 {
269 V T2F, T1V, T22, T2G;
270 T2E = VADD(T1I, T1N);
271 T1O = VSUB(T1I, T1N);
272 {
273 V T3b, T33, T3c, T38;
274 T3b = VSUB(T32, T2Z);
275 T33 = VADD(T2Z, T32);
276 T3c = VADD(T34, T37);
277 T38 = VSUB(T34, T37);
278 {
279 V T3a, T2Y, T3s, T3q;
280 T3a = VSUB(T2U, T2X);
281 T2Y = VADD(T2U, T2X);
282 T3s = VSUB(T3p, T3j);
283 T3q = VADD(T3j, T3p);
284 {
285 V T3x, T3v, T3e, TO;
286 T3x = VSUB(T3u, T3t);
287 T3v = VADD(T3t, T3u);
288 T3e = VSUB(Tm, TN);
289 TO = VADD(Tm, TN);
290 {
291 V T3d, T3w, T3y, T39;
292 T3d = VSUB(T3b, T3c);
293 T3w = VADD(T3b, T3c);
294 T3y = VSUB(T38, T33);
295 T39 = VADD(T33, T38);
/* Store the even-indexed outputs (0, 2, ..., 14) of both ri and ii.
 * Outputs 0, 4, 8, 12 are plain sums/differences; 2, 6, 10, 14 fold in a
 * KP707106781-scaled term. */
296 ST(&(ii[WS(rs, 4)]), VADD(T3r, T3s), ms, &(ii[0]));
297 ST(&(ii[WS(rs, 12)]), VSUB(T3s, T3r), ms, &(ii[0]));
298 ST(&(ii[0]), VADD(T3i, T3q), ms, &(ii[0]));
299 ST(&(ii[WS(rs, 8)]), VSUB(T3q, T3i), ms, &(ii[0]));
300 ST(&(ri[WS(rs, 4)]), VADD(T3e, T3h), ms, &(ri[0]));
301 ST(&(ri[WS(rs, 12)]), VSUB(T3e, T3h), ms, &(ri[0]));
302 ST(&(ri[0]), VADD(TO, T1H), ms, &(ri[0]));
303 ST(&(ri[WS(rs, 8)]), VSUB(TO, T1H), ms, &(ri[0]));
304 ST(&(ri[WS(rs, 6)]), VFMA(LDK(KP707106781), T3d, T3a), ms, &(ri[0]));
305 ST(&(ri[WS(rs, 14)]), VFNMS(LDK(KP707106781), T3d, T3a), ms, &(ri[0]));
306 ST(&(ii[WS(rs, 10)]), VFNMS(LDK(KP707106781), T3w, T3v), ms, &(ii[0]));
307 ST(&(ii[WS(rs, 2)]), VFMA(LDK(KP707106781), T3w, T3v), ms, &(ii[0]));
308 ST(&(ii[WS(rs, 14)]), VFNMS(LDK(KP707106781), T3y, T3x), ms, &(ii[0]));
309 ST(&(ii[WS(rs, 6)]), VFMA(LDK(KP707106781), T3y, T3x), ms, &(ii[0]));
310 ST(&(ri[WS(rs, 2)]), VFMA(LDK(KP707106781), T39, T2Y), ms, &(ri[0]));
311 ST(&(ri[WS(rs, 10)]), VFNMS(LDK(KP707106781), T39, T2Y), ms, &(ri[0]));
312 T3B = VSUB(T3z, T3A);
313 T3H = VADD(T3A, T3z);
314 }
315 }
316 }
317 }
318 T2F = VADD(T1U, T1T);
319 T1V = VSUB(T1T, T1U);
320 T22 = VADD(T1W, T21);
321 T2G = VSUB(T1W, T21);
322 {
323 V T2M, T2N, T2b, T2i;
324 T2x = VSUB(T2r, T2w);
325 T2M = VADD(T2r, T2w);
326 T2N = VSUB(T2o, T2p);
327 T2q = VADD(T2o, T2p);
328 T2K = VSUB(T29, T2a);
329 T2b = VADD(T29, T2a);
330 T2i = VSUB(T2c, T2h);
331 T2J = VADD(T2c, T2h);
332 T3C = VADD(T1V, T22);
333 T23 = VSUB(T1V, T22);
334 T2S = VFMA(LDK(KP414213562), T2M, T2N);
335 T2O = VFNMS(LDK(KP414213562), T2N, T2M);
336 T3I = VSUB(T2G, T2F);
337 T2H = VADD(T2F, T2G);
338 T2B = VFNMS(LDK(KP414213562), T2b, T2i);
339 T2j = VFMA(LDK(KP414213562), T2i, T2b);
340 }
341 }
/* Fill the outer temporaries declared at the top of the loop body; they are
 * consumed by the odd-output stores below. */
342 T2A = VFNMS(LDK(KP707106781), T23, T1O);
343 T24 = VFMA(LDK(KP707106781), T23, T1O);
344 T3J = VFMA(LDK(KP707106781), T3I, T3H);
345 T3L = VFNMS(LDK(KP707106781), T3I, T3H);
346 T2Q = VFNMS(LDK(KP707106781), T2H, T2E);
347 T2I = VFMA(LDK(KP707106781), T2H, T2E);
348 T2R = VFNMS(LDK(KP414213562), T2J, T2K);
349 T2L = VFMA(LDK(KP414213562), T2K, T2J);
350 T2C = VFMA(LDK(KP414213562), T2q, T2x);
351 T2y = VFNMS(LDK(KP414213562), T2x, T2q);
352 T3D = VFMA(LDK(KP707106781), T3C, T3B);
353 T3F = VFNMS(LDK(KP707106781), T3C, T3B);
354 }
355 }
356 }
357 }
358 {
359 V T3E, T2T, T2P, T3G;
360 T3E = VADD(T2R, T2S);
361 T2T = VSUB(T2R, T2S);
362 T2P = VADD(T2L, T2O);
363 T3G = VSUB(T2O, T2L);
364 {
365 V T3K, T2D, T2z, T3M;
366 T3K = VSUB(T2C, T2B);
367 T2D = VADD(T2B, T2C);
368 T2z = VSUB(T2j, T2y);
369 T3M = VADD(T2j, T2y);
/* Store the odd-indexed outputs (1, 3, ..., 15) of both ri and ii; every one
 * is a base term plus or minus a KP923879532-scaled term. */
370 ST(&(ri[WS(rs, 5)]), VFMA(LDK(KP923879532), T2T, T2Q), ms, &(ri[WS(rs, 1)]));
371 ST(&(ri[WS(rs, 13)]), VFNMS(LDK(KP923879532), T2T, T2Q), ms, &(ri[WS(rs, 1)]));
372 ST(&(ii[WS(rs, 9)]), VFNMS(LDK(KP923879532), T3E, T3D), ms, &(ii[WS(rs, 1)]));
373 ST(&(ii[WS(rs, 1)]), VFMA(LDK(KP923879532), T3E, T3D), ms, &(ii[WS(rs, 1)]));
374 ST(&(ii[WS(rs, 13)]), VFNMS(LDK(KP923879532), T3G, T3F), ms, &(ii[WS(rs, 1)]));
375 ST(&(ii[WS(rs, 5)]), VFMA(LDK(KP923879532), T3G, T3F), ms, &(ii[WS(rs, 1)]));
376 ST(&(ri[WS(rs, 1)]), VFMA(LDK(KP923879532), T2P, T2I), ms, &(ri[WS(rs, 1)]));
377 ST(&(ri[WS(rs, 9)]), VFNMS(LDK(KP923879532), T2P, T2I), ms, &(ri[WS(rs, 1)]));
378 ST(&(ri[WS(rs, 15)]), VFMA(LDK(KP923879532), T2D, T2A), ms, &(ri[WS(rs, 1)]));
379 ST(&(ri[WS(rs, 7)]), VFNMS(LDK(KP923879532), T2D, T2A), ms, &(ri[WS(rs, 1)]));
380 ST(&(ii[WS(rs, 11)]), VFNMS(LDK(KP923879532), T3K, T3J), ms, &(ii[WS(rs, 1)]));
381 ST(&(ii[WS(rs, 3)]), VFMA(LDK(KP923879532), T3K, T3J), ms, &(ii[WS(rs, 1)]));
382 ST(&(ii[WS(rs, 15)]), VFMA(LDK(KP923879532), T3M, T3L), ms, &(ii[WS(rs, 1)]));
383 ST(&(ii[WS(rs, 7)]), VFNMS(LDK(KP923879532), T3M, T3L), ms, &(ii[WS(rs, 1)]));
384 ST(&(ri[WS(rs, 3)]), VFMA(LDK(KP923879532), T2z, T24), ms, &(ri[WS(rs, 1)]));
385 ST(&(ri[WS(rs, 11)]), VFNMS(LDK(KP923879532), T2z, T24), ms, &(ri[WS(rs, 1)]));
386 }
387 }
388 }
389 }
/* VLEAVE() is a macro from the SIMD support headers, invoked once after the
 * loop; its effect is not visible in this file. */
390 VLEAVE();
391 }
392
/*
 * Twiddle instruction table referenced by desc below (HAVE_FMA build):
 * fifteen VTW(0, k) entries for k = 1..15, closed by a TW_NEXT entry that
 * carries (2 * VL).
 * NOTE(review): presumably this describes the layout of the W array that
 * t1sv_16 reads (30 values per m step) -- confirm against the VTW/TW_NEXT
 * definitions, which are not in this file.
 */
393 static const tw_instr twinstr[] = {
394 VTW(0, 1),
395 VTW(0, 2),
396 VTW(0, 3),
397 VTW(0, 4),
398 VTW(0, 5),
399 VTW(0, 6),
400 VTW(0, 7),
401 VTW(0, 8),
402 VTW(0, 9),
403 VTW(0, 10),
404 VTW(0, 11),
405 VTW(0, 12),
406 VTW(0, 13),
407 VTW(0, 14),
408 VTW(0, 15),
409 {TW_NEXT, (2 * VL), 0}
410 };
411
/*
 * Codelet descriptor passed at registration (HAVE_FMA build): 16, the name
 * string "t1sv_16", the twinstr table, &GENUS, and the tuple {104, 30, 70, 0},
 * whose first three values equal the "104 additions, 30 multiplications,
 * 70 fused multiply/add" figures in the count comment above the function.
 * NOTE(review): the meaning of the three trailing zeros depends on the
 * ct_desc layout, which is declared outside this file.
 */
412 static const ct_desc desc = { 16, XSIMD_STRING("t1sv_16"), twinstr, &GENUS, {104, 30, 70, 0}, 0, 0, 0 };
413
/*
 * Registration entry point (HAVE_FMA build): hands the t1sv_16 function and
 * its descriptor desc to planner p through X(kdft_dit_register).
 */
414 void XSIMD(codelet_t1sv_16) (planner *p) {
415 X(kdft_dit_register) (p, t1sv_16, &desc);
416 }
417 #else /* HAVE_FMA */
418
419 /* Generated by: ../../../genfft/gen_twiddle.native -simd -compact -variables 4 -pipeline-latency 8 -n 16 -name t1sv_16 -include ts.h */
420
421 /*
422 * This function contains 174 FP additions, 84 FP multiplications,
423 * (or, 136 additions, 46 multiplications, 38 fused multiply/add),
424 * 52 stack variables, 3 constants, and 64 memory accesses
425 */
426 #include "ts.h"
427
/*
 * t1sv_16, non-FMA variant (the generator banner above shows gen_twiddle
 * invoked with -simd and -n 16, without -fma).
 *
 * Each loop pass loads the 16 input pairs ri[0]/ii[0] and
 * ri[WS(rs, k)]/ii[WS(rs, k)] for k = 1..15, multiplies pairs 1..15 by
 * twiddle values read from W, combines the results with VADD/VSUB and
 * constant-scaled terms, and stores 16 pairs back into the same ri/ii slots,
 * i.e. the arrays are updated in place.
 *
 * ri, ii : arrays that are read and then overwritten (presumably the real
 *          and imaginary parts, going by the names -- confirm with callers).
 * W      : twiddle table; the loop starts at W + mb * 30 and moves forward
 *          by (2 * VL) * 30 per pass.
 * rs     : stride handed to WS() to locate element k inside one transform.
 * mb, me : m runs from mb while m < me, in steps of 2 * VL.
 * ms     : stride by which ri/ii advance per pass ((2 * VL) * ms); it is
 *          also forwarded to every LD/ST.
 */
428 static void t1sv_16(R *ri, R *ii, const R *W, stride rs, INT mb, INT me, INT ms)
429 {
/* Constants: numerically sin(pi/8), cos(pi/8) and sqrt(1/2). */
430 DVK(KP382683432, +0.382683432365089771728459984030398866761344562);
431 DVK(KP923879532, +0.923879532511286756128183189396788286822416626);
432 DVK(KP707106781, +0.707106781186547524400844362104849039284835938);
433 {
434 INT m;
/* Per pass: m += 2 * VL, ri and ii += (2 * VL) * ms, W += (2 * VL) * 30. */
435 for (m = mb, W = W + (mb * 30); m < me; m = m + (2 * VL), ri = ri + ((2 * VL) * ms), ii = ii + ((2 * VL) * ms), W = W + ((2 * VL) * 30), MAKE_VOLATILE_STRIDE(32, rs)) {
/* Intermediates that outlive the load blocks and feed the four store groups
 * at the end of the loop body. */
436 V T7, T37, T1t, T2U, Ti, T38, T1w, T2R, Tu, T2s, T1C, T2c, TF, T2t, T1H;
437 V T2d, T1f, T1q, T2B, T2C, T2D, T2E, T1Z, T2j, T24, T2k, TS, T13, T2w, T2x;
438 V T2y, T2z, T1O, T2g, T1T, T2h;
/* Inputs 0 and 8. Input k (k = 1..15) is paired with W[TWVL * (2k - 2)] and
 * W[TWVL * (2k - 1)]; input 0 is used without a twiddle. Calling the two
 * twiddle values w0, w1 and the input (r, i), each inner block forms
 * (w0*r + w1*i, w0*i - w1*r) -- assuming VFMA(a, b, c) = a*b + c and
 * VFNMS(a, b, c) = c - a*b (macros defined outside this file; confirm). */
439 {
440 V T1, T2T, T6, T2S;
441 T1 = LD(&(ri[0]), ms, &(ri[0]));
442 T2T = LD(&(ii[0]), ms, &(ii[0]));
443 {
444 V T3, T5, T2, T4;
445 T3 = LD(&(ri[WS(rs, 8)]), ms, &(ri[0]));
446 T5 = LD(&(ii[WS(rs, 8)]), ms, &(ii[0]));
447 T2 = LDW(&(W[TWVL * 14]));
448 T4 = LDW(&(W[TWVL * 15]));
449 T6 = VFMA(T2, T3, VMUL(T4, T5));
450 T2S = VFNMS(T4, T3, VMUL(T2, T5));
451 }
452 T7 = VADD(T1, T6);
453 T37 = VSUB(T2T, T2S);
454 T1t = VSUB(T1, T6);
455 T2U = VADD(T2S, T2T);
456 }
/* Inputs 4 and 12. */
457 {
458 V Tc, T1u, Th, T1v;
459 {
460 V T9, Tb, T8, Ta;
461 T9 = LD(&(ri[WS(rs, 4)]), ms, &(ri[0]));
462 Tb = LD(&(ii[WS(rs, 4)]), ms, &(ii[0]));
463 T8 = LDW(&(W[TWVL * 6]));
464 Ta = LDW(&(W[TWVL * 7]));
465 Tc = VFMA(T8, T9, VMUL(Ta, Tb));
466 T1u = VFNMS(Ta, T9, VMUL(T8, Tb));
467 }
468 {
469 V Te, Tg, Td, Tf;
470 Te = LD(&(ri[WS(rs, 12)]), ms, &(ri[0]));
471 Tg = LD(&(ii[WS(rs, 12)]), ms, &(ii[0]));
472 Td = LDW(&(W[TWVL * 22]));
473 Tf = LDW(&(W[TWVL * 23]));
474 Th = VFMA(Td, Te, VMUL(Tf, Tg));
475 T1v = VFNMS(Tf, Te, VMUL(Td, Tg));
476 }
477 Ti = VADD(Tc, Th);
478 T38 = VSUB(Tc, Th);
479 T1w = VSUB(T1u, T1v);
480 T2R = VADD(T1u, T1v);
481 }
/* Inputs 2 and 10. */
482 {
483 V To, T1y, Tt, T1z, T1A, T1B;
484 {
485 V Tl, Tn, Tk, Tm;
486 Tl = LD(&(ri[WS(rs, 2)]), ms, &(ri[0]));
487 Tn = LD(&(ii[WS(rs, 2)]), ms, &(ii[0]));
488 Tk = LDW(&(W[TWVL * 2]));
489 Tm = LDW(&(W[TWVL * 3]));
490 To = VFMA(Tk, Tl, VMUL(Tm, Tn));
491 T1y = VFNMS(Tm, Tl, VMUL(Tk, Tn));
492 }
493 {
494 V Tq, Ts, Tp, Tr;
495 Tq = LD(&(ri[WS(rs, 10)]), ms, &(ri[0]));
496 Ts = LD(&(ii[WS(rs, 10)]), ms, &(ii[0]));
497 Tp = LDW(&(W[TWVL * 18]));
498 Tr = LDW(&(W[TWVL * 19]));
499 Tt = VFMA(Tp, Tq, VMUL(Tr, Ts));
500 T1z = VFNMS(Tr, Tq, VMUL(Tp, Ts));
501 }
502 Tu = VADD(To, Tt);
503 T2s = VADD(T1y, T1z);
504 T1A = VSUB(T1y, T1z);
505 T1B = VSUB(To, Tt);
506 T1C = VSUB(T1A, T1B);
507 T2c = VADD(T1B, T1A);
508 }
/* Inputs 14 and 6. */
509 {
510 V Tz, T1E, TE, T1F, T1D, T1G;
511 {
512 V Tw, Ty, Tv, Tx;
513 Tw = LD(&(ri[WS(rs, 14)]), ms, &(ri[0]));
514 Ty = LD(&(ii[WS(rs, 14)]), ms, &(ii[0]));
515 Tv = LDW(&(W[TWVL * 26]));
516 Tx = LDW(&(W[TWVL * 27]));
517 Tz = VFMA(Tv, Tw, VMUL(Tx, Ty));
518 T1E = VFNMS(Tx, Tw, VMUL(Tv, Ty));
519 }
520 {
521 V TB, TD, TA, TC;
522 TB = LD(&(ri[WS(rs, 6)]), ms, &(ri[0]));
523 TD = LD(&(ii[WS(rs, 6)]), ms, &(ii[0]));
524 TA = LDW(&(W[TWVL * 10]));
525 TC = LDW(&(W[TWVL * 11]));
526 TE = VFMA(TA, TB, VMUL(TC, TD));
527 T1F = VFNMS(TC, TB, VMUL(TA, TD));
528 }
529 TF = VADD(Tz, TE);
530 T2t = VADD(T1E, T1F);
531 T1D = VSUB(Tz, TE);
532 T1G = VSUB(T1E, T1F);
533 T1H = VADD(T1D, T1G);
534 T2d = VSUB(T1D, T1G);
535 }
/* Inputs 15, 11, 7 and 3. */
536 {
537 V T19, T20, T1p, T1X, T1e, T21, T1k, T1W;
538 {
539 V T16, T18, T15, T17;
540 T16 = LD(&(ri[WS(rs, 15)]), ms, &(ri[WS(rs, 1)]));
541 T18 = LD(&(ii[WS(rs, 15)]), ms, &(ii[WS(rs, 1)]));
542 T15 = LDW(&(W[TWVL * 28]));
543 T17 = LDW(&(W[TWVL * 29]));
544 T19 = VFMA(T15, T16, VMUL(T17, T18));
545 T20 = VFNMS(T17, T16, VMUL(T15, T18));
546 }
547 {
548 V T1m, T1o, T1l, T1n;
549 T1m = LD(&(ri[WS(rs, 11)]), ms, &(ri[WS(rs, 1)]));
550 T1o = LD(&(ii[WS(rs, 11)]), ms, &(ii[WS(rs, 1)]));
551 T1l = LDW(&(W[TWVL * 20]));
552 T1n = LDW(&(W[TWVL * 21]));
553 T1p = VFMA(T1l, T1m, VMUL(T1n, T1o));
554 T1X = VFNMS(T1n, T1m, VMUL(T1l, T1o));
555 }
556 {
557 V T1b, T1d, T1a, T1c;
558 T1b = LD(&(ri[WS(rs, 7)]), ms, &(ri[WS(rs, 1)]));
559 T1d = LD(&(ii[WS(rs, 7)]), ms, &(ii[WS(rs, 1)]));
560 T1a = LDW(&(W[TWVL * 12]));
561 T1c = LDW(&(W[TWVL * 13]));
562 T1e = VFMA(T1a, T1b, VMUL(T1c, T1d));
563 T21 = VFNMS(T1c, T1b, VMUL(T1a, T1d));
564 }
565 {
566 V T1h, T1j, T1g, T1i;
567 T1h = LD(&(ri[WS(rs, 3)]), ms, &(ri[WS(rs, 1)]));
568 T1j = LD(&(ii[WS(rs, 3)]), ms, &(ii[WS(rs, 1)]));
569 T1g = LDW(&(W[TWVL * 4]));
570 T1i = LDW(&(W[TWVL * 5]));
571 T1k = VFMA(T1g, T1h, VMUL(T1i, T1j));
572 T1W = VFNMS(T1i, T1h, VMUL(T1g, T1j));
573 }
574 T1f = VADD(T19, T1e);
575 T1q = VADD(T1k, T1p);
576 T2B = VSUB(T1f, T1q);
577 T2C = VADD(T20, T21);
578 T2D = VADD(T1W, T1X);
579 T2E = VSUB(T2C, T2D);
580 {
581 V T1V, T1Y, T22, T23;
582 T1V = VSUB(T19, T1e);
583 T1Y = VSUB(T1W, T1X);
584 T1Z = VSUB(T1V, T1Y);
585 T2j = VADD(T1V, T1Y);
586 T22 = VSUB(T20, T21);
587 T23 = VSUB(T1k, T1p);
588 T24 = VADD(T22, T23);
589 T2k = VSUB(T22, T23);
590 }
591 }
/* Inputs 1, 13, 9 and 5. */
592 {
593 V TM, T1K, T12, T1R, TR, T1L, TX, T1Q;
594 {
595 V TJ, TL, TI, TK;
596 TJ = LD(&(ri[WS(rs, 1)]), ms, &(ri[WS(rs, 1)]));
597 TL = LD(&(ii[WS(rs, 1)]), ms, &(ii[WS(rs, 1)]));
598 TI = LDW(&(W[0]));
599 TK = LDW(&(W[TWVL * 1]));
600 TM = VFMA(TI, TJ, VMUL(TK, TL));
601 T1K = VFNMS(TK, TJ, VMUL(TI, TL));
602 }
603 {
604 V TZ, T11, TY, T10;
605 TZ = LD(&(ri[WS(rs, 13)]), ms, &(ri[WS(rs, 1)]));
606 T11 = LD(&(ii[WS(rs, 13)]), ms, &(ii[WS(rs, 1)]));
607 TY = LDW(&(W[TWVL * 24]));
608 T10 = LDW(&(W[TWVL * 25]));
609 T12 = VFMA(TY, TZ, VMUL(T10, T11));
610 T1R = VFNMS(T10, TZ, VMUL(TY, T11));
611 }
612 {
613 V TO, TQ, TN, TP;
614 TO = LD(&(ri[WS(rs, 9)]), ms, &(ri[WS(rs, 1)]));
615 TQ = LD(&(ii[WS(rs, 9)]), ms, &(ii[WS(rs, 1)]));
616 TN = LDW(&(W[TWVL * 16]));
617 TP = LDW(&(W[TWVL * 17]));
618 TR = VFMA(TN, TO, VMUL(TP, TQ));
619 T1L = VFNMS(TP, TO, VMUL(TN, TQ));
620 }
621 {
622 V TU, TW, TT, TV;
623 TU = LD(&(ri[WS(rs, 5)]), ms, &(ri[WS(rs, 1)]));
624 TW = LD(&(ii[WS(rs, 5)]), ms, &(ii[WS(rs, 1)]));
625 TT = LDW(&(W[TWVL * 8]));
626 TV = LDW(&(W[TWVL * 9]));
627 TX = VFMA(TT, TU, VMUL(TV, TW));
628 T1Q = VFNMS(TV, TU, VMUL(TT, TW));
629 }
630 TS = VADD(TM, TR);
631 T13 = VADD(TX, T12);
632 T2w = VSUB(TS, T13);
633 T2x = VADD(T1K, T1L);
634 T2y = VADD(T1Q, T1R);
635 T2z = VSUB(T2x, T2y);
636 {
637 V T1M, T1N, T1P, T1S;
638 T1M = VSUB(T1K, T1L);
639 T1N = VSUB(TX, T12);
640 T1O = VADD(T1M, T1N);
641 T2g = VSUB(T1M, T1N);
642 T1P = VSUB(TM, TR);
643 T1S = VSUB(T1Q, T1R);
644 T1T = VSUB(T1P, T1S);
645 T2h = VADD(T1P, T1S);
646 }
647 }
/* Outputs 3, 7, 11 and 15 (ri and ii). */
648 {
649 V T1J, T27, T3g, T3i, T26, T3h, T2a, T3d;
650 {
651 V T1x, T1I, T3e, T3f;
652 T1x = VSUB(T1t, T1w);
653 T1I = VMUL(LDK(KP707106781), VSUB(T1C, T1H));
654 T1J = VADD(T1x, T1I);
655 T27 = VSUB(T1x, T1I);
656 T3e = VMUL(LDK(KP707106781), VSUB(T2d, T2c));
657 T3f = VADD(T38, T37);
658 T3g = VADD(T3e, T3f);
659 T3i = VSUB(T3f, T3e);
660 }
661 {
662 V T1U, T25, T28, T29;
663 T1U = VFMA(LDK(KP923879532), T1O, VMUL(LDK(KP382683432), T1T));
664 T25 = VFNMS(LDK(KP923879532), T24, VMUL(LDK(KP382683432), T1Z));
665 T26 = VADD(T1U, T25);
666 T3h = VSUB(T25, T1U);
667 T28 = VFNMS(LDK(KP923879532), T1T, VMUL(LDK(KP382683432), T1O));
668 T29 = VFMA(LDK(KP382683432), T24, VMUL(LDK(KP923879532), T1Z));
669 T2a = VSUB(T28, T29);
670 T3d = VADD(T28, T29);
671 }
672 ST(&(ri[WS(rs, 11)]), VSUB(T1J, T26), ms, &(ri[WS(rs, 1)]));
673 ST(&(ii[WS(rs, 11)]), VSUB(T3g, T3d), ms, &(ii[WS(rs, 1)]));
674 ST(&(ri[WS(rs, 3)]), VADD(T1J, T26), ms, &(ri[WS(rs, 1)]));
675 ST(&(ii[WS(rs, 3)]), VADD(T3d, T3g), ms, &(ii[WS(rs, 1)]));
676 ST(&(ri[WS(rs, 15)]), VSUB(T27, T2a), ms, &(ri[WS(rs, 1)]));
677 ST(&(ii[WS(rs, 15)]), VSUB(T3i, T3h), ms, &(ii[WS(rs, 1)]));
678 ST(&(ri[WS(rs, 7)]), VADD(T27, T2a), ms, &(ri[WS(rs, 1)]));
679 ST(&(ii[WS(rs, 7)]), VADD(T3h, T3i), ms, &(ii[WS(rs, 1)]));
680 }
/* Outputs 2, 6, 10 and 14 (ri and ii). */
681 {
682 V T2v, T2H, T32, T34, T2G, T33, T2K, T2Z;
683 {
684 V T2r, T2u, T30, T31;
685 T2r = VSUB(T7, Ti);
686 T2u = VSUB(T2s, T2t);
687 T2v = VADD(T2r, T2u);
688 T2H = VSUB(T2r, T2u);
689 T30 = VSUB(TF, Tu);
690 T31 = VSUB(T2U, T2R);
691 T32 = VADD(T30, T31);
692 T34 = VSUB(T31, T30);
693 }
694 {
695 V T2A, T2F, T2I, T2J;
696 T2A = VADD(T2w, T2z);
697 T2F = VSUB(T2B, T2E);
698 T2G = VMUL(LDK(KP707106781), VADD(T2A, T2F));
699 T33 = VMUL(LDK(KP707106781), VSUB(T2F, T2A));
700 T2I = VSUB(T2z, T2w);
701 T2J = VADD(T2B, T2E);
702 T2K = VMUL(LDK(KP707106781), VSUB(T2I, T2J));
703 T2Z = VMUL(LDK(KP707106781), VADD(T2I, T2J));
704 }
705 ST(&(ri[WS(rs, 10)]), VSUB(T2v, T2G), ms, &(ri[0]));
706 ST(&(ii[WS(rs, 10)]), VSUB(T32, T2Z), ms, &(ii[0]));
707 ST(&(ri[WS(rs, 2)]), VADD(T2v, T2G), ms, &(ri[0]));
708 ST(&(ii[WS(rs, 2)]), VADD(T2Z, T32), ms, &(ii[0]));
709 ST(&(ri[WS(rs, 14)]), VSUB(T2H, T2K), ms, &(ri[0]));
710 ST(&(ii[WS(rs, 14)]), VSUB(T34, T33), ms, &(ii[0]));
711 ST(&(ri[WS(rs, 6)]), VADD(T2H, T2K), ms, &(ri[0]));
712 ST(&(ii[WS(rs, 6)]), VADD(T33, T34), ms, &(ii[0]));
713 }
/* Outputs 1, 5, 9 and 13 (ri and ii). */
714 {
715 V T2f, T2n, T3a, T3c, T2m, T3b, T2q, T35;
716 {
717 V T2b, T2e, T36, T39;
718 T2b = VADD(T1t, T1w);
719 T2e = VMUL(LDK(KP707106781), VADD(T2c, T2d));
720 T2f = VADD(T2b, T2e);
721 T2n = VSUB(T2b, T2e);
722 T36 = VMUL(LDK(KP707106781), VADD(T1C, T1H));
723 T39 = VSUB(T37, T38);
724 T3a = VADD(T36, T39);
725 T3c = VSUB(T39, T36);
726 }
727 {
728 V T2i, T2l, T2o, T2p;
729 T2i = VFMA(LDK(KP382683432), T2g, VMUL(LDK(KP923879532), T2h));
730 T2l = VFNMS(LDK(KP382683432), T2k, VMUL(LDK(KP923879532), T2j));
731 T2m = VADD(T2i, T2l);
732 T3b = VSUB(T2l, T2i);
733 T2o = VFNMS(LDK(KP382683432), T2h, VMUL(LDK(KP923879532), T2g));
734 T2p = VFMA(LDK(KP923879532), T2k, VMUL(LDK(KP382683432), T2j));
735 T2q = VSUB(T2o, T2p);
736 T35 = VADD(T2o, T2p);
737 }
738 ST(&(ri[WS(rs, 9)]), VSUB(T2f, T2m), ms, &(ri[WS(rs, 1)]));
739 ST(&(ii[WS(rs, 9)]), VSUB(T3a, T35), ms, &(ii[WS(rs, 1)]));
740 ST(&(ri[WS(rs, 1)]), VADD(T2f, T2m), ms, &(ri[WS(rs, 1)]));
741 ST(&(ii[WS(rs, 1)]), VADD(T35, T3a), ms, &(ii[WS(rs, 1)]));
742 ST(&(ri[WS(rs, 13)]), VSUB(T2n, T2q), ms, &(ri[WS(rs, 1)]));
743 ST(&(ii[WS(rs, 13)]), VSUB(T3c, T3b), ms, &(ii[WS(rs, 1)]));
744 ST(&(ri[WS(rs, 5)]), VADD(T2n, T2q), ms, &(ri[WS(rs, 1)]));
745 ST(&(ii[WS(rs, 5)]), VADD(T3b, T3c), ms, &(ii[WS(rs, 1)]));
746 }
/* Outputs 0, 4, 8 and 12 (ri and ii); these need no constant scaling. */
747 {
748 V TH, T2L, T2W, T2Y, T1s, T2X, T2O, T2P;
749 {
750 V Tj, TG, T2Q, T2V;
751 Tj = VADD(T7, Ti);
752 TG = VADD(Tu, TF);
753 TH = VADD(Tj, TG);
754 T2L = VSUB(Tj, TG);
755 T2Q = VADD(T2s, T2t);
756 T2V = VADD(T2R, T2U);
757 T2W = VADD(T2Q, T2V);
758 T2Y = VSUB(T2V, T2Q);
759 }
760 {
761 V T14, T1r, T2M, T2N;
762 T14 = VADD(TS, T13);
763 T1r = VADD(T1f, T1q);
764 T1s = VADD(T14, T1r);
765 T2X = VSUB(T1r, T14);
766 T2M = VADD(T2x, T2y);
767 T2N = VADD(T2C, T2D);
768 T2O = VSUB(T2M, T2N);
769 T2P = VADD(T2M, T2N);
770 }
771 ST(&(ri[WS(rs, 8)]), VSUB(TH, T1s), ms, &(ri[0]));
772 ST(&(ii[WS(rs, 8)]), VSUB(T2W, T2P), ms, &(ii[0]));
773 ST(&(ri[0]), VADD(TH, T1s), ms, &(ri[0]));
774 ST(&(ii[0]), VADD(T2P, T2W), ms, &(ii[0]));
775 ST(&(ri[WS(rs, 12)]), VSUB(T2L, T2O), ms, &(ri[0]));
776 ST(&(ii[WS(rs, 12)]), VSUB(T2Y, T2X), ms, &(ii[0]));
777 ST(&(ri[WS(rs, 4)]), VADD(T2L, T2O), ms, &(ri[0]));
778 ST(&(ii[WS(rs, 4)]), VADD(T2X, T2Y), ms, &(ii[0]));
779 }
780 }
781 }
/* VLEAVE() is a macro from the SIMD support headers, invoked once after the
 * loop; its effect is not visible in this file. */
782 VLEAVE();
783 }
784
/*
 * Twiddle instruction table referenced by desc below (non-FMA build):
 * fifteen VTW(0, k) entries for k = 1..15, closed by a TW_NEXT entry that
 * carries (2 * VL). The contents match the table in the HAVE_FMA branch.
 * NOTE(review): presumably this describes the layout of the W array that
 * t1sv_16 reads (30 values per m step) -- confirm against the VTW/TW_NEXT
 * definitions, which are not in this file.
 */
785 static const tw_instr twinstr[] = {
786 VTW(0, 1),
787 VTW(0, 2),
788 VTW(0, 3),
789 VTW(0, 4),
790 VTW(0, 5),
791 VTW(0, 6),
792 VTW(0, 7),
793 VTW(0, 8),
794 VTW(0, 9),
795 VTW(0, 10),
796 VTW(0, 11),
797 VTW(0, 12),
798 VTW(0, 13),
799 VTW(0, 14),
800 VTW(0, 15),
801 {TW_NEXT, (2 * VL), 0}
802 };
803
/*
 * Codelet descriptor passed at registration (non-FMA build): 16, the name
 * string "t1sv_16", the twinstr table, &GENUS, and the tuple {136, 46, 38, 0},
 * whose first three values equal the "136 additions, 46 multiplications,
 * 38 fused multiply/add" figures in the count comment above the function.
 * NOTE(review): the meaning of the three trailing zeros depends on the
 * ct_desc layout, which is declared outside this file.
 */
804 static const ct_desc desc = { 16, XSIMD_STRING("t1sv_16"), twinstr, &GENUS, {136, 46, 38, 0}, 0, 0, 0 };
805
/*
 * Registration entry point (non-FMA build): hands the t1sv_16 function and
 * its descriptor desc to planner p through X(kdft_dit_register).
 */
806 void XSIMD(codelet_t1sv_16) (planner *p) {
807 X(kdft_dit_register) (p, t1sv_16, &desc);
808 }
809 #endif /* HAVE_FMA */