comparison src/fftw-3.3.8/dft/simd/common/t1sv_16.c @ 167:bd3cc4d1df30

Add FFTW 3.3.8 source, and a Linux build
author Chris Cannam <cannam@all-day-breakfast.com>
date Tue, 19 Nov 2019 14:52:55 +0000
comparing 166:cbd6d7e562c7 with 167:bd3cc4d1df30
1 /*
2 * Copyright (c) 2003, 2007-14 Matteo Frigo
3 * Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
4 *
5 * This program is free software; you can redistribute it and/or modify
6 * it under the terms of the GNU General Public License as published by
7 * the Free Software Foundation; either version 2 of the License, or
8 * (at your option) any later version.
9 *
10 * This program is distributed in the hope that it will be useful,
11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 * GNU General Public License for more details.
14 *
15 * You should have received a copy of the GNU General Public License
16 * along with this program; if not, write to the Free Software
17 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
18 *
19 */
20
21 /* This file was automatically generated --- DO NOT EDIT */
22 /* Generated on Thu May 24 08:06:10 EDT 2018 */
23
24 #include "dft/codelet-dft.h"
25
26 #if defined(ARCH_PREFERS_FMA) || defined(ISA_EXTENSION_PREFERS_FMA)
27
28 /* Generated by: ../../../genfft/gen_twiddle.native -fma -simd -compact -variables 4 -pipeline-latency 8 -n 16 -name t1sv_16 -include dft/simd/ts.h */
29
30 /*
31 * This function contains 174 FP additions, 100 FP multiplications,
32 * (or, 104 additions, 30 multiplications, 70 fused multiply/add),
33 * 60 stack variables, 3 constants, and 64 memory accesses
34 */
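/* Arithmetic check on the counts above: 104 + 70 = 174 additions and
 * 30 + 70 = 100 multiplications, once each fused multiply/add is counted
 * as one addition and one multiplication. */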
35 #include "dft/simd/ts.h"
36
37 static void t1sv_16(R *ri, R *ii, const R *W, stride rs, INT mb, INT me, INT ms)
38 {
39 DVK(KP923879532, +0.923879532511286756128183189396788286822416626);
40 DVK(KP414213562, +0.414213562373095048801688724209698078569671875);
41 DVK(KP707106781, +0.707106781186547524400844362104849039284835938);
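/* These fixed constants are 1/sqrt(2) = cos(pi/4) (KP707106781),
 * sqrt(2) - 1 = tan(pi/8) (KP414213562) and cos(pi/8) (KP923879532);
 * they belong to the radix-16 butterflies themselves, while the per-m
 * twiddle factors are loaded from W inside the loop. */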
42 {
43 INT m;
44 for (m = mb, W = W + (mb * 30); m < me; m = m + (2 * VL), ri = ri + ((2 * VL) * ms), ii = ii + ((2 * VL) * ms), W = W + ((2 * VL) * 30), MAKE_VOLATILE_STRIDE(32, rs)) {
45 V T8, T3z, T1I, T3o, T1s, T35, T2o, T2r, T1F, T36, T2p, T2w, Tl, T3A, T1N;
46 V T3k, Tz, T2V, T1T, T1U, T11, T30, T29, T2c, T1e, T31, T2a, T2h, TM, T2W;
47 V T1W, T21;
48 {
49 V T1, T3n, T3, T6, T4, T3l, T2, T7, T3m, T5;
50 T1 = LD(&(ri[0]), ms, &(ri[0]));
51 T3n = LD(&(ii[0]), ms, &(ii[0]));
52 T3 = LD(&(ri[WS(rs, 8)]), ms, &(ri[0]));
53 T6 = LD(&(ii[WS(rs, 8)]), ms, &(ii[0]));
54 T2 = LDW(&(W[TWVL * 14]));
55 T4 = VMUL(T2, T3);
56 T3l = VMUL(T2, T6);
57 T5 = LDW(&(W[TWVL * 15]));
58 T7 = VFMA(T5, T6, T4);
59 T3m = VFNMS(T5, T3, T3l);
60 T8 = VADD(T1, T7);
61 T3z = VSUB(T3n, T3m);
62 T1I = VSUB(T1, T7);
63 T3o = VADD(T3m, T3n);
64 }
65 {
66 V T1h, T1k, T1i, T2k, T1n, T1q, T1o, T2m, T1g, T1m;
67 T1h = LD(&(ri[WS(rs, 15)]), ms, &(ri[WS(rs, 1)]));
68 T1k = LD(&(ii[WS(rs, 15)]), ms, &(ii[WS(rs, 1)]));
69 T1g = LDW(&(W[TWVL * 28]));
70 T1i = VMUL(T1g, T1h);
71 T2k = VMUL(T1g, T1k);
72 T1n = LD(&(ri[WS(rs, 7)]), ms, &(ri[WS(rs, 1)]));
73 T1q = LD(&(ii[WS(rs, 7)]), ms, &(ii[WS(rs, 1)]));
74 T1m = LDW(&(W[TWVL * 12]));
75 T1o = VMUL(T1m, T1n);
76 T2m = VMUL(T1m, T1q);
77 {
78 V T1l, T2l, T1r, T2n, T1j, T1p;
79 T1j = LDW(&(W[TWVL * 29]));
80 T1l = VFMA(T1j, T1k, T1i);
81 T2l = VFNMS(T1j, T1h, T2k);
82 T1p = LDW(&(W[TWVL * 13]));
83 T1r = VFMA(T1p, T1q, T1o);
84 T2n = VFNMS(T1p, T1n, T2m);
85 T1s = VADD(T1l, T1r);
86 T35 = VADD(T2l, T2n);
87 T2o = VSUB(T2l, T2n);
88 T2r = VSUB(T1l, T1r);
89 }
90 }
91 {
92 V T1u, T1x, T1v, T2s, T1A, T1D, T1B, T2u, T1t, T1z;
93 T1u = LD(&(ri[WS(rs, 3)]), ms, &(ri[WS(rs, 1)]));
94 T1x = LD(&(ii[WS(rs, 3)]), ms, &(ii[WS(rs, 1)]));
95 T1t = LDW(&(W[TWVL * 4]));
96 T1v = VMUL(T1t, T1u);
97 T2s = VMUL(T1t, T1x);
98 T1A = LD(&(ri[WS(rs, 11)]), ms, &(ri[WS(rs, 1)]));
99 T1D = LD(&(ii[WS(rs, 11)]), ms, &(ii[WS(rs, 1)]));
100 T1z = LDW(&(W[TWVL * 20]));
101 T1B = VMUL(T1z, T1A);
102 T2u = VMUL(T1z, T1D);
103 {
104 V T1y, T2t, T1E, T2v, T1w, T1C;
105 T1w = LDW(&(W[TWVL * 5]));
106 T1y = VFMA(T1w, T1x, T1v);
107 T2t = VFNMS(T1w, T1u, T2s);
108 T1C = LDW(&(W[TWVL * 21]));
109 T1E = VFMA(T1C, T1D, T1B);
110 T2v = VFNMS(T1C, T1A, T2u);
111 T1F = VADD(T1y, T1E);
112 T36 = VADD(T2t, T2v);
113 T2p = VSUB(T1y, T1E);
114 T2w = VSUB(T2t, T2v);
115 }
116 }
117 {
118 V Ta, Td, Tb, T1J, Tg, Tj, Th, T1L, T9, Tf;
119 Ta = LD(&(ri[WS(rs, 4)]), ms, &(ri[0]));
120 Td = LD(&(ii[WS(rs, 4)]), ms, &(ii[0]));
121 T9 = LDW(&(W[TWVL * 6]));
122 Tb = VMUL(T9, Ta);
123 T1J = VMUL(T9, Td);
124 Tg = LD(&(ri[WS(rs, 12)]), ms, &(ri[0]));
125 Tj = LD(&(ii[WS(rs, 12)]), ms, &(ii[0]));
126 Tf = LDW(&(W[TWVL * 22]));
127 Th = VMUL(Tf, Tg);
128 T1L = VMUL(Tf, Tj);
129 {
130 V Te, T1K, Tk, T1M, Tc, Ti;
131 Tc = LDW(&(W[TWVL * 7]));
132 Te = VFMA(Tc, Td, Tb);
133 T1K = VFNMS(Tc, Ta, T1J);
134 Ti = LDW(&(W[TWVL * 23]));
135 Tk = VFMA(Ti, Tj, Th);
136 T1M = VFNMS(Ti, Tg, T1L);
137 Tl = VADD(Te, Tk);
138 T3A = VSUB(Te, Tk);
139 T1N = VSUB(T1K, T1M);
140 T3k = VADD(T1K, T1M);
141 }
142 }
143 {
144 V To, Tr, Tp, T1P, Tu, Tx, Tv, T1R, Tn, Tt;
145 To = LD(&(ri[WS(rs, 2)]), ms, &(ri[0]));
146 Tr = LD(&(ii[WS(rs, 2)]), ms, &(ii[0]));
147 Tn = LDW(&(W[TWVL * 2]));
148 Tp = VMUL(Tn, To);
149 T1P = VMUL(Tn, Tr);
150 Tu = LD(&(ri[WS(rs, 10)]), ms, &(ri[0]));
151 Tx = LD(&(ii[WS(rs, 10)]), ms, &(ii[0]));
152 Tt = LDW(&(W[TWVL * 18]));
153 Tv = VMUL(Tt, Tu);
154 T1R = VMUL(Tt, Tx);
155 {
156 V Ts, T1Q, Ty, T1S, Tq, Tw;
157 Tq = LDW(&(W[TWVL * 3]));
158 Ts = VFMA(Tq, Tr, Tp);
159 T1Q = VFNMS(Tq, To, T1P);
160 Tw = LDW(&(W[TWVL * 19]));
161 Ty = VFMA(Tw, Tx, Tv);
162 T1S = VFNMS(Tw, Tu, T1R);
163 Tz = VADD(Ts, Ty);
164 T2V = VADD(T1Q, T1S);
165 T1T = VSUB(T1Q, T1S);
166 T1U = VSUB(Ts, Ty);
167 }
168 }
169 {
170 V TQ, TT, TR, T25, TW, TZ, TX, T27, TP, TV;
171 TQ = LD(&(ri[WS(rs, 1)]), ms, &(ri[WS(rs, 1)]));
172 TT = LD(&(ii[WS(rs, 1)]), ms, &(ii[WS(rs, 1)]));
173 TP = LDW(&(W[0]));
174 TR = VMUL(TP, TQ);
175 T25 = VMUL(TP, TT);
176 TW = LD(&(ri[WS(rs, 9)]), ms, &(ri[WS(rs, 1)]));
177 TZ = LD(&(ii[WS(rs, 9)]), ms, &(ii[WS(rs, 1)]));
178 TV = LDW(&(W[TWVL * 16]));
179 TX = VMUL(TV, TW);
180 T27 = VMUL(TV, TZ);
181 {
182 V TU, T26, T10, T28, TS, TY;
183 TS = LDW(&(W[TWVL * 1]));
184 TU = VFMA(TS, TT, TR);
185 T26 = VFNMS(TS, TQ, T25);
186 TY = LDW(&(W[TWVL * 17]));
187 T10 = VFMA(TY, TZ, TX);
188 T28 = VFNMS(TY, TW, T27);
189 T11 = VADD(TU, T10);
190 T30 = VADD(T26, T28);
191 T29 = VSUB(T26, T28);
192 T2c = VSUB(TU, T10);
193 }
194 }
195 {
196 V T13, T16, T14, T2d, T19, T1c, T1a, T2f, T12, T18;
197 T13 = LD(&(ri[WS(rs, 5)]), ms, &(ri[WS(rs, 1)]));
198 T16 = LD(&(ii[WS(rs, 5)]), ms, &(ii[WS(rs, 1)]));
199 T12 = LDW(&(W[TWVL * 8]));
200 T14 = VMUL(T12, T13);
201 T2d = VMUL(T12, T16);
202 T19 = LD(&(ri[WS(rs, 13)]), ms, &(ri[WS(rs, 1)]));
203 T1c = LD(&(ii[WS(rs, 13)]), ms, &(ii[WS(rs, 1)]));
204 T18 = LDW(&(W[TWVL * 24]));
205 T1a = VMUL(T18, T19);
206 T2f = VMUL(T18, T1c);
207 {
208 V T17, T2e, T1d, T2g, T15, T1b;
209 T15 = LDW(&(W[TWVL * 9]));
210 T17 = VFMA(T15, T16, T14);
211 T2e = VFNMS(T15, T13, T2d);
212 T1b = LDW(&(W[TWVL * 25]));
213 T1d = VFMA(T1b, T1c, T1a);
214 T2g = VFNMS(T1b, T19, T2f);
215 T1e = VADD(T17, T1d);
216 T31 = VADD(T2e, T2g);
217 T2a = VSUB(T17, T1d);
218 T2h = VSUB(T2e, T2g);
219 }
220 }
221 {
222 V TB, TE, TC, T1X, TH, TK, TI, T1Z, TA, TG;
223 TB = LD(&(ri[WS(rs, 14)]), ms, &(ri[0]));
224 TE = LD(&(ii[WS(rs, 14)]), ms, &(ii[0]));
225 TA = LDW(&(W[TWVL * 26]));
226 TC = VMUL(TA, TB);
227 T1X = VMUL(TA, TE);
228 TH = LD(&(ri[WS(rs, 6)]), ms, &(ri[0]));
229 TK = LD(&(ii[WS(rs, 6)]), ms, &(ii[0]));
230 TG = LDW(&(W[TWVL * 10]));
231 TI = VMUL(TG, TH);
232 T1Z = VMUL(TG, TK);
233 {
234 V TF, T1Y, TL, T20, TD, TJ;
235 TD = LDW(&(W[TWVL * 27]));
236 TF = VFMA(TD, TE, TC);
237 T1Y = VFNMS(TD, TB, T1X);
238 TJ = LDW(&(W[TWVL * 11]));
239 TL = VFMA(TJ, TK, TI);
240 T20 = VFNMS(TJ, TH, T1Z);
241 TM = VADD(TF, TL);
242 T2W = VADD(T1Y, T20);
243 T1W = VSUB(TF, TL);
244 T21 = VSUB(T1Y, T20);
245 }
246 }
247 {
248 V TO, T3e, T3q, T3s, T1H, T3r, T3h, T3i;
249 {
250 V Tm, TN, T3j, T3p;
251 Tm = VADD(T8, Tl);
252 TN = VADD(Tz, TM);
253 TO = VADD(Tm, TN);
254 T3e = VSUB(Tm, TN);
255 T3j = VADD(T2V, T2W);
256 T3p = VADD(T3k, T3o);
257 T3q = VADD(T3j, T3p);
258 T3s = VSUB(T3p, T3j);
259 }
260 {
261 V T1f, T1G, T3f, T3g;
262 T1f = VADD(T11, T1e);
263 T1G = VADD(T1s, T1F);
264 T1H = VADD(T1f, T1G);
265 T3r = VSUB(T1G, T1f);
266 T3f = VADD(T30, T31);
267 T3g = VADD(T35, T36);
268 T3h = VSUB(T3f, T3g);
269 T3i = VADD(T3f, T3g);
270 }
271 ST(&(ri[WS(rs, 8)]), VSUB(TO, T1H), ms, &(ri[0]));
272 ST(&(ii[WS(rs, 8)]), VSUB(T3q, T3i), ms, &(ii[0]));
273 ST(&(ri[0]), VADD(TO, T1H), ms, &(ri[0]));
274 ST(&(ii[0]), VADD(T3i, T3q), ms, &(ii[0]));
275 ST(&(ri[WS(rs, 12)]), VSUB(T3e, T3h), ms, &(ri[0]));
276 ST(&(ii[WS(rs, 12)]), VSUB(T3s, T3r), ms, &(ii[0]));
277 ST(&(ri[WS(rs, 4)]), VADD(T3e, T3h), ms, &(ri[0]));
278 ST(&(ii[WS(rs, 4)]), VADD(T3r, T3s), ms, &(ii[0]));
279 }
280 {
281 V T2Y, T3a, T3v, T3x, T33, T3b, T38, T3c;
282 {
283 V T2U, T2X, T3t, T3u;
284 T2U = VSUB(T8, Tl);
285 T2X = VSUB(T2V, T2W);
286 T2Y = VADD(T2U, T2X);
287 T3a = VSUB(T2U, T2X);
288 T3t = VSUB(TM, Tz);
289 T3u = VSUB(T3o, T3k);
290 T3v = VADD(T3t, T3u);
291 T3x = VSUB(T3u, T3t);
292 }
293 {
294 V T2Z, T32, T34, T37;
295 T2Z = VSUB(T11, T1e);
296 T32 = VSUB(T30, T31);
297 T33 = VADD(T2Z, T32);
298 T3b = VSUB(T32, T2Z);
299 T34 = VSUB(T1s, T1F);
300 T37 = VSUB(T35, T36);
301 T38 = VSUB(T34, T37);
302 T3c = VADD(T34, T37);
303 }
304 {
305 V T39, T3w, T3d, T3y;
306 T39 = VADD(T33, T38);
307 ST(&(ri[WS(rs, 10)]), VFNMS(LDK(KP707106781), T39, T2Y), ms, &(ri[0]));
308 ST(&(ri[WS(rs, 2)]), VFMA(LDK(KP707106781), T39, T2Y), ms, &(ri[0]));
309 T3w = VADD(T3b, T3c);
310 ST(&(ii[WS(rs, 2)]), VFMA(LDK(KP707106781), T3w, T3v), ms, &(ii[0]));
311 ST(&(ii[WS(rs, 10)]), VFNMS(LDK(KP707106781), T3w, T3v), ms, &(ii[0]));
312 T3d = VSUB(T3b, T3c);
313 ST(&(ri[WS(rs, 14)]), VFNMS(LDK(KP707106781), T3d, T3a), ms, &(ri[0]));
314 ST(&(ri[WS(rs, 6)]), VFMA(LDK(KP707106781), T3d, T3a), ms, &(ri[0]));
315 T3y = VSUB(T38, T33);
316 ST(&(ii[WS(rs, 6)]), VFMA(LDK(KP707106781), T3y, T3x), ms, &(ii[0]));
317 ST(&(ii[WS(rs, 14)]), VFNMS(LDK(KP707106781), T3y, T3x), ms, &(ii[0]));
318 }
319 }
320 {
321 V T1O, T3B, T3H, T2E, T23, T3C, T2O, T2S, T2H, T3I, T2j, T2B, T2L, T2R, T2y;
322 V T2C;
323 {
324 V T1V, T22, T2b, T2i;
325 T1O = VSUB(T1I, T1N);
326 T3B = VSUB(T3z, T3A);
327 T3H = VADD(T3A, T3z);
328 T2E = VADD(T1I, T1N);
329 T1V = VSUB(T1T, T1U);
330 T22 = VADD(T1W, T21);
331 T23 = VSUB(T1V, T22);
332 T3C = VADD(T1V, T22);
333 {
334 V T2M, T2N, T2F, T2G;
335 T2M = VADD(T2r, T2w);
336 T2N = VSUB(T2o, T2p);
337 T2O = VFNMS(LDK(KP414213562), T2N, T2M);
338 T2S = VFMA(LDK(KP414213562), T2M, T2N);
339 T2F = VADD(T1U, T1T);
340 T2G = VSUB(T1W, T21);
341 T2H = VADD(T2F, T2G);
342 T3I = VSUB(T2G, T2F);
343 }
344 T2b = VADD(T29, T2a);
345 T2i = VSUB(T2c, T2h);
346 T2j = VFMA(LDK(KP414213562), T2i, T2b);
347 T2B = VFNMS(LDK(KP414213562), T2b, T2i);
348 {
349 V T2J, T2K, T2q, T2x;
350 T2J = VADD(T2c, T2h);
351 T2K = VSUB(T29, T2a);
352 T2L = VFMA(LDK(KP414213562), T2K, T2J);
353 T2R = VFNMS(LDK(KP414213562), T2J, T2K);
354 T2q = VADD(T2o, T2p);
355 T2x = VSUB(T2r, T2w);
356 T2y = VFNMS(LDK(KP414213562), T2x, T2q);
357 T2C = VFMA(LDK(KP414213562), T2q, T2x);
358 }
359 }
360 {
361 V T24, T2z, T3J, T3K;
362 T24 = VFMA(LDK(KP707106781), T23, T1O);
363 T2z = VSUB(T2j, T2y);
364 ST(&(ri[WS(rs, 11)]), VFNMS(LDK(KP923879532), T2z, T24), ms, &(ri[WS(rs, 1)]));
365 ST(&(ri[WS(rs, 3)]), VFMA(LDK(KP923879532), T2z, T24), ms, &(ri[WS(rs, 1)]));
366 T3J = VFMA(LDK(KP707106781), T3I, T3H);
367 T3K = VSUB(T2C, T2B);
368 ST(&(ii[WS(rs, 3)]), VFMA(LDK(KP923879532), T3K, T3J), ms, &(ii[WS(rs, 1)]));
369 ST(&(ii[WS(rs, 11)]), VFNMS(LDK(KP923879532), T3K, T3J), ms, &(ii[WS(rs, 1)]));
370 }
371 {
372 V T2A, T2D, T3L, T3M;
373 T2A = VFNMS(LDK(KP707106781), T23, T1O);
374 T2D = VADD(T2B, T2C);
375 ST(&(ri[WS(rs, 7)]), VFNMS(LDK(KP923879532), T2D, T2A), ms, &(ri[WS(rs, 1)]));
376 ST(&(ri[WS(rs, 15)]), VFMA(LDK(KP923879532), T2D, T2A), ms, &(ri[WS(rs, 1)]));
377 T3L = VFNMS(LDK(KP707106781), T3I, T3H);
378 T3M = VADD(T2j, T2y);
379 ST(&(ii[WS(rs, 7)]), VFNMS(LDK(KP923879532), T3M, T3L), ms, &(ii[WS(rs, 1)]));
380 ST(&(ii[WS(rs, 15)]), VFMA(LDK(KP923879532), T3M, T3L), ms, &(ii[WS(rs, 1)]));
381 }
382 {
383 V T2I, T2P, T3D, T3E;
384 T2I = VFMA(LDK(KP707106781), T2H, T2E);
385 T2P = VADD(T2L, T2O);
386 ST(&(ri[WS(rs, 9)]), VFNMS(LDK(KP923879532), T2P, T2I), ms, &(ri[WS(rs, 1)]));
387 ST(&(ri[WS(rs, 1)]), VFMA(LDK(KP923879532), T2P, T2I), ms, &(ri[WS(rs, 1)]));
388 T3D = VFMA(LDK(KP707106781), T3C, T3B);
389 T3E = VADD(T2R, T2S);
390 ST(&(ii[WS(rs, 1)]), VFMA(LDK(KP923879532), T3E, T3D), ms, &(ii[WS(rs, 1)]));
391 ST(&(ii[WS(rs, 9)]), VFNMS(LDK(KP923879532), T3E, T3D), ms, &(ii[WS(rs, 1)]));
392 }
393 {
394 V T2Q, T2T, T3F, T3G;
395 T2Q = VFNMS(LDK(KP707106781), T2H, T2E);
396 T2T = VSUB(T2R, T2S);
397 ST(&(ri[WS(rs, 13)]), VFNMS(LDK(KP923879532), T2T, T2Q), ms, &(ri[WS(rs, 1)]));
398 ST(&(ri[WS(rs, 5)]), VFMA(LDK(KP923879532), T2T, T2Q), ms, &(ri[WS(rs, 1)]));
399 T3F = VFNMS(LDK(KP707106781), T3C, T3B);
400 T3G = VSUB(T2O, T2L);
401 ST(&(ii[WS(rs, 5)]), VFMA(LDK(KP923879532), T3G, T3F), ms, &(ii[WS(rs, 1)]));
402 ST(&(ii[WS(rs, 13)]), VFNMS(LDK(KP923879532), T3G, T3F), ms, &(ii[WS(rs, 1)]));
403 }
404 }
405 }
406 }
407 VLEAVE();
408 }
409
410 static const tw_instr twinstr[] = {
411 VTW(0, 1),
412 VTW(0, 2),
413 VTW(0, 3),
414 VTW(0, 4),
415 VTW(0, 5),
416 VTW(0, 6),
417 VTW(0, 7),
418 VTW(0, 8),
419 VTW(0, 9),
420 VTW(0, 10),
421 VTW(0, 11),
422 VTW(0, 12),
423 VTW(0, 13),
424 VTW(0, 14),
425 VTW(0, 15),
426 {TW_NEXT, (2 * VL), 0}
427 };
428
429 static const ct_desc desc = { 16, XSIMD_STRING("t1sv_16"), twinstr, &GENUS, {104, 30, 70, 0}, 0, 0, 0 };
430
431 void XSIMD(codelet_t1sv_16) (planner *p) {
432 X(kdft_dit_register) (p, t1sv_16, &desc);
433 }
434 #else
435
436 /* Generated by: ../../../genfft/gen_twiddle.native -simd -compact -variables 4 -pipeline-latency 8 -n 16 -name t1sv_16 -include dft/simd/ts.h */
437
438 /*
439 * This function contains 174 FP additions, 84 FP multiplications,
440 * (or, 136 additions, 46 multiplications, 38 fused multiply/add),
441 * 52 stack variables, 3 constants, and 64 memory accesses
442 */
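/* Arithmetic check on the counts above: 136 + 38 = 174 additions and
 * 46 + 38 = 84 multiplications, once each fused multiply/add is counted
 * as one addition and one multiplication. */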
443 #include "dft/simd/ts.h"
444
445 static void t1sv_16(R *ri, R *ii, const R *W, stride rs, INT mb, INT me, INT ms)
446 {
447 DVK(KP382683432, +0.382683432365089771728459984030398866761344562);
448 DVK(KP923879532, +0.923879532511286756128183189396788286822416626);
449 DVK(KP707106781, +0.707106781186547524400844362104849039284835938);
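/* Here the fixed constants are sin(pi/8) (KP382683432), cos(pi/8)
 * (KP923879532) and 1/sqrt(2) = cos(pi/4) (KP707106781); as in the FMA
 * variant, the per-m twiddle factors come from W. */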
450 {
451 INT m;
452 for (m = mb, W = W + (mb * 30); m < me; m = m + (2 * VL), ri = ri + ((2 * VL) * ms), ii = ii + ((2 * VL) * ms), W = W + ((2 * VL) * 30), MAKE_VOLATILE_STRIDE(32, rs)) {
453 V T7, T37, T1t, T2U, Ti, T38, T1w, T2R, Tu, T2s, T1C, T2c, TF, T2t, T1H;
454 V T2d, T1f, T1q, T2B, T2C, T2D, T2E, T1Z, T2j, T24, T2k, TS, T13, T2w, T2x;
455 V T2y, T2z, T1O, T2g, T1T, T2h;
456 {
457 V T1, T2T, T6, T2S;
458 T1 = LD(&(ri[0]), ms, &(ri[0]));
459 T2T = LD(&(ii[0]), ms, &(ii[0]));
460 {
461 V T3, T5, T2, T4;
462 T3 = LD(&(ri[WS(rs, 8)]), ms, &(ri[0]));
463 T5 = LD(&(ii[WS(rs, 8)]), ms, &(ii[0]));
464 T2 = LDW(&(W[TWVL * 14]));
465 T4 = LDW(&(W[TWVL * 15]));
466 T6 = VFMA(T2, T3, VMUL(T4, T5));
467 T2S = VFNMS(T4, T3, VMUL(T2, T5));
468 }
469 T7 = VADD(T1, T6);
470 T37 = VSUB(T2T, T2S);
471 T1t = VSUB(T1, T6);
472 T2U = VADD(T2S, T2T);
473 }
474 {
475 V Tc, T1u, Th, T1v;
476 {
477 V T9, Tb, T8, Ta;
478 T9 = LD(&(ri[WS(rs, 4)]), ms, &(ri[0]));
479 Tb = LD(&(ii[WS(rs, 4)]), ms, &(ii[0]));
480 T8 = LDW(&(W[TWVL * 6]));
481 Ta = LDW(&(W[TWVL * 7]));
482 Tc = VFMA(T8, T9, VMUL(Ta, Tb));
483 T1u = VFNMS(Ta, T9, VMUL(T8, Tb));
484 }
485 {
486 V Te, Tg, Td, Tf;
487 Te = LD(&(ri[WS(rs, 12)]), ms, &(ri[0]));
488 Tg = LD(&(ii[WS(rs, 12)]), ms, &(ii[0]));
489 Td = LDW(&(W[TWVL * 22]));
490 Tf = LDW(&(W[TWVL * 23]));
491 Th = VFMA(Td, Te, VMUL(Tf, Tg));
492 T1v = VFNMS(Tf, Te, VMUL(Td, Tg));
493 }
494 Ti = VADD(Tc, Th);
495 T38 = VSUB(Tc, Th);
496 T1w = VSUB(T1u, T1v);
497 T2R = VADD(T1u, T1v);
498 }
499 {
500 V To, T1y, Tt, T1z, T1A, T1B;
501 {
502 V Tl, Tn, Tk, Tm;
503 Tl = LD(&(ri[WS(rs, 2)]), ms, &(ri[0]));
504 Tn = LD(&(ii[WS(rs, 2)]), ms, &(ii[0]));
505 Tk = LDW(&(W[TWVL * 2]));
506 Tm = LDW(&(W[TWVL * 3]));
507 To = VFMA(Tk, Tl, VMUL(Tm, Tn));
508 T1y = VFNMS(Tm, Tl, VMUL(Tk, Tn));
509 }
510 {
511 V Tq, Ts, Tp, Tr;
512 Tq = LD(&(ri[WS(rs, 10)]), ms, &(ri[0]));
513 Ts = LD(&(ii[WS(rs, 10)]), ms, &(ii[0]));
514 Tp = LDW(&(W[TWVL * 18]));
515 Tr = LDW(&(W[TWVL * 19]));
516 Tt = VFMA(Tp, Tq, VMUL(Tr, Ts));
517 T1z = VFNMS(Tr, Tq, VMUL(Tp, Ts));
518 }
519 Tu = VADD(To, Tt);
520 T2s = VADD(T1y, T1z);
521 T1A = VSUB(T1y, T1z);
522 T1B = VSUB(To, Tt);
523 T1C = VSUB(T1A, T1B);
524 T2c = VADD(T1B, T1A);
525 }
526 {
527 V Tz, T1E, TE, T1F, T1D, T1G;
528 {
529 V Tw, Ty, Tv, Tx;
530 Tw = LD(&(ri[WS(rs, 14)]), ms, &(ri[0]));
531 Ty = LD(&(ii[WS(rs, 14)]), ms, &(ii[0]));
532 Tv = LDW(&(W[TWVL * 26]));
533 Tx = LDW(&(W[TWVL * 27]));
534 Tz = VFMA(Tv, Tw, VMUL(Tx, Ty));
535 T1E = VFNMS(Tx, Tw, VMUL(Tv, Ty));
536 }
537 {
538 V TB, TD, TA, TC;
539 TB = LD(&(ri[WS(rs, 6)]), ms, &(ri[0]));
540 TD = LD(&(ii[WS(rs, 6)]), ms, &(ii[0]));
541 TA = LDW(&(W[TWVL * 10]));
542 TC = LDW(&(W[TWVL * 11]));
543 TE = VFMA(TA, TB, VMUL(TC, TD));
544 T1F = VFNMS(TC, TB, VMUL(TA, TD));
545 }
546 TF = VADD(Tz, TE);
547 T2t = VADD(T1E, T1F);
548 T1D = VSUB(Tz, TE);
549 T1G = VSUB(T1E, T1F);
550 T1H = VADD(T1D, T1G);
551 T2d = VSUB(T1D, T1G);
552 }
553 {
554 V T19, T20, T1p, T1X, T1e, T21, T1k, T1W;
555 {
556 V T16, T18, T15, T17;
557 T16 = LD(&(ri[WS(rs, 15)]), ms, &(ri[WS(rs, 1)]));
558 T18 = LD(&(ii[WS(rs, 15)]), ms, &(ii[WS(rs, 1)]));
559 T15 = LDW(&(W[TWVL * 28]));
560 T17 = LDW(&(W[TWVL * 29]));
561 T19 = VFMA(T15, T16, VMUL(T17, T18));
562 T20 = VFNMS(T17, T16, VMUL(T15, T18));
563 }
564 {
565 V T1m, T1o, T1l, T1n;
566 T1m = LD(&(ri[WS(rs, 11)]), ms, &(ri[WS(rs, 1)]));
567 T1o = LD(&(ii[WS(rs, 11)]), ms, &(ii[WS(rs, 1)]));
568 T1l = LDW(&(W[TWVL * 20]));
569 T1n = LDW(&(W[TWVL * 21]));
570 T1p = VFMA(T1l, T1m, VMUL(T1n, T1o));
571 T1X = VFNMS(T1n, T1m, VMUL(T1l, T1o));
572 }
573 {
574 V T1b, T1d, T1a, T1c;
575 T1b = LD(&(ri[WS(rs, 7)]), ms, &(ri[WS(rs, 1)]));
576 T1d = LD(&(ii[WS(rs, 7)]), ms, &(ii[WS(rs, 1)]));
577 T1a = LDW(&(W[TWVL * 12]));
578 T1c = LDW(&(W[TWVL * 13]));
579 T1e = VFMA(T1a, T1b, VMUL(T1c, T1d));
580 T21 = VFNMS(T1c, T1b, VMUL(T1a, T1d));
581 }
582 {
583 V T1h, T1j, T1g, T1i;
584 T1h = LD(&(ri[WS(rs, 3)]), ms, &(ri[WS(rs, 1)]));
585 T1j = LD(&(ii[WS(rs, 3)]), ms, &(ii[WS(rs, 1)]));
586 T1g = LDW(&(W[TWVL * 4]));
587 T1i = LDW(&(W[TWVL * 5]));
588 T1k = VFMA(T1g, T1h, VMUL(T1i, T1j));
589 T1W = VFNMS(T1i, T1h, VMUL(T1g, T1j));
590 }
591 T1f = VADD(T19, T1e);
592 T1q = VADD(T1k, T1p);
593 T2B = VSUB(T1f, T1q);
594 T2C = VADD(T20, T21);
595 T2D = VADD(T1W, T1X);
596 T2E = VSUB(T2C, T2D);
597 {
598 V T1V, T1Y, T22, T23;
599 T1V = VSUB(T19, T1e);
600 T1Y = VSUB(T1W, T1X);
601 T1Z = VSUB(T1V, T1Y);
602 T2j = VADD(T1V, T1Y);
603 T22 = VSUB(T20, T21);
604 T23 = VSUB(T1k, T1p);
605 T24 = VADD(T22, T23);
606 T2k = VSUB(T22, T23);
607 }
608 }
609 {
610 V TM, T1K, T12, T1R, TR, T1L, TX, T1Q;
611 {
612 V TJ, TL, TI, TK;
613 TJ = LD(&(ri[WS(rs, 1)]), ms, &(ri[WS(rs, 1)]));
614 TL = LD(&(ii[WS(rs, 1)]), ms, &(ii[WS(rs, 1)]));
615 TI = LDW(&(W[0]));
616 TK = LDW(&(W[TWVL * 1]));
617 TM = VFMA(TI, TJ, VMUL(TK, TL));
618 T1K = VFNMS(TK, TJ, VMUL(TI, TL));
619 }
620 {
621 V TZ, T11, TY, T10;
622 TZ = LD(&(ri[WS(rs, 13)]), ms, &(ri[WS(rs, 1)]));
623 T11 = LD(&(ii[WS(rs, 13)]), ms, &(ii[WS(rs, 1)]));
624 TY = LDW(&(W[TWVL * 24]));
625 T10 = LDW(&(W[TWVL * 25]));
626 T12 = VFMA(TY, TZ, VMUL(T10, T11));
627 T1R = VFNMS(T10, TZ, VMUL(TY, T11));
628 }
629 {
630 V TO, TQ, TN, TP;
631 TO = LD(&(ri[WS(rs, 9)]), ms, &(ri[WS(rs, 1)]));
632 TQ = LD(&(ii[WS(rs, 9)]), ms, &(ii[WS(rs, 1)]));
633 TN = LDW(&(W[TWVL * 16]));
634 TP = LDW(&(W[TWVL * 17]));
635 TR = VFMA(TN, TO, VMUL(TP, TQ));
636 T1L = VFNMS(TP, TO, VMUL(TN, TQ));
637 }
638 {
639 V TU, TW, TT, TV;
640 TU = LD(&(ri[WS(rs, 5)]), ms, &(ri[WS(rs, 1)]));
641 TW = LD(&(ii[WS(rs, 5)]), ms, &(ii[WS(rs, 1)]));
642 TT = LDW(&(W[TWVL * 8]));
643 TV = LDW(&(W[TWVL * 9]));
644 TX = VFMA(TT, TU, VMUL(TV, TW));
645 T1Q = VFNMS(TV, TU, VMUL(TT, TW));
646 }
647 TS = VADD(TM, TR);
648 T13 = VADD(TX, T12);
649 T2w = VSUB(TS, T13);
650 T2x = VADD(T1K, T1L);
651 T2y = VADD(T1Q, T1R);
652 T2z = VSUB(T2x, T2y);
653 {
654 V T1M, T1N, T1P, T1S;
655 T1M = VSUB(T1K, T1L);
656 T1N = VSUB(TX, T12);
657 T1O = VADD(T1M, T1N);
658 T2g = VSUB(T1M, T1N);
659 T1P = VSUB(TM, TR);
660 T1S = VSUB(T1Q, T1R);
661 T1T = VSUB(T1P, T1S);
662 T2h = VADD(T1P, T1S);
663 }
664 }
665 {
666 V T1J, T27, T3g, T3i, T26, T3h, T2a, T3d;
667 {
668 V T1x, T1I, T3e, T3f;
669 T1x = VSUB(T1t, T1w);
670 T1I = VMUL(LDK(KP707106781), VSUB(T1C, T1H));
671 T1J = VADD(T1x, T1I);
672 T27 = VSUB(T1x, T1I);
673 T3e = VMUL(LDK(KP707106781), VSUB(T2d, T2c));
674 T3f = VADD(T38, T37);
675 T3g = VADD(T3e, T3f);
676 T3i = VSUB(T3f, T3e);
677 }
678 {
679 V T1U, T25, T28, T29;
680 T1U = VFMA(LDK(KP923879532), T1O, VMUL(LDK(KP382683432), T1T));
681 T25 = VFNMS(LDK(KP923879532), T24, VMUL(LDK(KP382683432), T1Z));
682 T26 = VADD(T1U, T25);
683 T3h = VSUB(T25, T1U);
684 T28 = VFNMS(LDK(KP923879532), T1T, VMUL(LDK(KP382683432), T1O));
685 T29 = VFMA(LDK(KP382683432), T24, VMUL(LDK(KP923879532), T1Z));
686 T2a = VSUB(T28, T29);
687 T3d = VADD(T28, T29);
688 }
689 ST(&(ri[WS(rs, 11)]), VSUB(T1J, T26), ms, &(ri[WS(rs, 1)]));
690 ST(&(ii[WS(rs, 11)]), VSUB(T3g, T3d), ms, &(ii[WS(rs, 1)]));
691 ST(&(ri[WS(rs, 3)]), VADD(T1J, T26), ms, &(ri[WS(rs, 1)]));
692 ST(&(ii[WS(rs, 3)]), VADD(T3d, T3g), ms, &(ii[WS(rs, 1)]));
693 ST(&(ri[WS(rs, 15)]), VSUB(T27, T2a), ms, &(ri[WS(rs, 1)]));
694 ST(&(ii[WS(rs, 15)]), VSUB(T3i, T3h), ms, &(ii[WS(rs, 1)]));
695 ST(&(ri[WS(rs, 7)]), VADD(T27, T2a), ms, &(ri[WS(rs, 1)]));
696 ST(&(ii[WS(rs, 7)]), VADD(T3h, T3i), ms, &(ii[WS(rs, 1)]));
697 }
698 {
699 V T2v, T2H, T32, T34, T2G, T33, T2K, T2Z;
700 {
701 V T2r, T2u, T30, T31;
702 T2r = VSUB(T7, Ti);
703 T2u = VSUB(T2s, T2t);
704 T2v = VADD(T2r, T2u);
705 T2H = VSUB(T2r, T2u);
706 T30 = VSUB(TF, Tu);
707 T31 = VSUB(T2U, T2R);
708 T32 = VADD(T30, T31);
709 T34 = VSUB(T31, T30);
710 }
711 {
712 V T2A, T2F, T2I, T2J;
713 T2A = VADD(T2w, T2z);
714 T2F = VSUB(T2B, T2E);
715 T2G = VMUL(LDK(KP707106781), VADD(T2A, T2F));
716 T33 = VMUL(LDK(KP707106781), VSUB(T2F, T2A));
717 T2I = VSUB(T2z, T2w);
718 T2J = VADD(T2B, T2E);
719 T2K = VMUL(LDK(KP707106781), VSUB(T2I, T2J));
720 T2Z = VMUL(LDK(KP707106781), VADD(T2I, T2J));
721 }
722 ST(&(ri[WS(rs, 10)]), VSUB(T2v, T2G), ms, &(ri[0]));
723 ST(&(ii[WS(rs, 10)]), VSUB(T32, T2Z), ms, &(ii[0]));
724 ST(&(ri[WS(rs, 2)]), VADD(T2v, T2G), ms, &(ri[0]));
725 ST(&(ii[WS(rs, 2)]), VADD(T2Z, T32), ms, &(ii[0]));
726 ST(&(ri[WS(rs, 14)]), VSUB(T2H, T2K), ms, &(ri[0]));
727 ST(&(ii[WS(rs, 14)]), VSUB(T34, T33), ms, &(ii[0]));
728 ST(&(ri[WS(rs, 6)]), VADD(T2H, T2K), ms, &(ri[0]));
729 ST(&(ii[WS(rs, 6)]), VADD(T33, T34), ms, &(ii[0]));
730 }
731 {
732 V T2f, T2n, T3a, T3c, T2m, T3b, T2q, T35;
733 {
734 V T2b, T2e, T36, T39;
735 T2b = VADD(T1t, T1w);
736 T2e = VMUL(LDK(KP707106781), VADD(T2c, T2d));
737 T2f = VADD(T2b, T2e);
738 T2n = VSUB(T2b, T2e);
739 T36 = VMUL(LDK(KP707106781), VADD(T1C, T1H));
740 T39 = VSUB(T37, T38);
741 T3a = VADD(T36, T39);
742 T3c = VSUB(T39, T36);
743 }
744 {
745 V T2i, T2l, T2o, T2p;
746 T2i = VFMA(LDK(KP382683432), T2g, VMUL(LDK(KP923879532), T2h));
747 T2l = VFNMS(LDK(KP382683432), T2k, VMUL(LDK(KP923879532), T2j));
748 T2m = VADD(T2i, T2l);
749 T3b = VSUB(T2l, T2i);
750 T2o = VFNMS(LDK(KP382683432), T2h, VMUL(LDK(KP923879532), T2g));
751 T2p = VFMA(LDK(KP923879532), T2k, VMUL(LDK(KP382683432), T2j));
752 T2q = VSUB(T2o, T2p);
753 T35 = VADD(T2o, T2p);
754 }
755 ST(&(ri[WS(rs, 9)]), VSUB(T2f, T2m), ms, &(ri[WS(rs, 1)]));
756 ST(&(ii[WS(rs, 9)]), VSUB(T3a, T35), ms, &(ii[WS(rs, 1)]));
757 ST(&(ri[WS(rs, 1)]), VADD(T2f, T2m), ms, &(ri[WS(rs, 1)]));
758 ST(&(ii[WS(rs, 1)]), VADD(T35, T3a), ms, &(ii[WS(rs, 1)]));
759 ST(&(ri[WS(rs, 13)]), VSUB(T2n, T2q), ms, &(ri[WS(rs, 1)]));
760 ST(&(ii[WS(rs, 13)]), VSUB(T3c, T3b), ms, &(ii[WS(rs, 1)]));
761 ST(&(ri[WS(rs, 5)]), VADD(T2n, T2q), ms, &(ri[WS(rs, 1)]));
762 ST(&(ii[WS(rs, 5)]), VADD(T3b, T3c), ms, &(ii[WS(rs, 1)]));
763 }
764 {
765 V TH, T2L, T2W, T2Y, T1s, T2X, T2O, T2P;
766 {
767 V Tj, TG, T2Q, T2V;
768 Tj = VADD(T7, Ti);
769 TG = VADD(Tu, TF);
770 TH = VADD(Tj, TG);
771 T2L = VSUB(Tj, TG);
772 T2Q = VADD(T2s, T2t);
773 T2V = VADD(T2R, T2U);
774 T2W = VADD(T2Q, T2V);
775 T2Y = VSUB(T2V, T2Q);
776 }
777 {
778 V T14, T1r, T2M, T2N;
779 T14 = VADD(TS, T13);
780 T1r = VADD(T1f, T1q);
781 T1s = VADD(T14, T1r);
782 T2X = VSUB(T1r, T14);
783 T2M = VADD(T2x, T2y);
784 T2N = VADD(T2C, T2D);
785 T2O = VSUB(T2M, T2N);
786 T2P = VADD(T2M, T2N);
787 }
788 ST(&(ri[WS(rs, 8)]), VSUB(TH, T1s), ms, &(ri[0]));
789 ST(&(ii[WS(rs, 8)]), VSUB(T2W, T2P), ms, &(ii[0]));
790 ST(&(ri[0]), VADD(TH, T1s), ms, &(ri[0]));
791 ST(&(ii[0]), VADD(T2P, T2W), ms, &(ii[0]));
792 ST(&(ri[WS(rs, 12)]), VSUB(T2L, T2O), ms, &(ri[0]));
793 ST(&(ii[WS(rs, 12)]), VSUB(T2Y, T2X), ms, &(ii[0]));
794 ST(&(ri[WS(rs, 4)]), VADD(T2L, T2O), ms, &(ri[0]));
795 ST(&(ii[WS(rs, 4)]), VADD(T2X, T2Y), ms, &(ii[0]));
796 }
797 }
798 }
799 VLEAVE();
800 }
801
802 static const tw_instr twinstr[] = {
803 VTW(0, 1),
804 VTW(0, 2),
805 VTW(0, 3),
806 VTW(0, 4),
807 VTW(0, 5),
808 VTW(0, 6),
809 VTW(0, 7),
810 VTW(0, 8),
811 VTW(0, 9),
812 VTW(0, 10),
813 VTW(0, 11),
814 VTW(0, 12),
815 VTW(0, 13),
816 VTW(0, 14),
817 VTW(0, 15),
818 {TW_NEXT, (2 * VL), 0}
819 };
820
821 static const ct_desc desc = { 16, XSIMD_STRING("t1sv_16"), twinstr, &GENUS, {136, 46, 38, 0}, 0, 0, 0 };
822
823 void XSIMD(codelet_t1sv_16) (planner *p) {
824 X(kdft_dit_register) (p, t1sv_16, &desc);
825 }
826 #endif
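
t1sv_16 is a SIMD decimation-in-time "twiddle" codelet: inside a larger Cooley-Tukey factorization it performs one radix-16 step, combining the butterflies above with the twiddle factors supplied in W, and it reaches the planner only through the X(kdft_dit_register) call in XSIMD(codelet_t1sv_16). User code never invokes it directly. The sketch below assumes only the public fftw3.h interface and shows the normal route to such codelets; whether the planner actually picks t1sv_16 for a given size, machine and planning mode is entirely its own decision, so the composite size 256 = 16 * 16 is merely one for which a radix-16 twiddle step is plausible.

#include <stdio.h>
#include <fftw3.h>

int main(void)
{
    const int n = 256;               /* composite size; 256 = 16 * 16 */
    fftw_complex *in  = fftw_malloc(sizeof(fftw_complex) * n);
    fftw_complex *out = fftw_malloc(sizeof(fftw_complex) * n);

    /* Plan first (FFTW_ESTIMATE leaves the arrays untouched), then fill
       in the input. */
    fftw_plan p = fftw_plan_dft_1d(n, in, out, FFTW_FORWARD, FFTW_ESTIMATE);

    /* Unit impulse: its DFT is 1 + 0i in every bin, an easy check. */
    for (int i = 0; i < n; i++) { in[i][0] = 0.0; in[i][1] = 0.0; }
    in[0][0] = 1.0;

    fftw_execute(p);
    printf("bin 0 = %g%+gi, bin 1 = %g%+gi\n",
           out[0][0], out[0][1], out[1][0], out[1][1]);

    fftw_destroy_plan(p);
    fftw_free(in);
    fftw_free(out);
    return 0;
}

Build with something like cc example.c -lfftw3 -lm; SIMD codelets such as this one are candidates only when the library has been configured with the corresponding instruction-set support.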