comparison src/fftw-3.3.8/dft/simd/common/t1fv_16.c @ 167:bd3cc4d1df30

Add FFTW 3.3.8 source, and a Linux build
author Chris Cannam <cannam@all-day-breakfast.com>
date Tue, 19 Nov 2019 14:52:55 +0000
parents
children
comparison
equal deleted inserted replaced
166:cbd6d7e562c7 167:bd3cc4d1df30
1 /*
2 * Copyright (c) 2003, 2007-14 Matteo Frigo
3 * Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
4 *
5 * This program is free software; you can redistribute it and/or modify
6 * it under the terms of the GNU General Public License as published by
7 * the Free Software Foundation; either version 2 of the License, or
8 * (at your option) any later version.
9 *
10 * This program is distributed in the hope that it will be useful,
11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 * GNU General Public License for more details.
14 *
15 * You should have received a copy of the GNU General Public License
16 * along with this program; if not, write to the Free Software
17 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
18 *
19 */
20
21 /* This file was automatically generated --- DO NOT EDIT */
22 /* Generated on Thu May 24 08:05:28 EDT 2018 */
23
24 #include "dft/codelet-dft.h"
25
26 #if defined(ARCH_PREFERS_FMA) || defined(ISA_EXTENSION_PREFERS_FMA)
27
28 /* Generated by: ../../../genfft/gen_twiddle_c.native -fma -simd -compact -variables 4 -pipeline-latency 8 -n 16 -name t1fv_16 -include dft/simd/t1f.h */
29
30 /*
31 * This function contains 87 FP additions, 64 FP multiplications,
32 * (or, 53 additions, 30 multiplications, 34 fused multiply/add),
33 * 36 stack variables, 3 constants, and 32 memory accesses
34 */
35 #include "dft/simd/t1f.h"
36
/*
 * t1fv_16 (variant compiled when ARCH_PREFERS_FMA or
 * ISA_EXTENSION_PREFERS_FMA is defined): size-16 twiddle codelet that
 * works in place on the buffer addressed through ri.
 *
 * Each loop iteration loads the 16 elements x[WS(rs, k)], k = 0..15,
 * passes every element except k = 0 through BYTWJ together with
 * W[TWVL * 2 * (k - 1)], combines them, and stores 16 results back to
 * the same 16 locations.
 *
 * Parameters, as used by the code below:
 *   ri  - base pointer for all loads and stores (copied into x).
 *   ii  - not referenced anywhere in this body.
 *   W   - twiddle table; offset by mb * ((TWVL / VL) * 30) before the
 *         first iteration and advanced by TWVL * 30 per iteration.
 *   rs  - stride handed to WS() to address the 16 elements.
 *   mb  - initial value of the loop counter m.
 *   me  - loop bound; the loop runs while m < me, stepping m by VL.
 *   ms  - x advances by VL * ms per iteration; also forwarded to LD/ST.
 *
 * NOTE(review): LD, ST, LDK, WS, BYTWJ, VADD, VSUB, VFMA, VFNMS, VFMAI,
 * VFNMSI, MAKE_VOLATILE_STRIDE and VLEAVE come from headers not shown
 * here. Presumably VFMA(a, b, c) = a * b + c, VFNMS(a, b, c) = c - a * b,
 * and VFMAI / VFNMSI add / subtract i times their first argument to the
 * second -- confirm against the SIMD support headers.
 */
37 static void t1fv_16(R *ri, R *ii, const R *W, stride rs, INT mb, INT me, INT ms)
38 {
/* Numerically cos(pi/8). */
39 DVK(KP923879532, +0.923879532511286756128183189396788286822416626);
/* Numerically sqrt(2)/2. */
40 DVK(KP707106781, +0.707106781186547524400844362104849039284835938);
/* Numerically sqrt(2) - 1, which equals tan(pi/8). */
41 DVK(KP414213562, +0.414213562373095048801688724209698078569671875);
42 {
43 INT m;
44 R *x;
45 x = ri;
46 for (m = mb, W = W + (mb * ((TWVL / VL) * 30)); m < me; m = m + VL, x = x + (VL * ms), W = W + (TWVL * 30), MAKE_VOLATILE_STRIDE(16, rs)) {
47 V T4, TW, T9, T19, TD, TI, TZ, T1a, Tf, Tk, Tl, T13, T1c, Tq, Tv;
48 V Tw, T16, T1d, T1, T3, T2;
/* Elements 0 and 8 (element 0 is used without a BYTWJ step): T4 = sum, TW = difference. */
49 T1 = LD(&(x[0]), ms, &(x[0]));
50 T2 = LD(&(x[WS(rs, 8)]), ms, &(x[0]));
51 T3 = BYTWJ(&(W[TWVL * 14]), T2);
52 T4 = VADD(T1, T3);
53 TW = VSUB(T1, T3);
54 {
/* Elements 4 and 12: T9 = sum, T19 = difference. */
55 V T6, T8, T5, T7;
56 T5 = LD(&(x[WS(rs, 4)]), ms, &(x[0]));
57 T6 = BYTWJ(&(W[TWVL * 6]), T5);
58 T7 = LD(&(x[WS(rs, 12)]), ms, &(x[0]));
59 T8 = BYTWJ(&(W[TWVL * 22]), T7);
60 T9 = VADD(T6, T8);
61 T19 = VSUB(T6, T8);
62 }
63 {
/* Elements 14, 10, 6, 2: pair sums TD (14 + 6) and TI (2 + 10); the pair differences feed TZ and T1a. */
64 V TA, TH, TC, TF, TX, TY;
65 {
66 V Tz, TG, TB, TE;
67 Tz = LD(&(x[WS(rs, 14)]), ms, &(x[0]));
68 TA = BYTWJ(&(W[TWVL * 26]), Tz);
69 TG = LD(&(x[WS(rs, 10)]), ms, &(x[0]));
70 TH = BYTWJ(&(W[TWVL * 18]), TG);
71 TB = LD(&(x[WS(rs, 6)]), ms, &(x[0]));
72 TC = BYTWJ(&(W[TWVL * 10]), TB);
73 TE = LD(&(x[WS(rs, 2)]), ms, &(x[0]));
74 TF = BYTWJ(&(W[TWVL * 2]), TE);
75 }
76 TD = VADD(TA, TC);
77 TI = VADD(TF, TH);
78 TX = VSUB(TF, TH);
79 TY = VSUB(TA, TC);
80 TZ = VADD(TX, TY);
81 T1a = VSUB(TY, TX);
82 }
83 {
/* Elements 1, 13, 9, 5: pair sums Tf (1 + 9) and Tk (5 + 13), Tl = Tf - Tk; the pair differences are mixed with KP414213562 into T13 and T1c. */
84 V Tc, Tj, Te, Th, T11, T12;
85 {
86 V Tb, Ti, Td, Tg;
87 Tb = LD(&(x[WS(rs, 1)]), ms, &(x[WS(rs, 1)]));
88 Tc = BYTWJ(&(W[0]), Tb);
89 Ti = LD(&(x[WS(rs, 13)]), ms, &(x[WS(rs, 1)]));
90 Tj = BYTWJ(&(W[TWVL * 24]), Ti);
91 Td = LD(&(x[WS(rs, 9)]), ms, &(x[WS(rs, 1)]));
92 Te = BYTWJ(&(W[TWVL * 16]), Td);
93 Tg = LD(&(x[WS(rs, 5)]), ms, &(x[WS(rs, 1)]));
94 Th = BYTWJ(&(W[TWVL * 8]), Tg);
95 }
96 Tf = VADD(Tc, Te);
97 Tk = VADD(Th, Tj);
98 Tl = VSUB(Tf, Tk);
99 T11 = VSUB(Tc, Te);
100 T12 = VSUB(Th, Tj);
101 T13 = VFNMS(LDK(KP414213562), T12, T11);
102 T1c = VFMA(LDK(KP414213562), T11, T12);
103 }
104 {
/* Elements 15, 11, 7, 3: pair sums Tq (15 + 7) and Tv (3 + 11), Tw = Tq - Tv; the pair differences are mixed with KP414213562 into T16 and T1d. */
105 V Tn, Tu, Tp, Ts, T14, T15;
106 {
107 V Tm, Tt, To, Tr;
108 Tm = LD(&(x[WS(rs, 15)]), ms, &(x[WS(rs, 1)]));
109 Tn = BYTWJ(&(W[TWVL * 28]), Tm);
110 Tt = LD(&(x[WS(rs, 11)]), ms, &(x[WS(rs, 1)]));
111 Tu = BYTWJ(&(W[TWVL * 20]), Tt);
112 To = LD(&(x[WS(rs, 7)]), ms, &(x[WS(rs, 1)]));
113 Tp = BYTWJ(&(W[TWVL * 12]), To);
114 Tr = LD(&(x[WS(rs, 3)]), ms, &(x[WS(rs, 1)]));
115 Ts = BYTWJ(&(W[TWVL * 4]), Tr);
116 }
117 Tq = VADD(Tn, Tp);
118 Tv = VADD(Ts, Tu);
119 Tw = VSUB(Tq, Tv);
120 T14 = VSUB(Tn, Tp);
121 T15 = VSUB(Tu, Ts);
122 T16 = VFNMS(LDK(KP414213562), T15, T14);
123 T1d = VFMA(LDK(KP414213562), T14, T15);
124 }
125 {
/* Results written to elements 6, 2, 10 and 14. */
126 V Ty, TM, TL, TN;
127 {
128 V Ta, Tx, TJ, TK;
129 Ta = VSUB(T4, T9);
130 Tx = VADD(Tl, Tw);
131 Ty = VFNMS(LDK(KP707106781), Tx, Ta);
132 TM = VFMA(LDK(KP707106781), Tx, Ta);
133 TJ = VSUB(TD, TI);
134 TK = VSUB(Tw, Tl);
135 TL = VFNMS(LDK(KP707106781), TK, TJ);
136 TN = VFMA(LDK(KP707106781), TK, TJ);
137 }
138 ST(&(x[WS(rs, 6)]), VFNMSI(TL, Ty), ms, &(x[0]));
139 ST(&(x[WS(rs, 2)]), VFMAI(TN, TM), ms, &(x[0]));
140 ST(&(x[WS(rs, 10)]), VFMAI(TL, Ty), ms, &(x[0]));
141 ST(&(x[WS(rs, 14)]), VFNMSI(TN, TM), ms, &(x[0]));
142 }
143 {
/* Results written to elements 5, 13, 11 and 3. */
144 V T1k, T1o, T1n, T1p;
145 {
146 V T1i, T1j, T1l, T1m;
147 T1i = VFNMS(LDK(KP707106781), TZ, TW);
148 T1j = VADD(T1c, T1d);
149 T1k = VFNMS(LDK(KP923879532), T1j, T1i);
150 T1o = VFMA(LDK(KP923879532), T1j, T1i);
151 T1l = VFMA(LDK(KP707106781), T1a, T19);
152 T1m = VSUB(T16, T13);
153 T1n = VFNMS(LDK(KP923879532), T1m, T1l);
154 T1p = VFMA(LDK(KP923879532), T1m, T1l);
155 }
156 ST(&(x[WS(rs, 5)]), VFNMSI(T1n, T1k), ms, &(x[WS(rs, 1)]));
157 ST(&(x[WS(rs, 13)]), VFNMSI(T1p, T1o), ms, &(x[WS(rs, 1)]));
158 ST(&(x[WS(rs, 11)]), VFMAI(T1n, T1k), ms, &(x[WS(rs, 1)]));
159 ST(&(x[WS(rs, 3)]), VFMAI(T1p, T1o), ms, &(x[WS(rs, 1)]));
160 }
161 {
/* Results written to elements 8, 4, 0 and 12; element 0 receives TQ + TT, the sum of all eight pair sums. */
162 V TQ, TU, TT, TV;
163 {
164 V TO, TP, TR, TS;
165 TO = VADD(T4, T9);
166 TP = VADD(TI, TD);
167 TQ = VADD(TO, TP);
168 TU = VSUB(TO, TP);
169 TR = VADD(Tf, Tk);
170 TS = VADD(Tq, Tv);
171 TT = VADD(TR, TS);
172 TV = VSUB(TS, TR);
173 }
174 ST(&(x[WS(rs, 8)]), VSUB(TQ, TT), ms, &(x[0]));
175 ST(&(x[WS(rs, 4)]), VFMAI(TV, TU), ms, &(x[0]));
176 ST(&(x[0]), VADD(TQ, TT), ms, &(x[0]));
177 ST(&(x[WS(rs, 12)]), VFNMSI(TV, TU), ms, &(x[0]));
178 }
179 {
/* Results written to elements 9, 15, 7 and 1. */
180 V T18, T1g, T1f, T1h;
181 {
182 V T10, T17, T1b, T1e;
183 T10 = VFMA(LDK(KP707106781), TZ, TW);
184 T17 = VADD(T13, T16);
185 T18 = VFNMS(LDK(KP923879532), T17, T10);
186 T1g = VFMA(LDK(KP923879532), T17, T10);
187 T1b = VFNMS(LDK(KP707106781), T1a, T19);
188 T1e = VSUB(T1c, T1d);
189 T1f = VFNMS(LDK(KP923879532), T1e, T1b);
190 T1h = VFMA(LDK(KP923879532), T1e, T1b);
191 }
192 ST(&(x[WS(rs, 9)]), VFNMSI(T1f, T18), ms, &(x[WS(rs, 1)]));
193 ST(&(x[WS(rs, 15)]), VFMAI(T1h, T1g), ms, &(x[WS(rs, 1)]));
194 ST(&(x[WS(rs, 7)]), VFMAI(T1f, T18), ms, &(x[WS(rs, 1)]));
195 ST(&(x[WS(rs, 1)]), VFNMSI(T1h, T1g), ms, &(x[WS(rs, 1)]));
196 }
197 }
198 }
/* NOTE(review): VLEAVE is defined elsewhere; presumably per-call SIMD cleanup -- confirm. */
199 VLEAVE();
200 }
201
/*
 * Twiddle instruction table referenced by desc: one VTW(0, k) entry for
 * each k = 1..15, closed by a {TW_NEXT, VL, 0} record.
 * NOTE(review): VTW and TW_NEXT are defined elsewhere; presumably entry k
 * describes the twiddle applied to input element k -- confirm.
 */
202 static const tw_instr twinstr[] = {
203 VTW(0, 1),
204 VTW(0, 2),
205 VTW(0, 3),
206 VTW(0, 4),
207 VTW(0, 5),
208 VTW(0, 6),
209 VTW(0, 7),
210 VTW(0, 8),
211 VTW(0, 9),
212 VTW(0, 10),
213 VTW(0, 11),
214 VTW(0, 12),
215 VTW(0, 13),
216 VTW(0, 14),
217 VTW(0, 15),
218 {TW_NEXT, VL, 0}
219 };
220
/*
 * Codelet descriptor handed to the registration call: the value 16, the
 * name string "t1fv_16", the twinstr table, &GENUS, and the counts
 * {53, 30, 34, 0}. The first three counts equal the additions,
 * multiplications and fused multiply/add figures quoted in the generator
 * comment for this variant. The trailing three fields are zero.
 */
221 static const ct_desc desc = { 16, XSIMD_STRING("t1fv_16"), twinstr, &GENUS, {53, 30, 34, 0}, 0, 0, 0 };
222
/*
 * Registration entry point: passes the planner p, the t1fv_16 function and
 * its descriptor to X(kdft_dit_register). The other definitions shown in
 * this branch of the #if are static.
 */
223 void XSIMD(codelet_t1fv_16) (planner *p) {
224 X(kdft_dit_register) (p, t1fv_16, &desc);
225 }
226 #else
227
228 /* Generated by: ../../../genfft/gen_twiddle_c.native -simd -compact -variables 4 -pipeline-latency 8 -n 16 -name t1fv_16 -include dft/simd/t1f.h */
229
230 /*
231 * This function contains 87 FP additions, 42 FP multiplications,
232 * (or, 83 additions, 38 multiplications, 4 fused multiply/add),
233 * 36 stack variables, 3 constants, and 32 memory accesses
234 */
235 #include "dft/simd/t1f.h"
236
/*
 * t1fv_16 (variant compiled when neither ARCH_PREFERS_FMA nor
 * ISA_EXTENSION_PREFERS_FMA is defined): size-16 twiddle codelet that
 * works in place on the buffer addressed through ri.
 *
 * Each loop iteration loads the 16 elements x[WS(rs, k)], k = 0..15,
 * passes every element except k = 0 through BYTWJ together with
 * W[TWVL * 2 * (k - 1)], combines them, and stores 16 results back to
 * the same 16 locations.
 *
 * Parameters, as used by the code below:
 *   ri  - base pointer for all loads and stores (copied into x).
 *   ii  - not referenced anywhere in this body.
 *   W   - twiddle table; offset by mb * ((TWVL / VL) * 30) before the
 *         first iteration and advanced by TWVL * 30 per iteration.
 *   rs  - stride handed to WS() to address the 16 elements.
 *   mb  - initial value of the loop counter m.
 *   me  - loop bound; the loop runs while m < me, stepping m by VL.
 *   ms  - x advances by VL * ms per iteration; also forwarded to LD/ST.
 *
 * NOTE(review): LD, ST, LDK, WS, BYTWJ, VADD, VSUB, VMUL, VFMA, VFNMS,
 * VBYI, MAKE_VOLATILE_STRIDE and VLEAVE come from headers not shown
 * here. Presumably VFMA(a, b, c) = a * b + c, VFNMS(a, b, c) = c - a * b,
 * and VBYI multiplies by i -- confirm against the SIMD support headers.
 */
237 static void t1fv_16(R *ri, R *ii, const R *W, stride rs, INT mb, INT me, INT ms)
238 {
/* Numerically cos(pi/8). */
239 DVK(KP923879532, +0.923879532511286756128183189396788286822416626);
/* Numerically sin(pi/8). */
240 DVK(KP382683432, +0.382683432365089771728459984030398866761344562);
/* Numerically sqrt(2)/2. */
241 DVK(KP707106781, +0.707106781186547524400844362104849039284835938);
242 {
243 INT m;
244 R *x;
245 x = ri;
246 for (m = mb, W = W + (mb * ((TWVL / VL) * 30)); m < me; m = m + VL, x = x + (VL * ms), W = W + (TWVL * 30), MAKE_VOLATILE_STRIDE(16, rs)) {
247 V TJ, T10, TD, T11, T1b, T1c, Ty, TK, T16, T17, T18, Tb, TN, T13, T14;
248 V T15, Tm, TM, TG, TI, TH;
/* Elements 0 and 8 (element 0 is used without a BYTWJ step): TJ = difference, T10 = sum. */
249 TG = LD(&(x[0]), ms, &(x[0]));
250 TH = LD(&(x[WS(rs, 8)]), ms, &(x[0]));
251 TI = BYTWJ(&(W[TWVL * 14]), TH);
252 TJ = VSUB(TG, TI);
253 T10 = VADD(TG, TI);
254 {
/* Elements 4 and 12: TD = difference, T11 = sum. */
255 V TA, TC, Tz, TB;
256 Tz = LD(&(x[WS(rs, 4)]), ms, &(x[0]));
257 TA = BYTWJ(&(W[TWVL * 6]), Tz);
258 TB = LD(&(x[WS(rs, 12)]), ms, &(x[0]));
259 TC = BYTWJ(&(W[TWVL * 22]), TB);
260 TD = VSUB(TA, TC);
261 T11 = VADD(TA, TC);
262 }
263 {
/* Elements 14, 10, 6, 2: pair sums T1b (14 + 6) and T1c (2 + 10); the pair differences are combined and scaled by KP707106781 into Ty and TK. */
264 V Tp, Tw, Tr, Tu, Ts, Tx;
265 {
266 V To, Tv, Tq, Tt;
267 To = LD(&(x[WS(rs, 14)]), ms, &(x[0]));
268 Tp = BYTWJ(&(W[TWVL * 26]), To);
269 Tv = LD(&(x[WS(rs, 10)]), ms, &(x[0]));
270 Tw = BYTWJ(&(W[TWVL * 18]), Tv);
271 Tq = LD(&(x[WS(rs, 6)]), ms, &(x[0]));
272 Tr = BYTWJ(&(W[TWVL * 10]), Tq);
273 Tt = LD(&(x[WS(rs, 2)]), ms, &(x[0]));
274 Tu = BYTWJ(&(W[TWVL * 2]), Tt);
275 }
276 T1b = VADD(Tp, Tr);
277 T1c = VADD(Tu, Tw);
278 Ts = VSUB(Tp, Tr);
279 Tx = VSUB(Tu, Tw);
280 Ty = VMUL(LDK(KP707106781), VSUB(Ts, Tx));
281 TK = VMUL(LDK(KP707106781), VADD(Tx, Ts));
282 }
283 {
/* Elements 15, 11, 7, 3: pair sums T16 (15 + 7) and T17 (3 + 11), T18 = T16 - T17; the pair differences are mixed with KP923879532 / KP382683432 into Tb and TN. */
284 V T2, T9, T4, T7, T5, Ta;
285 {
286 V T1, T8, T3, T6;
287 T1 = LD(&(x[WS(rs, 15)]), ms, &(x[WS(rs, 1)]));
288 T2 = BYTWJ(&(W[TWVL * 28]), T1);
289 T8 = LD(&(x[WS(rs, 11)]), ms, &(x[WS(rs, 1)]));
290 T9 = BYTWJ(&(W[TWVL * 20]), T8);
291 T3 = LD(&(x[WS(rs, 7)]), ms, &(x[WS(rs, 1)]));
292 T4 = BYTWJ(&(W[TWVL * 12]), T3);
293 T6 = LD(&(x[WS(rs, 3)]), ms, &(x[WS(rs, 1)]));
294 T7 = BYTWJ(&(W[TWVL * 4]), T6);
295 }
296 T16 = VADD(T2, T4);
297 T17 = VADD(T7, T9);
298 T18 = VSUB(T16, T17);
299 T5 = VSUB(T2, T4);
300 Ta = VSUB(T7, T9);
301 Tb = VFNMS(LDK(KP923879532), Ta, VMUL(LDK(KP382683432), T5));
302 TN = VFMA(LDK(KP923879532), T5, VMUL(LDK(KP382683432), Ta));
303 }
304 {
/* Elements 1, 13, 9, 5: pair sums T13 (1 + 9) and T14 (5 + 13), T15 = T13 - T14; the pair differences are mixed with KP923879532 / KP382683432 into Tm and TM. */
305 V Td, Tk, Tf, Ti, Tg, Tl;
306 {
307 V Tc, Tj, Te, Th;
308 Tc = LD(&(x[WS(rs, 1)]), ms, &(x[WS(rs, 1)]));
309 Td = BYTWJ(&(W[0]), Tc);
310 Tj = LD(&(x[WS(rs, 13)]), ms, &(x[WS(rs, 1)]));
311 Tk = BYTWJ(&(W[TWVL * 24]), Tj);
312 Te = LD(&(x[WS(rs, 9)]), ms, &(x[WS(rs, 1)]));
313 Tf = BYTWJ(&(W[TWVL * 16]), Te);
314 Th = LD(&(x[WS(rs, 5)]), ms, &(x[WS(rs, 1)]));
315 Ti = BYTWJ(&(W[TWVL * 8]), Th);
316 }
317 T13 = VADD(Td, Tf);
318 T14 = VADD(Ti, Tk);
319 T15 = VSUB(T13, T14);
320 Tg = VSUB(Td, Tf);
321 Tl = VSUB(Ti, Tk);
322 Tm = VFMA(LDK(KP382683432), Tg, VMUL(LDK(KP923879532), Tl));
323 TM = VFNMS(LDK(KP382683432), Tl, VMUL(LDK(KP923879532), Tg));
324 }
325 {
/* Results written to elements 14, 6, 2 and 10. */
326 V T1a, T1g, T1f, T1h;
327 {
328 V T12, T19, T1d, T1e;
329 T12 = VSUB(T10, T11);
330 T19 = VMUL(LDK(KP707106781), VADD(T15, T18));
331 T1a = VADD(T12, T19);
332 T1g = VSUB(T12, T19);
333 T1d = VSUB(T1b, T1c);
334 T1e = VMUL(LDK(KP707106781), VSUB(T18, T15));
335 T1f = VBYI(VADD(T1d, T1e));
336 T1h = VBYI(VSUB(T1e, T1d));
337 }
338 ST(&(x[WS(rs, 14)]), VSUB(T1a, T1f), ms, &(x[0]));
339 ST(&(x[WS(rs, 6)]), VADD(T1g, T1h), ms, &(x[0]));
340 ST(&(x[WS(rs, 2)]), VADD(T1a, T1f), ms, &(x[0]));
341 ST(&(x[WS(rs, 10)]), VSUB(T1g, T1h), ms, &(x[0]));
342 }
343 {
/* Results written to elements 8, 4, 0 and 12; element 0 receives T1k + T1n, the sum of all eight pair sums. */
344 V T1k, T1o, T1n, T1p;
345 {
346 V T1i, T1j, T1l, T1m;
347 T1i = VADD(T10, T11);
348 T1j = VADD(T1c, T1b);
349 T1k = VADD(T1i, T1j);
350 T1o = VSUB(T1i, T1j);
351 T1l = VADD(T13, T14);
352 T1m = VADD(T16, T17);
353 T1n = VADD(T1l, T1m);
354 T1p = VBYI(VSUB(T1m, T1l));
355 }
356 ST(&(x[WS(rs, 8)]), VSUB(T1k, T1n), ms, &(x[0]));
357 ST(&(x[WS(rs, 4)]), VADD(T1o, T1p), ms, &(x[0]));
358 ST(&(x[0]), VADD(T1k, T1n), ms, &(x[0]));
359 ST(&(x[WS(rs, 12)]), VSUB(T1o, T1p), ms, &(x[0]));
360 }
361 {
/* Results written to elements 7, 15, 9 and 1. */
362 V TF, TQ, TP, TR;
363 {
364 V Tn, TE, TL, TO;
365 Tn = VSUB(Tb, Tm);
366 TE = VSUB(Ty, TD);
367 TF = VBYI(VSUB(Tn, TE));
368 TQ = VBYI(VADD(TE, Tn));
369 TL = VADD(TJ, TK);
370 TO = VADD(TM, TN);
371 TP = VSUB(TL, TO);
372 TR = VADD(TL, TO);
373 }
374 ST(&(x[WS(rs, 7)]), VADD(TF, TP), ms, &(x[WS(rs, 1)]));
375 ST(&(x[WS(rs, 15)]), VSUB(TR, TQ), ms, &(x[WS(rs, 1)]));
376 ST(&(x[WS(rs, 9)]), VSUB(TP, TF), ms, &(x[WS(rs, 1)]));
377 ST(&(x[WS(rs, 1)]), VADD(TQ, TR), ms, &(x[WS(rs, 1)]));
378 }
379 {
/* Results written to elements 13, 5, 3 and 11. */
380 V TU, TY, TX, TZ;
381 {
382 V TS, TT, TV, TW;
383 TS = VSUB(TJ, TK);
384 TT = VADD(Tm, Tb);
385 TU = VADD(TS, TT);
386 TY = VSUB(TS, TT);
387 TV = VADD(TD, Ty);
388 TW = VSUB(TN, TM);
389 TX = VBYI(VADD(TV, TW));
390 TZ = VBYI(VSUB(TW, TV));
391 }
392 ST(&(x[WS(rs, 13)]), VSUB(TU, TX), ms, &(x[WS(rs, 1)]));
393 ST(&(x[WS(rs, 5)]), VADD(TY, TZ), ms, &(x[WS(rs, 1)]));
394 ST(&(x[WS(rs, 3)]), VADD(TU, TX), ms, &(x[WS(rs, 1)]));
395 ST(&(x[WS(rs, 11)]), VSUB(TY, TZ), ms, &(x[WS(rs, 1)]));
396 }
397 }
398 }
/* NOTE(review): VLEAVE is defined elsewhere; presumably per-call SIMD cleanup -- confirm. */
399 VLEAVE();
400 }
401
/*
 * Twiddle instruction table referenced by desc: one VTW(0, k) entry for
 * each k = 1..15, closed by a {TW_NEXT, VL, 0} record.
 * NOTE(review): VTW and TW_NEXT are defined elsewhere; presumably entry k
 * describes the twiddle applied to input element k -- confirm.
 */
402 static const tw_instr twinstr[] = {
403 VTW(0, 1),
404 VTW(0, 2),
405 VTW(0, 3),
406 VTW(0, 4),
407 VTW(0, 5),
408 VTW(0, 6),
409 VTW(0, 7),
410 VTW(0, 8),
411 VTW(0, 9),
412 VTW(0, 10),
413 VTW(0, 11),
414 VTW(0, 12),
415 VTW(0, 13),
416 VTW(0, 14),
417 VTW(0, 15),
418 {TW_NEXT, VL, 0}
419 };
420
/*
 * Codelet descriptor handed to the registration call: the value 16, the
 * name string "t1fv_16", the twinstr table, &GENUS, and the counts
 * {83, 38, 4, 0}. The first three counts equal the additions,
 * multiplications and fused multiply/add figures quoted in the generator
 * comment for this variant. The trailing three fields are zero.
 */
421 static const ct_desc desc = { 16, XSIMD_STRING("t1fv_16"), twinstr, &GENUS, {83, 38, 4, 0}, 0, 0, 0 };
422
/*
 * Registration entry point: passes the planner p, the t1fv_16 function and
 * its descriptor to X(kdft_dit_register). The other definitions shown in
 * this branch of the #if are static.
 */
423 void XSIMD(codelet_t1fv_16) (planner *p) {
424 X(kdft_dit_register) (p, t1fv_16, &desc);
425 }
426 #endif