comparison src/fftw-3.3.8/dft/simd/common/t1fv_15.c @ 167:bd3cc4d1df30

Add FFTW 3.3.8 source, and a Linux build
author Chris Cannam <cannam@all-day-breakfast.com>
date Tue, 19 Nov 2019 14:52:55 +0000
parents
children
comparison
equal deleted inserted replaced
166:cbd6d7e562c7 167:bd3cc4d1df30
1 /*
2 * Copyright (c) 2003, 2007-14 Matteo Frigo
3 * Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
4 *
5 * This program is free software; you can redistribute it and/or modify
6 * it under the terms of the GNU General Public License as published by
7 * the Free Software Foundation; either version 2 of the License, or
8 * (at your option) any later version.
9 *
10 * This program is distributed in the hope that it will be useful,
11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 * GNU General Public License for more details.
14 *
15 * You should have received a copy of the GNU General Public License
16 * along with this program; if not, write to the Free Software
17 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
18 *
19 */
20
21 /* This file was automatically generated --- DO NOT EDIT */
22 /* Generated on Thu May 24 08:05:28 EDT 2018 */
23
24 #include "dft/codelet-dft.h"
25
26 #if defined(ARCH_PREFERS_FMA) || defined(ISA_EXTENSION_PREFERS_FMA)
27
28 /* Generated by: ../../../genfft/gen_twiddle_c.native -fma -simd -compact -variables 4 -pipeline-latency 8 -n 15 -name t1fv_15 -include dft/simd/t1f.h */
29
30 /*
31 * This function contains 92 FP additions, 77 FP multiplications,
32 * (or, 50 additions, 35 multiplications, 42 fused multiply/add),
33 * 50 stack variables, 8 constants, and 30 memory accesses
34 */
35 #include "dft/simd/t1f.h"
36
/*
 * t1fv_15 -- variant compiled when ARCH_PREFERS_FMA or
 * ISA_EXTENSION_PREFERS_FMA is defined (see the enclosing #if).
 *
 * On every loop pass the body loads x[0] and x[WS(rs, k)] for k = 1..14,
 * passes each of the fourteen k != 0 inputs through BYTWJ together with
 * &W[TWVL * 2 * (k - 1)], combines the fifteen values using the constants
 * declared with DVK below, and stores fifteen results back to the same
 * fifteen x locations (the transform is done in place).
 *
 * Parameters:
 *   ri      base pointer; every load and store goes through x = ri.
 *   ii      not referenced by name anywhere in this body.
 *   W       read-only table consumed by BYTWJ; pre-offset by
 *           mb * ((TWVL / VL) * 28) and advanced by TWVL * 28 per pass.
 *   rs      stride handed to WS(rs, k) to address element k.
 *   mb, me  the loop runs m = mb; m < me; m += VL.
 *   ms      x advances by VL * ms per pass; also forwarded to LD and ST.
 *
 * NOTE(review): LD, ST, BYTWJ, VADD, VSUB, VMUL, VFMA, VFNMS, VFMAI,
 * VFNMSI, LDK, DVK, WS, MAKE_VOLATILE_STRIDE and VLEAVE are defined
 * outside this file.  Comparing with the non-FMA variant further down
 * suggests VFMA(a, b, c) = a*b + c, VFNMS(a, b, c) = c - a*b,
 * VFMAI(b, c) = c + i*b and VFNMSI(b, c) = c - i*b -- confirm against the
 * SIMD support headers before relying on this.
 *
 * This code is generator output; regenerate it rather than hand-editing.
 */
37 static void t1fv_15(R *ri, R *ii, const R *W, stride rs, INT mb, INT me, INT ms)
38 {
/* Constants.  KP500000000 = 1/2 and KP250000000 = 1/4 exactly; numerically
 * KP866025403 ~ sqrt(3)/2, KP559016994 ~ sqrt(5)/4,
 * KP618033988 ~ (sqrt(5)-1)/2, KP951056516 ~ sin(2*pi/5),
 * KP823639103 ~ (sqrt(3)/2)*sin(2*pi/5),
 * KP910592997 ~ (sqrt(3)/2)/sin(2*pi/5). */
39 DVK(KP910592997, +0.910592997310029334643087372129977886038870291);
40 DVK(KP823639103, +0.823639103546331925877420039278190003029660514);
41 DVK(KP866025403, +0.866025403784438646763723170752936183471402627);
42 DVK(KP559016994, +0.559016994374947424102293417182819058860154590);
43 DVK(KP618033988, +0.618033988749894848204586834365638117720309180);
44 DVK(KP951056516, +0.951056516295153572116439333379382143405698634);
45 DVK(KP250000000, +0.250000000000000000000000000000000000000000000);
46 DVK(KP500000000, +0.500000000000000000000000000000000000000000000);
47 {
48 INT m;
49 R *x;
50 x = ri;
/* Per pass: m += VL, x += VL * ms, W += TWVL * 28.  The fourteen BYTWJ
 * reads inside the body use offsets TWVL*0, TWVL*2, ..., TWVL*26. */
51 for (m = mb, W = W + (mb * ((TWVL / VL) * 28)); m < me; m = m + VL, x = x + (VL * ms), W = W + (TWVL * 28), MAKE_VOLATILE_STRIDE(15, rs)) {
/* Values that outlive the load stage and feed the two output stages. */
52 V T1b, T7, TP, T12, T15, Tf, Tn, To, T1c, T1d, T1e, TQ, TR, TS, Tw;
53 V TE, TF, TT, TU, TV;
54 {
/* Inputs 0, 5 and 10: T3/T5 are the twiddled inputs 5 and 10, T6 their
 * sum, T1b their difference (10 minus 5), T7 = input 0 + T6. */
55 V T1, T5, T3, T4, T2, T6;
56 T1 = LD(&(x[0]), ms, &(x[0]));
57 T4 = LD(&(x[WS(rs, 10)]), ms, &(x[0]));
58 T5 = BYTWJ(&(W[TWVL * 18]), T4);
59 T2 = LD(&(x[WS(rs, 5)]), ms, &(x[WS(rs, 1)]));
60 T3 = BYTWJ(&(W[TWVL * 8]), T2);
61 T1b = VSUB(T5, T3);
62 T6 = VADD(T3, T5);
63 T7 = VADD(T1, T6);
64 TP = VFNMS(LDK(KP500000000), T6, T1);
65 }
66 {
/* Remaining twelve inputs, handled as four groups of three:
 * (3; 8, 13), (6; 11, 1), (9; 14, 4), (12; 2, 7). */
67 V T9, Tq, Ty, Th, Te, T10, Tv, T13, TD, T14, Tm, T11;
68 {
/* Leading member of each group: inputs 3, 6, 9, 12. */
69 V T8, Tp, Tx, Tg;
70 T8 = LD(&(x[WS(rs, 3)]), ms, &(x[WS(rs, 1)]));
71 T9 = BYTWJ(&(W[TWVL * 4]), T8);
72 Tp = LD(&(x[WS(rs, 6)]), ms, &(x[0]));
73 Tq = BYTWJ(&(W[TWVL * 10]), Tp);
74 Tx = LD(&(x[WS(rs, 9)]), ms, &(x[WS(rs, 1)]));
75 Ty = BYTWJ(&(W[TWVL * 16]), Tx);
76 Tg = LD(&(x[WS(rs, 12)]), ms, &(x[0]));
77 Th = BYTWJ(&(W[TWVL * 22]), Tg);
78 }
79 {
/* Inputs 8 and 13: sum in Te, difference (13 minus 8) in T10. */
80 V Tb, Td, Ta, Tc;
81 Ta = LD(&(x[WS(rs, 8)]), ms, &(x[0]));
82 Tb = BYTWJ(&(W[TWVL * 14]), Ta);
83 Tc = LD(&(x[WS(rs, 13)]), ms, &(x[WS(rs, 1)]));
84 Td = BYTWJ(&(W[TWVL * 24]), Tc);
85 Te = VADD(Tb, Td);
86 T10 = VSUB(Td, Tb);
87 }
88 {
/* Inputs 11 and 1: sum in Tv, difference (1 minus 11) in T13. */
89 V Ts, Tu, Tr, Tt;
90 Tr = LD(&(x[WS(rs, 11)]), ms, &(x[WS(rs, 1)]));
91 Ts = BYTWJ(&(W[TWVL * 20]), Tr);
92 Tt = LD(&(x[WS(rs, 1)]), ms, &(x[WS(rs, 1)]));
93 Tu = BYTWJ(&(W[0]), Tt);
94 Tv = VADD(Ts, Tu);
95 T13 = VSUB(Tu, Ts);
96 }
97 {
/* Inputs 14 and 4: sum in TD, difference (4 minus 14) in T14. */
98 V TA, TC, Tz, TB;
99 Tz = LD(&(x[WS(rs, 14)]), ms, &(x[0]));
100 TA = BYTWJ(&(W[TWVL * 26]), Tz);
101 TB = LD(&(x[WS(rs, 4)]), ms, &(x[0]));
102 TC = BYTWJ(&(W[TWVL * 6]), TB);
103 TD = VADD(TA, TC);
104 T14 = VSUB(TC, TA);
105 }
106 {
/* Inputs 2 and 7: sum in Tm, difference (7 minus 2) in T11. */
107 V Tj, Tl, Ti, Tk;
108 Ti = LD(&(x[WS(rs, 2)]), ms, &(x[0]));
109 Tj = BYTWJ(&(W[TWVL * 2]), Ti);
110 Tk = LD(&(x[WS(rs, 7)]), ms, &(x[WS(rs, 1)]));
111 Tl = BYTWJ(&(W[TWVL * 12]), Tk);
112 Tm = VADD(Tj, Tl);
113 T11 = VSUB(Tl, Tj);
114 }
/* Combine the per-group sums and differences into the values shared by
 * both output stages. */
115 T12 = VSUB(T10, T11);
116 T15 = VSUB(T13, T14);
117 Tf = VADD(T9, Te);
118 Tn = VADD(Th, Tm);
119 To = VADD(Tf, Tn);
120 T1c = VADD(T10, T11);
121 T1d = VADD(T13, T14);
122 T1e = VADD(T1c, T1d);
123 TQ = VFNMS(LDK(KP500000000), Te, T9);
124 TR = VFNMS(LDK(KP500000000), Tm, Th);
125 TS = VADD(TQ, TR);
126 Tw = VADD(Tq, Tv);
127 TE = VADD(Ty, TD);
128 TF = VADD(Tw, TE);
129 TT = VFNMS(LDK(KP500000000), Tv, Tq);
130 TU = VFNMS(LDK(KP500000000), TD, Ty);
131 TV = VADD(TT, TU);
132 }
133 {
/* Output stage 1: writes x[0] and elements 6, 9, 3, 12, built from T7 and
 * the full-group sums Tf, Tn, Tw, TE. */
134 V TI, TG, TH, TM, TO, TK, TL, TN, TJ;
135 TI = VSUB(To, TF);
136 TG = VADD(To, TF);
137 TH = VFNMS(LDK(KP250000000), TG, T7);
138 TK = VSUB(Tw, TE);
139 TL = VSUB(Tf, Tn);
140 TM = VMUL(LDK(KP951056516), VFNMS(LDK(KP618033988), TL, TK));
141 TO = VMUL(LDK(KP951056516), VFMA(LDK(KP618033988), TK, TL));
142 ST(&(x[0]), VADD(T7, TG), ms, &(x[0]));
143 TN = VFMA(LDK(KP559016994), TI, TH);
144 ST(&(x[WS(rs, 6)]), VFNMSI(TO, TN), ms, &(x[0]));
145 ST(&(x[WS(rs, 9)]), VFMAI(TO, TN), ms, &(x[WS(rs, 1)]));
146 TJ = VFNMS(LDK(KP559016994), TI, TH);
147 ST(&(x[WS(rs, 3)]), VFNMSI(TM, TJ), ms, &(x[WS(rs, 1)]));
148 ST(&(x[WS(rs, 12)]), VFMAI(TM, TJ), ms, &(x[0]));
149 }
150 {
/* Output stage 2: writes the other ten elements (5, 10, 1, 14, 8, 7, 13,
 * 2, 11, 4) from TP, T1b and the half-weighted / difference terms. */
151 V T16, T1m, T1u, T1h, T1p, T1a, T1o, TZ, T1t, T1l, T1f, T1g;
152 T16 = VFMA(LDK(KP618033988), T15, T12);
153 T1m = VFNMS(LDK(KP618033988), T12, T15);
154 T1u = VMUL(LDK(KP866025403), VADD(T1b, T1e));
155 T1f = VFNMS(LDK(KP250000000), T1e, T1b);
156 T1g = VSUB(T1c, T1d);
157 T1h = VFMA(LDK(KP559016994), T1g, T1f);
158 T1p = VFNMS(LDK(KP559016994), T1g, T1f);
159 {
160 V T18, T19, TY, TW, TX;
161 T18 = VSUB(TQ, TR);
162 T19 = VSUB(TT, TU);
163 T1a = VFMA(LDK(KP618033988), T19, T18);
164 T1o = VFNMS(LDK(KP618033988), T18, T19);
165 TY = VSUB(TS, TV);
166 TW = VADD(TS, TV);
167 TX = VFNMS(LDK(KP250000000), TW, TP);
168 TZ = VFMA(LDK(KP559016994), TY, TX);
169 T1t = VADD(TP, TW);
170 T1l = VFNMS(LDK(KP559016994), TY, TX);
171 }
172 {
/* Each pair of stores shares its two operands and differs only in using
 * VFNMSI versus VFMAI. */
173 V T17, T1i, T1r, T1s;
174 ST(&(x[WS(rs, 5)]), VFNMSI(T1u, T1t), ms, &(x[WS(rs, 1)]));
175 ST(&(x[WS(rs, 10)]), VFMAI(T1u, T1t), ms, &(x[0]));
176 T17 = VFMA(LDK(KP823639103), T16, TZ);
177 T1i = VMUL(LDK(KP951056516), VFNMS(LDK(KP910592997), T1h, T1a));
178 ST(&(x[WS(rs, 1)]), VFNMSI(T1i, T17), ms, &(x[WS(rs, 1)]));
179 ST(&(x[WS(rs, 14)]), VFMAI(T1i, T17), ms, &(x[0]));
180 T1r = VFNMS(LDK(KP823639103), T1m, T1l);
181 T1s = VMUL(LDK(KP951056516), VFMA(LDK(KP910592997), T1p, T1o));
182 ST(&(x[WS(rs, 8)]), VFNMSI(T1s, T1r), ms, &(x[0]));
183 ST(&(x[WS(rs, 7)]), VFMAI(T1s, T1r), ms, &(x[WS(rs, 1)]));
184 {
185 V T1n, T1q, T1j, T1k;
186 T1n = VFMA(LDK(KP823639103), T1m, T1l);
187 T1q = VMUL(LDK(KP951056516), VFNMS(LDK(KP910592997), T1p, T1o));
188 ST(&(x[WS(rs, 13)]), VFNMSI(T1q, T1n), ms, &(x[WS(rs, 1)]));
189 ST(&(x[WS(rs, 2)]), VFMAI(T1q, T1n), ms, &(x[0]));
190 T1j = VFNMS(LDK(KP823639103), T16, TZ);
191 T1k = VMUL(LDK(KP951056516), VFMA(LDK(KP910592997), T1h, T1a));
192 ST(&(x[WS(rs, 11)]), VFNMSI(T1k, T1j), ms, &(x[WS(rs, 1)]));
193 ST(&(x[WS(rs, 4)]), VFMAI(T1k, T1j), ms, &(x[0]));
194 }
195 }
196 }
197 }
198 }
/* Opaque macro invoked once after the loop, before returning. */
199 VLEAVE();
200 }
201
/*
 * Twiddle instruction table for the FMA variant of t1fv_15: fourteen
 * VTW(0, k) entries for k = 1..14, followed by a final {TW_NEXT, VL, 0}
 * entry.  It is referenced only from the desc initializer below.
 * NOTE(review): VTW and TW_NEXT are defined outside this file; presumably
 * the table tells the planner how to lay out W for BYTWJ -- confirm
 * against the tw_instr definition.
 */
202 static const tw_instr twinstr[] = {
203 VTW(0, 1),
204 VTW(0, 2),
205 VTW(0, 3),
206 VTW(0, 4),
207 VTW(0, 5),
208 VTW(0, 6),
209 VTW(0, 7),
210 VTW(0, 8),
211 VTW(0, 9),
212 VTW(0, 10),
213 VTW(0, 11),
214 VTW(0, 12),
215 VTW(0, 13),
216 VTW(0, 14),
/* Last entry of the table. */
217 {TW_NEXT, VL, 0}
218 };
219
/*
 * Descriptor passed to the registration call for the FMA variant: leading
 * field 15, the name string "t1fv_15", the twinstr table, &GENUS, and the
 * counts {50, 35, 42, 0}.  The first three counts equal the "50 additions,
 * 35 multiplications, 42 fused multiply/add" figures quoted in the
 * generator comment above this variant.
 * NOTE(review): the meaning of the fourth count and of the three trailing
 * zeros is not visible here -- see the ct_desc declaration.
 */
220 static const ct_desc desc = { 15, XSIMD_STRING("t1fv_15"), twinstr, &GENUS, {50, 35, 42, 0}, 0, 0, 0 };
221
/*
 * Registration entry point for the FMA variant.  Its only action is to call
 * X(kdft_dit_register) with the planner p, the t1fv_15 function and the
 * address of desc.  The exported symbol name is produced by the XSIMD()
 * macro, which is defined outside this file.
 */
222 void XSIMD(codelet_t1fv_15) (planner *p) {
223 X(kdft_dit_register) (p, t1fv_15, &desc);
224 }
225 #else
226
227 /* Generated by: ../../../genfft/gen_twiddle_c.native -simd -compact -variables 4 -pipeline-latency 8 -n 15 -name t1fv_15 -include dft/simd/t1f.h */
228
229 /*
230 * This function contains 92 FP additions, 53 FP multiplications,
231 * (or, 78 additions, 39 multiplications, 14 fused multiply/add),
232 * 52 stack variables, 10 constants, and 30 memory accesses
233 */
234 #include "dft/simd/t1f.h"
235
/*
 * t1fv_15 -- variant compiled when neither ARCH_PREFERS_FMA nor
 * ISA_EXTENSION_PREFERS_FMA is defined (the #else branch).
 *
 * On every loop pass the body loads x[0] and x[WS(rs, k)] for k = 1..14,
 * passes each of the fourteen k != 0 inputs through BYTWJ together with
 * &W[TWVL * 2 * (k - 1)], combines the fifteen values using the constants
 * declared with DVK below, and stores fifteen results back to the same
 * fifteen x locations (the transform is done in place).
 *
 * Parameters:
 *   ri      base pointer; every load and store goes through x = ri.
 *   ii      not referenced by name anywhere in this body.
 *   W       read-only table consumed by BYTWJ; pre-offset by
 *           mb * ((TWVL / VL) * 28) and advanced by TWVL * 28 per pass.
 *   rs      stride handed to WS(rs, k) to address element k.
 *   mb, me  the loop runs m = mb; m < me; m += VL.
 *   ms      x advances by VL * ms per pass; also forwarded to LD and ST.
 *
 * NOTE(review): LD, ST, BYTWJ, VADD, VSUB, VMUL, VFMA, VFNMS, VBYI, LDK,
 * DVK, WS, MAKE_VOLATILE_STRIDE and VLEAVE are defined outside this file.
 * Presumably VFMA(a, b, c) = a*b + c, VFNMS(a, b, c) = c - a*b and VBYI(v)
 * multiplies v by i -- confirm against the SIMD support headers.
 *
 * This code is generator output; regenerate it rather than hand-editing.
 */
236 static void t1fv_15(R *ri, R *ii, const R *W, stride rs, INT mb, INT me, INT ms)
237 {
/* Constants.  KP500000000 = 1/2 and KP250000000 = 1/4 exactly; numerically
 * KP866025403 ~ sqrt(3)/2, KP559016994 ~ sqrt(5)/4,
 * KP951056516 ~ sin(2*pi/5), KP587785252 ~ sin(pi/5),
 * KP216506350 ~ sqrt(3)/8, KP484122918 ~ sqrt(15)/8,
 * KP823639103 ~ (sqrt(3)/2)*sin(2*pi/5),
 * KP509036960 ~ (sqrt(3)/2)*sin(pi/5). */
238 DVK(KP216506350, +0.216506350946109661690930792688234045867850657);
239 DVK(KP484122918, +0.484122918275927110647408174972799951354115213);
240 DVK(KP866025403, +0.866025403784438646763723170752936183471402627);
241 DVK(KP509036960, +0.509036960455127183450980863393907648510733164);
242 DVK(KP823639103, +0.823639103546331925877420039278190003029660514);
243 DVK(KP587785252, +0.587785252292473129168705954639072768597652438);
244 DVK(KP951056516, +0.951056516295153572116439333379382143405698634);
245 DVK(KP250000000, +0.250000000000000000000000000000000000000000000);
246 DVK(KP559016994, +0.559016994374947424102293417182819058860154590);
247 DVK(KP500000000, +0.500000000000000000000000000000000000000000000);
248 {
249 INT m;
250 R *x;
251 x = ri;
/* Per pass: m += VL, x += VL * ms, W += TWVL * 28.  The fourteen BYTWJ
 * reads inside the body use offsets TWVL*0, TWVL*2, ..., TWVL*26. */
252 for (m = mb, W = W + (mb * ((TWVL / VL) * 28)); m < me; m = m + VL, x = x + (VL * ms), W = W + (TWVL * 28), MAKE_VOLATILE_STRIDE(15, rs)) {
/* Values that outlive the load stage and feed the two output stages. */
253 V T1e, T7, TP, T12, T15, Tf, Tn, To, T1b, T1c, T1f, TQ, TR, TS, Tw;
254 V TE, TF, TT, TU, TV;
255 {
/* Inputs 0, 5 and 10: T3/T5 are the twiddled inputs 5 and 10, T6 their
 * sum, T1e their difference (10 minus 5), T7 = input 0 + T6. */
256 V T1, T5, T3, T4, T2, T6;
257 T1 = LD(&(x[0]), ms, &(x[0]));
258 T4 = LD(&(x[WS(rs, 10)]), ms, &(x[0]));
259 T5 = BYTWJ(&(W[TWVL * 18]), T4);
260 T2 = LD(&(x[WS(rs, 5)]), ms, &(x[WS(rs, 1)]));
261 T3 = BYTWJ(&(W[TWVL * 8]), T2);
262 T1e = VSUB(T5, T3);
263 T6 = VADD(T3, T5);
264 T7 = VADD(T1, T6);
265 TP = VFNMS(LDK(KP500000000), T6, T1);
266 }
267 {
/* Remaining twelve inputs, handled as four groups of three:
 * (3; 8, 13), (6; 11, 1), (9; 14, 4), (12; 2, 7). */
268 V T9, Tq, Ty, Th, Te, T13, Tv, T10, TD, T11, Tm, T14;
269 {
/* Leading member of each group: inputs 3, 6, 9, 12. */
270 V T8, Tp, Tx, Tg;
271 T8 = LD(&(x[WS(rs, 3)]), ms, &(x[WS(rs, 1)]));
272 T9 = BYTWJ(&(W[TWVL * 4]), T8);
273 Tp = LD(&(x[WS(rs, 6)]), ms, &(x[0]));
274 Tq = BYTWJ(&(W[TWVL * 10]), Tp);
275 Tx = LD(&(x[WS(rs, 9)]), ms, &(x[WS(rs, 1)]));
276 Ty = BYTWJ(&(W[TWVL * 16]), Tx);
277 Tg = LD(&(x[WS(rs, 12)]), ms, &(x[0]));
278 Th = BYTWJ(&(W[TWVL * 22]), Tg);
279 }
280 {
/* Inputs 8 and 13: sum in Te, difference (13 minus 8) in T13. */
281 V Tb, Td, Ta, Tc;
282 Ta = LD(&(x[WS(rs, 8)]), ms, &(x[0]));
283 Tb = BYTWJ(&(W[TWVL * 14]), Ta);
284 Tc = LD(&(x[WS(rs, 13)]), ms, &(x[WS(rs, 1)]));
285 Td = BYTWJ(&(W[TWVL * 24]), Tc);
286 Te = VADD(Tb, Td);
287 T13 = VSUB(Td, Tb);
288 }
289 {
/* Inputs 11 and 1: sum in Tv, difference (1 minus 11) in T10. */
290 V Ts, Tu, Tr, Tt;
291 Tr = LD(&(x[WS(rs, 11)]), ms, &(x[WS(rs, 1)]));
292 Ts = BYTWJ(&(W[TWVL * 20]), Tr);
293 Tt = LD(&(x[WS(rs, 1)]), ms, &(x[WS(rs, 1)]));
294 Tu = BYTWJ(&(W[0]), Tt);
295 Tv = VADD(Ts, Tu);
296 T10 = VSUB(Tu, Ts);
297 }
298 {
/* Inputs 14 and 4: sum in TD, difference (4 minus 14) in T11. */
299 V TA, TC, Tz, TB;
300 Tz = LD(&(x[WS(rs, 14)]), ms, &(x[0]));
301 TA = BYTWJ(&(W[TWVL * 26]), Tz);
302 TB = LD(&(x[WS(rs, 4)]), ms, &(x[0]));
303 TC = BYTWJ(&(W[TWVL * 6]), TB);
304 TD = VADD(TA, TC);
305 T11 = VSUB(TC, TA);
306 }
307 {
/* Inputs 2 and 7: sum in Tm, difference (7 minus 2) in T14. */
308 V Tj, Tl, Ti, Tk;
309 Ti = LD(&(x[WS(rs, 2)]), ms, &(x[0]));
310 Tj = BYTWJ(&(W[TWVL * 2]), Ti);
311 Tk = LD(&(x[WS(rs, 7)]), ms, &(x[WS(rs, 1)]));
312 Tl = BYTWJ(&(W[TWVL * 12]), Tk);
313 Tm = VADD(Tj, Tl);
314 T14 = VSUB(Tl, Tj);
315 }
/* Combine the per-group sums and differences into the values shared by
 * both output stages. */
316 T12 = VSUB(T10, T11);
317 T15 = VSUB(T13, T14);
318 Tf = VADD(T9, Te);
319 Tn = VADD(Th, Tm);
320 To = VADD(Tf, Tn);
321 T1b = VADD(T13, T14);
322 T1c = VADD(T10, T11);
323 T1f = VADD(T1b, T1c);
324 TQ = VFNMS(LDK(KP500000000), Te, T9);
325 TR = VFNMS(LDK(KP500000000), Tm, Th);
326 TS = VADD(TQ, TR);
327 Tw = VADD(Tq, Tv);
328 TE = VADD(Ty, TD);
329 TF = VADD(Tw, TE);
330 TT = VFNMS(LDK(KP500000000), Tv, Tq);
331 TU = VFNMS(LDK(KP500000000), TD, Ty);
332 TV = VADD(TT, TU);
333 }
334 {
/* Output stage 1: writes x[0] and elements 6, 9, 3, 12, built from T7 and
 * the full-group sums Tf, Tn, Tw, TE. */
335 V TI, TG, TH, TM, TO, TK, TL, TN, TJ;
336 TI = VMUL(LDK(KP559016994), VSUB(To, TF));
337 TG = VADD(To, TF);
338 TH = VFNMS(LDK(KP250000000), TG, T7);
339 TK = VSUB(Tw, TE);
340 TL = VSUB(Tf, Tn);
341 TM = VBYI(VFNMS(LDK(KP587785252), TL, VMUL(LDK(KP951056516), TK)));
342 TO = VBYI(VFMA(LDK(KP951056516), TL, VMUL(LDK(KP587785252), TK)));
343 ST(&(x[0]), VADD(T7, TG), ms, &(x[0]));
344 TN = VADD(TI, TH);
345 ST(&(x[WS(rs, 6)]), VSUB(TN, TO), ms, &(x[0]));
346 ST(&(x[WS(rs, 9)]), VADD(TO, TN), ms, &(x[WS(rs, 1)]));
347 TJ = VSUB(TH, TI);
348 ST(&(x[WS(rs, 3)]), VSUB(TJ, TM), ms, &(x[WS(rs, 1)]));
349 ST(&(x[WS(rs, 12)]), VADD(TM, TJ), ms, &(x[0]));
350 }
351 {
/* Output stage 2: writes the other ten elements (5, 10, 8, 7, 11, 4, 14,
 * 1, 13, 2) from TP, T1e and the half-weighted / difference terms. */
352 V T16, T1m, T1u, T1h, T1o, T1a, T1p, TZ, T1t, T1l, T1d, T1g;
353 T16 = VFNMS(LDK(KP509036960), T15, VMUL(LDK(KP823639103), T12));
354 T1m = VFMA(LDK(KP823639103), T15, VMUL(LDK(KP509036960), T12));
355 T1u = VBYI(VMUL(LDK(KP866025403), VADD(T1e, T1f)));
356 T1d = VMUL(LDK(KP484122918), VSUB(T1b, T1c));
357 T1g = VFNMS(LDK(KP216506350), T1f, VMUL(LDK(KP866025403), T1e));
358 T1h = VSUB(T1d, T1g);
359 T1o = VADD(T1d, T1g);
360 {
361 V T18, T19, TY, TW, TX;
362 T18 = VSUB(TT, TU);
363 T19 = VSUB(TQ, TR);
364 T1a = VFNMS(LDK(KP587785252), T19, VMUL(LDK(KP951056516), T18));
365 T1p = VFMA(LDK(KP951056516), T19, VMUL(LDK(KP587785252), T18));
366 TY = VMUL(LDK(KP559016994), VSUB(TS, TV));
367 TW = VADD(TS, TV);
368 TX = VFNMS(LDK(KP250000000), TW, TP);
369 TZ = VSUB(TX, TY);
370 T1t = VADD(TP, TW);
371 T1l = VADD(TY, TX);
372 }
373 {
/* Each pair of stores shares its two operands; one store subtracts the
 * VBYI term and the other adds it. */
374 V T17, T1i, T1r, T1s;
375 ST(&(x[WS(rs, 5)]), VSUB(T1t, T1u), ms, &(x[WS(rs, 1)]));
376 ST(&(x[WS(rs, 10)]), VADD(T1t, T1u), ms, &(x[0]));
377 T17 = VSUB(TZ, T16);
378 T1i = VBYI(VSUB(T1a, T1h));
379 ST(&(x[WS(rs, 8)]), VSUB(T17, T1i), ms, &(x[0]));
380 ST(&(x[WS(rs, 7)]), VADD(T17, T1i), ms, &(x[WS(rs, 1)]));
381 T1r = VSUB(T1l, T1m);
382 T1s = VBYI(VADD(T1p, T1o));
383 ST(&(x[WS(rs, 11)]), VSUB(T1r, T1s), ms, &(x[WS(rs, 1)]));
384 ST(&(x[WS(rs, 4)]), VADD(T1r, T1s), ms, &(x[0]));
385 {
386 V T1n, T1q, T1j, T1k;
387 T1n = VADD(T1l, T1m);
388 T1q = VBYI(VSUB(T1o, T1p));
389 ST(&(x[WS(rs, 14)]), VSUB(T1n, T1q), ms, &(x[0]));
390 ST(&(x[WS(rs, 1)]), VADD(T1n, T1q), ms, &(x[WS(rs, 1)]));
391 T1j = VADD(TZ, T16);
392 T1k = VBYI(VADD(T1a, T1h));
393 ST(&(x[WS(rs, 13)]), VSUB(T1j, T1k), ms, &(x[WS(rs, 1)]));
394 ST(&(x[WS(rs, 2)]), VADD(T1j, T1k), ms, &(x[0]));
395 }
396 }
397 }
398 }
399 }
/* Opaque macro invoked once after the loop, before returning. */
400 VLEAVE();
401 }
402
/*
 * Twiddle instruction table for the non-FMA variant of t1fv_15: fourteen
 * VTW(0, k) entries for k = 1..14, followed by a final {TW_NEXT, VL, 0}
 * entry.  It is referenced only from the desc initializer below.
 * NOTE(review): VTW and TW_NEXT are defined outside this file; presumably
 * the table tells the planner how to lay out W for BYTWJ -- confirm
 * against the tw_instr definition.
 */
403 static const tw_instr twinstr[] = {
404 VTW(0, 1),
405 VTW(0, 2),
406 VTW(0, 3),
407 VTW(0, 4),
408 VTW(0, 5),
409 VTW(0, 6),
410 VTW(0, 7),
411 VTW(0, 8),
412 VTW(0, 9),
413 VTW(0, 10),
414 VTW(0, 11),
415 VTW(0, 12),
416 VTW(0, 13),
417 VTW(0, 14),
/* Last entry of the table. */
418 {TW_NEXT, VL, 0}
419 };
420
/*
 * Descriptor passed to the registration call for the non-FMA variant:
 * leading field 15, the name string "t1fv_15", the twinstr table, &GENUS,
 * and the counts {78, 39, 14, 0}.  The first three counts equal the
 * "78 additions, 39 multiplications, 14 fused multiply/add" figures quoted
 * in the generator comment above this variant.
 * NOTE(review): the meaning of the fourth count and of the three trailing
 * zeros is not visible here -- see the ct_desc declaration.
 */
421 static const ct_desc desc = { 15, XSIMD_STRING("t1fv_15"), twinstr, &GENUS, {78, 39, 14, 0}, 0, 0, 0 };
422
/*
 * Registration entry point for the non-FMA variant.  Its only action is to
 * call X(kdft_dit_register) with the planner p, the t1fv_15 function and
 * the address of desc.  The exported symbol name is produced by the XSIMD()
 * macro, which is defined outside this file.
 */
423 void XSIMD(codelet_t1fv_15) (planner *p) {
424 X(kdft_dit_register) (p, t1fv_15, &desc);
425 }
426 #endif