comparison src/fftw-3.3.8/dft/simd/common/t1bv_12.c @ 167:bd3cc4d1df30

Add FFTW 3.3.8 source, and a Linux build
author Chris Cannam <cannam@all-day-breakfast.com>
date Tue, 19 Nov 2019 14:52:55 +0000
parents
children
comparison
equal deleted inserted replaced
166:cbd6d7e562c7 167:bd3cc4d1df30
1 /*
2 * Copyright (c) 2003, 2007-14 Matteo Frigo
3 * Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
4 *
5 * This program is free software; you can redistribute it and/or modify
6 * it under the terms of the GNU General Public License as published by
7 * the Free Software Foundation; either version 2 of the License, or
8 * (at your option) any later version.
9 *
10 * This program is distributed in the hope that it will be useful,
11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 * GNU General Public License for more details.
14 *
15 * You should have received a copy of the GNU General Public License
16 * along with this program; if not, write to the Free Software
17 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
18 *
19 */
20
21 /* This file was automatically generated --- DO NOT EDIT */
22 /* Generated on Thu May 24 08:05:58 EDT 2018 */
23
24 #include "dft/codelet-dft.h"
25
26 #if defined(ARCH_PREFERS_FMA) || defined(ISA_EXTENSION_PREFERS_FMA)
27
28 /* Generated by: ../../../genfft/gen_twiddle_c.native -fma -simd -compact -variables 4 -pipeline-latency 8 -n 12 -name t1bv_12 -include dft/simd/t1b.h -sign 1 */
29
30 /*
31 * This function contains 59 FP additions, 42 FP multiplications,
32 * (or, 41 additions, 24 multiplications, 18 fused multiply/add),
33 * 28 stack variables, 2 constants, and 24 memory accesses
34 */
35 #include "dft/simd/t1b.h"
36
/*
 * t1bv_12, FMA variant: the 12-point twiddle codelet emitted by genfft for
 * "-fma -simd -n 12 -sign 1" (see the "Generated by" comment above).
 *
 * Parameters, as used by the visible code:
 *   ri     - never referenced in this body.
 *   ii     - base pointer of the data; copied into x and used for every
 *            load and store.
 *   W      - twiddle table; offset once by mb * ((TWVL / VL) * 22) and then
 *            advanced by TWVL * 22 per loop iteration.
 *   rs     - stride between the 12 points, applied through WS(rs, k).
 *   mb, me - the loop starts at m = mb and runs while m < me, stepping by VL.
 *   ms     - x advances by VL * ms per iteration; also handed to LD and ST.
 *
 * Each iteration loads x[0] and x[WS(rs, 1..11)], passes points 1..11
 * through BYTW with W[TWVL * 2 * (k - 1)] (point 0 is used as loaded), and
 * stores 12 results back to the same 12 locations, i.e. it works in place.
 *
 * NOTE(review): LD, ST, BYTW, VADD, VSUB, VMUL, VFMA, VFNMS, VFMAI, VFNMSI,
 * LDK, DVK, VLEAVE and MAKE_VOLATILE_STRIDE are defined outside this file
 * (presumably reached through dft/simd/t1b.h); their exact semantics are not
 * visible here and should be confirmed there.
 */
37 static void t1bv_12(R *ri, R *ii, const R *W, stride rs, INT mb, INT me, INT ms)
38 {
/* The only two numeric constants: 0.866025403... (numerically sqrt(3)/2) and 0.5. */
39 DVK(KP866025403, +0.866025403784438646763723170752936183471402627);
40 DVK(KP500000000, +0.500000000000000000000000000000000000000000000);
41 {
42 INT m;
43 R *x;
44 x = ii;
45 for (m = mb, W = W + (mb * ((TWVL / VL) * 22)); m < me; m = m + VL, x = x + (VL * ms), W = W + (TWVL * 22), MAKE_VOLATILE_STRIDE(12, rs)) {
46 V T1, TK, T6, TA, Tq, TI, Tv, TE, T9, TL, Te, TB, Ti, TH, Tn;
47 V TD;
/* Points 0, 4, 8: sum T6, difference TK, and TA built from T1, T6 and the 0.5 constant. */
48 {
49 V T5, T3, T4, T2;
50 T1 = LD(&(x[0]), ms, &(x[0]));
51 T4 = LD(&(x[WS(rs, 8)]), ms, &(x[0]));
52 T5 = BYTW(&(W[TWVL * 14]), T4);
53 T2 = LD(&(x[WS(rs, 4)]), ms, &(x[0]));
54 T3 = BYTW(&(W[TWVL * 6]), T2);
55 TK = VSUB(T3, T5);
56 T6 = VADD(T3, T5);
57 TA = VFNMS(LDK(KP500000000), T6, T1);
58 }
/* Points 9, 5, 1: sum Tv, difference TI, and TE built from Tq, Tv and the 0.5 constant. */
59 {
60 V Tu, Ts, Tp, Tt, Tr;
61 Tp = LD(&(x[WS(rs, 9)]), ms, &(x[WS(rs, 1)]));
62 Tq = BYTW(&(W[TWVL * 16]), Tp);
63 Tt = LD(&(x[WS(rs, 5)]), ms, &(x[WS(rs, 1)]));
64 Tu = BYTW(&(W[TWVL * 8]), Tt);
65 Tr = LD(&(x[WS(rs, 1)]), ms, &(x[WS(rs, 1)]));
66 Ts = BYTW(&(W[0]), Tr);
67 TI = VSUB(Tu, Ts);
68 Tv = VADD(Ts, Tu);
69 TE = VFNMS(LDK(KP500000000), Tv, Tq);
70 }
/* Points 6, 2, 10: sum Te, difference TL, and TB built from T9, Te and the 0.5 constant. */
71 {
72 V Td, Tb, T8, Tc, Ta;
73 T8 = LD(&(x[WS(rs, 6)]), ms, &(x[0]));
74 T9 = BYTW(&(W[TWVL * 10]), T8);
75 Tc = LD(&(x[WS(rs, 2)]), ms, &(x[0]));
76 Td = BYTW(&(W[TWVL * 2]), Tc);
77 Ta = LD(&(x[WS(rs, 10)]), ms, &(x[0]));
78 Tb = BYTW(&(W[TWVL * 18]), Ta);
79 TL = VSUB(Tb, Td);
80 Te = VADD(Tb, Td);
81 TB = VFNMS(LDK(KP500000000), Te, T9);
82 }
/* Points 3, 11, 7: sum Tn, difference TH, and TD built from Ti, Tn and the 0.5 constant. */
83 {
84 V Tm, Tk, Th, Tl, Tj;
85 Th = LD(&(x[WS(rs, 3)]), ms, &(x[WS(rs, 1)]));
86 Ti = BYTW(&(W[TWVL * 4]), Th);
87 Tl = LD(&(x[WS(rs, 11)]), ms, &(x[WS(rs, 1)]));
88 Tm = BYTW(&(W[TWVL * 20]), Tl);
89 Tj = LD(&(x[WS(rs, 7)]), ms, &(x[WS(rs, 1)]));
90 Tk = BYTW(&(W[TWVL * 12]), Tj);
91 TH = VSUB(Tk, Tm);
92 Tn = VADD(Tk, Tm);
93 TD = VFNMS(LDK(KP500000000), Tn, Ti);
94 }
/* Outputs 3, 0, 9, 6: combinations of the full three-point sums T7, Tf, To, Tw. */
95 {
96 V Tg, Ty, Tx, Tz;
97 {
98 V T7, Tf, To, Tw;
99 T7 = VADD(T1, T6);
100 Tf = VADD(T9, Te);
101 Tg = VSUB(T7, Tf);
102 Ty = VADD(T7, Tf);
103 To = VADD(Ti, Tn);
104 Tw = VADD(Tq, Tv);
105 Tx = VSUB(To, Tw);
106 Tz = VADD(To, Tw);
107 }
108 ST(&(x[WS(rs, 3)]), VFNMSI(Tx, Tg), ms, &(x[WS(rs, 1)]));
109 ST(&(x[0]), VADD(Ty, Tz), ms, &(x[0]));
110 ST(&(x[WS(rs, 9)]), VFMAI(Tx, Tg), ms, &(x[WS(rs, 1)]));
111 ST(&(x[WS(rs, 6)]), VSUB(Ty, Tz), ms, &(x[0]));
112 }
/* Outputs 1, 7, 11, 5: differences TA - TB and TD - TE, each adjusted by a 0.866... scaled term. */
113 {
114 V TS, TW, TV, TX;
115 {
116 V TQ, TR, TT, TU;
117 TQ = VSUB(TA, TB);
118 TR = VADD(TH, TI);
119 TS = VFNMS(LDK(KP866025403), TR, TQ);
120 TW = VFMA(LDK(KP866025403), TR, TQ);
121 TT = VSUB(TD, TE);
122 TU = VSUB(TK, TL);
123 TV = VFMA(LDK(KP866025403), TU, TT);
124 TX = VFNMS(LDK(KP866025403), TU, TT);
125 }
126 ST(&(x[WS(rs, 1)]), VFMAI(TV, TS), ms, &(x[WS(rs, 1)]));
127 ST(&(x[WS(rs, 7)]), VFNMSI(TX, TW), ms, &(x[WS(rs, 1)]));
128 ST(&(x[WS(rs, 11)]), VFNMSI(TV, TS), ms, &(x[WS(rs, 1)]));
129 ST(&(x[WS(rs, 5)]), VFMAI(TX, TW), ms, &(x[WS(rs, 1)]));
130 }
/* Outputs 10, 4, 2, 8: sums TA + TB and TD + TE combined with 0.866... scaled TN and TP. */
131 {
132 V TG, TO, TN, TP;
133 {
134 V TC, TF, TJ, TM;
135 TC = VADD(TA, TB);
136 TF = VADD(TD, TE);
137 TG = VSUB(TC, TF);
138 TO = VADD(TC, TF);
139 TJ = VSUB(TH, TI);
140 TM = VADD(TK, TL);
141 TN = VMUL(LDK(KP866025403), VSUB(TJ, TM));
142 TP = VMUL(LDK(KP866025403), VADD(TM, TJ));
143 }
144 ST(&(x[WS(rs, 10)]), VFNMSI(TN, TG), ms, &(x[0]));
145 ST(&(x[WS(rs, 4)]), VFMAI(TP, TO), ms, &(x[0]));
146 ST(&(x[WS(rs, 2)]), VFMAI(TN, TG), ms, &(x[0]));
147 ST(&(x[WS(rs, 8)]), VFNMSI(TP, TO), ms, &(x[0]));
148 }
149 }
150 }
151 VLEAVE();
152 }
153
/*
 * Twiddle instruction table for this codelet: eleven VTW(0, k) entries for
 * k = 1..11, followed by a final {TW_NEXT, VL, 0} entry.
 * NOTE(review): presumably one entry per twiddled input of the 12-point
 * kernel; the meaning of VTW and TW_NEXT is defined elsewhere - confirm there.
 */
154 static const tw_instr twinstr[] = {
155 VTW(0, 1),
156 VTW(0, 2),
157 VTW(0, 3),
158 VTW(0, 4),
159 VTW(0, 5),
160 VTW(0, 6),
161 VTW(0, 7),
162 VTW(0, 8),
163 VTW(0, 9),
164 VTW(0, 10),
165 VTW(0, 11),
166 {TW_NEXT, VL, 0}
167 };
168
/*
 * Codelet descriptor: the value 12, the name string "t1bv_12", the twinstr
 * table, &GENUS, the counts {41, 24, 18, 0}, and three trailing zeros.
 * The first three counts equal the "41 additions, 24 multiplications,
 * 18 fused multiply/add" quoted in this variant's header comment.
 * NOTE(review): field names of ct_desc are declared elsewhere - confirm the
 * positional meaning there.
 */
169 static const ct_desc desc = { 12, XSIMD_STRING("t1bv_12"), twinstr, &GENUS, {41, 24, 18, 0}, 0, 0, 0 };
170
/*
 * Registration entry point: passes the t1bv_12 kernel and its descriptor to
 * X(kdft_dit_register) for planner p.
 */
171 void XSIMD(codelet_t1bv_12) (planner *p) {
172 X(kdft_dit_register) (p, t1bv_12, &desc);
173 }
174 #else
175
176 /* Generated by: ../../../genfft/gen_twiddle_c.native -simd -compact -variables 4 -pipeline-latency 8 -n 12 -name t1bv_12 -include dft/simd/t1b.h -sign 1 */
177
178 /*
179 * This function contains 59 FP additions, 30 FP multiplications,
180 * (or, 55 additions, 26 multiplications, 4 fused multiply/add),
181 * 28 stack variables, 2 constants, and 24 memory accesses
182 */
183 #include "dft/simd/t1b.h"
184
/*
 * t1bv_12, non-FMA variant: the 12-point twiddle codelet emitted by genfft
 * for "-simd -n 12 -sign 1" without "-fma" (see the "Generated by" comment
 * above). Apart from the four VFNMS steps that use the 0.5 constant, the
 * arithmetic is written with VADD, VSUB, VMUL and VBYI.
 *
 * Parameters, as used by the visible code:
 *   ri     - never referenced in this body.
 *   ii     - base pointer of the data; copied into x and used for every
 *            load and store.
 *   W      - twiddle table; offset once by mb * ((TWVL / VL) * 22) and then
 *            advanced by TWVL * 22 per loop iteration.
 *   rs     - stride between the 12 points, applied through WS(rs, k).
 *   mb, me - the loop starts at m = mb and runs while m < me, stepping by VL.
 *   ms     - x advances by VL * ms per iteration; also handed to LD and ST.
 *
 * Each iteration loads x[0] and x[WS(rs, 1..11)], passes points 1..11
 * through BYTW with W[TWVL * 2 * (k - 1)] (point 0 is used as loaded), and
 * stores 12 results back to the same 12 locations, i.e. it works in place.
 *
 * NOTE(review): LD, ST, BYTW, VADD, VSUB, VMUL, VBYI, VFNMS, LDK, DVK,
 * VLEAVE and MAKE_VOLATILE_STRIDE are defined outside this file (presumably
 * reached through dft/simd/t1b.h); their exact semantics are not visible
 * here and should be confirmed there.
 */
185 static void t1bv_12(R *ri, R *ii, const R *W, stride rs, INT mb, INT me, INT ms)
186 {
/* The only two numeric constants: 0.866025403... (numerically sqrt(3)/2) and 0.5. */
187 DVK(KP866025403, +0.866025403784438646763723170752936183471402627);
188 DVK(KP500000000, +0.500000000000000000000000000000000000000000000);
189 {
190 INT m;
191 R *x;
192 x = ii;
193 for (m = mb, W = W + (mb * ((TWVL / VL) * 22)); m < me; m = m + VL, x = x + (VL * ms), W = W + (TWVL * 22), MAKE_VOLATILE_STRIDE(12, rs)) {
194 V T1, Tt, T6, T7, TB, Tq, TC, TD, T9, Tu, Te, Tf, Tx, Tl, Ty;
195 V Tz;
/* Points 0, 4, 8: sum T6, difference Tt, and T7 built from T1, T6 and the 0.5 constant. */
196 {
197 V T5, T3, T4, T2;
198 T1 = LD(&(x[0]), ms, &(x[0]));
199 T4 = LD(&(x[WS(rs, 8)]), ms, &(x[0]));
200 T5 = BYTW(&(W[TWVL * 14]), T4);
201 T2 = LD(&(x[WS(rs, 4)]), ms, &(x[0]));
202 T3 = BYTW(&(W[TWVL * 6]), T2);
203 Tt = VSUB(T3, T5);
204 T6 = VADD(T3, T5);
205 T7 = VFNMS(LDK(KP500000000), T6, T1);
206 }
/* Points 1, 9, 5: sum TC, difference Tq, and TD built from TB, TC and the 0.5 constant. */
207 {
208 V Tn, Tp, Tm, TA, To;
209 Tm = LD(&(x[WS(rs, 1)]), ms, &(x[WS(rs, 1)]));
210 Tn = BYTW(&(W[0]), Tm);
211 TA = LD(&(x[WS(rs, 9)]), ms, &(x[WS(rs, 1)]));
212 TB = BYTW(&(W[TWVL * 16]), TA);
213 To = LD(&(x[WS(rs, 5)]), ms, &(x[WS(rs, 1)]));
214 Tp = BYTW(&(W[TWVL * 8]), To);
215 Tq = VSUB(Tn, Tp);
216 TC = VADD(Tn, Tp);
217 TD = VFNMS(LDK(KP500000000), TC, TB);
218 }
/* Points 6, 2, 10: sum Te, difference Tu, and Tf built from T9, Te and the 0.5 constant. */
219 {
220 V Td, Tb, T8, Tc, Ta;
221 T8 = LD(&(x[WS(rs, 6)]), ms, &(x[0]));
222 T9 = BYTW(&(W[TWVL * 10]), T8);
223 Tc = LD(&(x[WS(rs, 2)]), ms, &(x[0]));
224 Td = BYTW(&(W[TWVL * 2]), Tc);
225 Ta = LD(&(x[WS(rs, 10)]), ms, &(x[0]));
226 Tb = BYTW(&(W[TWVL * 18]), Ta);
227 Tu = VSUB(Tb, Td);
228 Te = VADD(Tb, Td);
229 Tf = VFNMS(LDK(KP500000000), Te, T9);
230 }
/* Points 7, 3, 11: sum Ty, difference Tl, and Tz built from Tx, Ty and the 0.5 constant. */
231 {
232 V Ti, Tk, Th, Tw, Tj;
233 Th = LD(&(x[WS(rs, 7)]), ms, &(x[WS(rs, 1)]));
234 Ti = BYTW(&(W[TWVL * 12]), Th);
235 Tw = LD(&(x[WS(rs, 3)]), ms, &(x[WS(rs, 1)]));
236 Tx = BYTW(&(W[TWVL * 4]), Tw);
237 Tj = LD(&(x[WS(rs, 11)]), ms, &(x[WS(rs, 1)]));
238 Tk = BYTW(&(W[TWVL * 20]), Tj);
239 Tl = VSUB(Ti, Tk);
240 Ty = VADD(Ti, Tk);
241 Tz = VFNMS(LDK(KP500000000), Ty, Tx);
242 }
/* Outputs 11, 5, 1, 7: T7 - Tf plus/minus a 0.866... scaled term, combined with VBYI results TF and TH. */
243 {
244 V Ts, TG, TF, TH;
245 {
246 V Tg, Tr, Tv, TE;
247 Tg = VSUB(T7, Tf);
248 Tr = VMUL(LDK(KP866025403), VSUB(Tl, Tq));
249 Ts = VSUB(Tg, Tr);
250 TG = VADD(Tg, Tr);
251 Tv = VMUL(LDK(KP866025403), VSUB(Tt, Tu));
252 TE = VSUB(Tz, TD);
253 TF = VBYI(VADD(Tv, TE));
254 TH = VBYI(VSUB(TE, Tv));
255 }
256 ST(&(x[WS(rs, 11)]), VSUB(Ts, TF), ms, &(x[WS(rs, 1)]));
257 ST(&(x[WS(rs, 5)]), VADD(TG, TH), ms, &(x[WS(rs, 1)]));
258 ST(&(x[WS(rs, 1)]), VADD(Ts, TF), ms, &(x[WS(rs, 1)]));
259 ST(&(x[WS(rs, 7)]), VSUB(TG, TH), ms, &(x[WS(rs, 1)]));
260 }
/* Outputs 3, 0, 9, 6: combinations of the full three-point sums TQ, TR, TT, TU. */
261 {
262 V TS, TW, TV, TX;
263 {
264 V TQ, TR, TT, TU;
265 TQ = VADD(T1, T6);
266 TR = VADD(T9, Te);
267 TS = VSUB(TQ, TR);
268 TW = VADD(TQ, TR);
269 TT = VADD(Tx, Ty);
270 TU = VADD(TB, TC);
271 TV = VBYI(VSUB(TT, TU));
272 TX = VADD(TT, TU);
273 }
274 ST(&(x[WS(rs, 3)]), VSUB(TS, TV), ms, &(x[WS(rs, 1)]));
275 ST(&(x[0]), VADD(TW, TX), ms, &(x[0]));
276 ST(&(x[WS(rs, 9)]), VADD(TS, TV), ms, &(x[WS(rs, 1)]));
277 ST(&(x[WS(rs, 6)]), VSUB(TW, TX), ms, &(x[0]));
278 }
/* Outputs 2, 8, 10, 4: sums T7 + Tf and Tz + TD combined with the VBYI results TK and TO. */
279 {
280 V TK, TO, TN, TP;
281 {
282 V TI, TJ, TL, TM;
283 TI = VADD(Tl, Tq);
284 TJ = VADD(Tt, Tu);
285 TK = VBYI(VMUL(LDK(KP866025403), VSUB(TI, TJ)));
286 TO = VBYI(VMUL(LDK(KP866025403), VADD(TJ, TI)));
287 TL = VADD(T7, Tf);
288 TM = VADD(Tz, TD);
289 TN = VSUB(TL, TM);
290 TP = VADD(TL, TM);
291 }
292 ST(&(x[WS(rs, 2)]), VADD(TK, TN), ms, &(x[0]));
293 ST(&(x[WS(rs, 8)]), VSUB(TP, TO), ms, &(x[0]));
294 ST(&(x[WS(rs, 10)]), VSUB(TN, TK), ms, &(x[0]));
295 ST(&(x[WS(rs, 4)]), VADD(TO, TP), ms, &(x[0]));
296 }
297 }
298 }
299 VLEAVE();
300 }
301
/*
 * Twiddle instruction table for this codelet: eleven VTW(0, k) entries for
 * k = 1..11, followed by a final {TW_NEXT, VL, 0} entry.
 * NOTE(review): presumably one entry per twiddled input of the 12-point
 * kernel; the meaning of VTW and TW_NEXT is defined elsewhere - confirm there.
 */
302 static const tw_instr twinstr[] = {
303 VTW(0, 1),
304 VTW(0, 2),
305 VTW(0, 3),
306 VTW(0, 4),
307 VTW(0, 5),
308 VTW(0, 6),
309 VTW(0, 7),
310 VTW(0, 8),
311 VTW(0, 9),
312 VTW(0, 10),
313 VTW(0, 11),
314 {TW_NEXT, VL, 0}
315 };
316
/*
 * Codelet descriptor: the value 12, the name string "t1bv_12", the twinstr
 * table, &GENUS, the counts {55, 26, 4, 0}, and three trailing zeros.
 * The first three counts equal the "55 additions, 26 multiplications,
 * 4 fused multiply/add" quoted in this variant's header comment.
 * NOTE(review): field names of ct_desc are declared elsewhere - confirm the
 * positional meaning there.
 */
317 static const ct_desc desc = { 12, XSIMD_STRING("t1bv_12"), twinstr, &GENUS, {55, 26, 4, 0}, 0, 0, 0 };
318
/*
 * Registration entry point: passes the t1bv_12 kernel and its descriptor to
 * X(kdft_dit_register) for planner p.
 */
319 void XSIMD(codelet_t1bv_12) (planner *p) {
320 X(kdft_dit_register) (p, t1bv_12, &desc);
321 }
322 #endif