comparison src/fftw-3.3.8/dft/simd/common/t1fv_12.c @ 167:bd3cc4d1df30

Add FFTW 3.3.8 source, and a Linux build
author Chris Cannam <cannam@all-day-breakfast.com>
date Tue, 19 Nov 2019 14:52:55 +0000
parents
children
comparison
equal deleted inserted replaced
166:cbd6d7e562c7 167:bd3cc4d1df30
1 /*
2 * Copyright (c) 2003, 2007-14 Matteo Frigo
3 * Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
4 *
5 * This program is free software; you can redistribute it and/or modify
6 * it under the terms of the GNU General Public License as published by
7 * the Free Software Foundation; either version 2 of the License, or
8 * (at your option) any later version.
9 *
10 * This program is distributed in the hope that it will be useful,
11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 * GNU General Public License for more details.
14 *
15 * You should have received a copy of the GNU General Public License
16 * along with this program; if not, write to the Free Software
17 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
18 *
19 */
20
21 /* This file was automatically generated --- DO NOT EDIT */
22 /* Generated on Thu May 24 08:05:28 EDT 2018 */
23
24 #include "dft/codelet-dft.h"
25
26 #if defined(ARCH_PREFERS_FMA) || defined(ISA_EXTENSION_PREFERS_FMA)
27
28 /* Generated by: ../../../genfft/gen_twiddle_c.native -fma -simd -compact -variables 4 -pipeline-latency 8 -n 12 -name t1fv_12 -include dft/simd/t1f.h */
29
30 /*
31 * This function contains 59 FP additions, 42 FP multiplications,
32 * (or, 41 additions, 24 multiplications, 18 fused multiply/add),
33 * 28 stack variables, 2 constants, and 24 memory accesses
34 */
35 #include "dft/simd/t1f.h"
36
/*
 * t1fv_12, FMA variant: this definition is the one compiled when
 * ARCH_PREFERS_FMA or ISA_EXTENSION_PREFERS_FMA is defined. Per the generator
 * command line above it is a size-12 twiddle codelet ("-n 12").
 *
 * Each loop iteration loads the twelve elements x[WS(rs, 0)] .. x[WS(rs, 11)],
 * passes elements 1..11 through BYTWJ together with an entry of W (element 0
 * is used as loaded), combines them, and stores twelve results back to the
 * same twelve locations, i.e. the data is updated in place.
 *
 * Parameters:
 *   ri     - data pointer; x is initialised from it and is the only data
 *            pointer the body touches.
 *   ii     - not referenced anywhere in this body.
 *   W      - twiddle table; offset by mb * ((TWVL / VL) * 22) before the first
 *            iteration and advanced by TWVL * 22 after each one.
 *   rs     - stride handed to WS() to address element k of the transform.
 *   mb, me - the loop runs m = mb while m < me, stepping by VL.
 *   ms     - x advances by VL * ms per iteration; also forwarded to LD/ST.
 *
 * NOTE(review): DVK, LDK, LD, ST, BYTWJ, WS, VADD, VSUB, VMUL, VFMA, VFNMS,
 * VFMAI, VFNMSI, MAKE_VOLATILE_STRIDE and VLEAVE are not defined in this file
 * (presumably they come from dft/simd/t1f.h); what is said about them below is
 * inferred from their names -- confirm against their definitions.
 */
37 static void t1fv_12(R *ri, R *ii, const R *W, stride rs, INT mb, INT me, INT ms)
38 {
/* 0.866025403784438... is numerically sqrt(3)/2; the second constant is 1/2. */
39 DVK(KP866025403, +0.866025403784438646763723170752936183471402627);
40 DVK(KP500000000, +0.500000000000000000000000000000000000000000000);
41 {
42 INT m;
43 R *x;
44 x = ri;
/* Input k (k = 1..11) is paired below with the W entry at offset TWVL * 2 * (k - 1). */
45 for (m = mb, W = W + (mb * ((TWVL / VL) * 22)); m < me; m = m + VL, x = x + (VL * ms), W = W + (TWVL * 22), MAKE_VOLATILE_STRIDE(12, rs)) {
46 V T1, TC, T6, T7, Ty, Tq, Tz, TA, T9, TD, Te, Tf, Tu, Tl, Tv;
47 V Tw;
/*
 * Inputs 0, 8, 4. T3/T5 are inputs 4/8 after BYTWJ; TC is their difference,
 * T6 their sum, and T7 = VFNMS(1/2, T6, T1) (presumably T1 - T6/2 -- confirm
 * the VFNMS operand order).
 */
48 {
49 V T5, T3, T4, T2;
50 T1 = LD(&(x[0]), ms, &(x[0]));
51 T4 = LD(&(x[WS(rs, 8)]), ms, &(x[0]));
52 T5 = BYTWJ(&(W[TWVL * 14]), T4);
53 T2 = LD(&(x[WS(rs, 4)]), ms, &(x[0]));
54 T3 = BYTWJ(&(W[TWVL * 6]), T2);
55 TC = VSUB(T5, T3);
56 T6 = VADD(T3, T5);
57 T7 = VFNMS(LDK(KP500000000), T6, T1);
58 }
/* Inputs 1, 9, 5: same three-term pattern, with Ty (input 9) in the role T1 had above. */
59 {
60 V Tn, Tp, Tm, Tx, To;
61 Tm = LD(&(x[WS(rs, 1)]), ms, &(x[WS(rs, 1)]));
62 Tn = BYTWJ(&(W[0]), Tm);
63 Tx = LD(&(x[WS(rs, 9)]), ms, &(x[WS(rs, 1)]));
64 Ty = BYTWJ(&(W[TWVL * 16]), Tx);
65 To = LD(&(x[WS(rs, 5)]), ms, &(x[WS(rs, 1)]));
66 Tp = BYTWJ(&(W[TWVL * 8]), To);
67 Tq = VSUB(Tn, Tp);
68 Tz = VADD(Tn, Tp);
69 TA = VFNMS(LDK(KP500000000), Tz, Ty);
70 }
/* Inputs 6, 2, 10: T9 (input 6) is the term the half-sum is taken from. */
71 {
72 V Td, Tb, T8, Tc, Ta;
73 T8 = LD(&(x[WS(rs, 6)]), ms, &(x[0]));
74 T9 = BYTWJ(&(W[TWVL * 10]), T8);
75 Tc = LD(&(x[WS(rs, 2)]), ms, &(x[0]));
76 Td = BYTWJ(&(W[TWVL * 2]), Tc);
77 Ta = LD(&(x[WS(rs, 10)]), ms, &(x[0]));
78 Tb = BYTWJ(&(W[TWVL * 18]), Ta);
79 TD = VSUB(Td, Tb);
80 Te = VADD(Tb, Td);
81 Tf = VFNMS(LDK(KP500000000), Te, T9);
82 }
/* Inputs 11, 3, 7: Tu (input 3) is the term the half-sum is taken from. */
83 {
84 V Ti, Tk, Th, Tt, Tj;
85 Th = LD(&(x[WS(rs, 11)]), ms, &(x[WS(rs, 1)]));
86 Ti = BYTWJ(&(W[TWVL * 20]), Th);
87 Tt = LD(&(x[WS(rs, 3)]), ms, &(x[WS(rs, 1)]));
88 Tu = BYTWJ(&(W[TWVL * 4]), Tt);
89 Tj = LD(&(x[WS(rs, 7)]), ms, &(x[WS(rs, 1)]));
90 Tk = BYTWJ(&(W[TWVL * 12]), Tj);
91 Tl = VSUB(Ti, Tk);
92 Tv = VADD(Tk, Ti);
93 Tw = VFNMS(LDK(KP500000000), Tv, Tu);
94 }
/*
 * Outputs 1, 7, 11, 5: built from the half-sum terms (T7, Tf, Tw, TA) and the
 * differences (Tl, Tq, TC, TD) scaled by KP866025403.
 */
95 {
96 V Ts, TG, TF, TH;
97 {
98 V Tg, Tr, TB, TE;
99 Tg = VSUB(T7, Tf);
100 Tr = VADD(Tl, Tq);
101 Ts = VFMA(LDK(KP866025403), Tr, Tg);
102 TG = VFNMS(LDK(KP866025403), Tr, Tg);
103 TB = VSUB(Tw, TA);
104 TE = VSUB(TC, TD);
105 TF = VFNMS(LDK(KP866025403), TE, TB);
106 TH = VFMA(LDK(KP866025403), TE, TB);
107 }
108 ST(&(x[WS(rs, 1)]), VFNMSI(TF, Ts), ms, &(x[WS(rs, 1)]));
109 ST(&(x[WS(rs, 7)]), VFMAI(TH, TG), ms, &(x[WS(rs, 1)]));
110 ST(&(x[WS(rs, 11)]), VFMAI(TF, Ts), ms, &(x[WS(rs, 1)]));
111 ST(&(x[WS(rs, 5)]), VFNMSI(TH, TG), ms, &(x[WS(rs, 1)]));
112 }
/* Outputs 9, 0, 3, 6: built only from the plain sums (T1 + T6, T9 + Te, Tu + Tv, Ty + Tz). */
113 {
114 V TS, TW, TV, TX;
115 {
116 V TQ, TR, TT, TU;
117 TQ = VADD(T1, T6);
118 TR = VADD(T9, Te);
119 TS = VSUB(TQ, TR);
120 TW = VADD(TQ, TR);
121 TT = VADD(Tu, Tv);
122 TU = VADD(Ty, Tz);
123 TV = VSUB(TT, TU);
124 TX = VADD(TT, TU);
125 }
126 ST(&(x[WS(rs, 9)]), VFNMSI(TV, TS), ms, &(x[WS(rs, 1)]));
127 ST(&(x[0]), VADD(TW, TX), ms, &(x[0]));
128 ST(&(x[WS(rs, 3)]), VFMAI(TV, TS), ms, &(x[WS(rs, 1)]));
129 ST(&(x[WS(rs, 6)]), VSUB(TW, TX), ms, &(x[0]));
130 }
/* Outputs 2, 8, 10, 4: half-sum terms combined with KP866025403 times sums/differences of Tl, Tq, TC, TD. */
131 {
132 V TK, TO, TN, TP;
133 {
134 V TI, TJ, TL, TM;
135 TI = VADD(T7, Tf);
136 TJ = VADD(Tw, TA);
137 TK = VSUB(TI, TJ);
138 TO = VADD(TI, TJ);
139 TL = VSUB(Tl, Tq);
140 TM = VADD(TC, TD);
141 TN = VMUL(LDK(KP866025403), VSUB(TL, TM));
142 TP = VMUL(LDK(KP866025403), VADD(TM, TL));
143 }
144 ST(&(x[WS(rs, 2)]), VFMAI(TN, TK), ms, &(x[0]));
145 ST(&(x[WS(rs, 8)]), VFNMSI(TP, TO), ms, &(x[0]));
146 ST(&(x[WS(rs, 10)]), VFNMSI(TN, TK), ms, &(x[0]));
147 ST(&(x[WS(rs, 4)]), VFMAI(TP, TO), ms, &(x[0]));
148 }
149 }
150 }
151 VLEAVE();
152 }
153
/*
 * Twiddle instruction table for the FMA variant of t1fv_12: eleven VTW(0, k)
 * entries for k = 1..11, closed by a {TW_NEXT, VL, 0} entry.
 * NOTE(review): presumably one entry per input that t1fv_12 passes through
 * BYTWJ (inputs 1..11) -- confirm against the VTW / tw_instr definitions,
 * which are not part of this file.
 */
154 static const tw_instr twinstr[] = {
155 VTW(0, 1),
156 VTW(0, 2),
157 VTW(0, 3),
158 VTW(0, 4),
159 VTW(0, 5),
160 VTW(0, 6),
161 VTW(0, 7),
162 VTW(0, 8),
163 VTW(0, 9),
164 VTW(0, 10),
165 VTW(0, 11),
166 {TW_NEXT, VL, 0}
167 };
168
/*
 * Descriptor handed to the planner for the FMA variant. The leading 12 matches
 * the "-n 12" generator flag, and {41, 24, 18, 0} repeats the counts quoted in
 * this branch's generator comment (41 additions, 24 multiplications, 18 fused
 * multiply/add). NOTE(review): the meaning of the remaining zero fields is
 * fixed by the ct_desc definition, which is not visible here.
 */
169 static const ct_desc desc = { 12, XSIMD_STRING("t1fv_12"), twinstr, &GENUS, {41, 24, 18, 0}, 0, 0, 0 };
170
/*
 * Registration entry point (FMA variant): passes t1fv_12 and its descriptor
 * to X(kdft_dit_register) for planner p.
 */
171 void XSIMD(codelet_t1fv_12) (planner *p) {
172 X(kdft_dit_register) (p, t1fv_12, &desc);
173 }
174 #else
175
176 /* Generated by: ../../../genfft/gen_twiddle_c.native -simd -compact -variables 4 -pipeline-latency 8 -n 12 -name t1fv_12 -include dft/simd/t1f.h */
177
178 /*
179 * This function contains 59 FP additions, 30 FP multiplications,
180 * (or, 55 additions, 26 multiplications, 4 fused multiply/add),
181 * 28 stack variables, 2 constants, and 24 memory accesses
182 */
183 #include "dft/simd/t1f.h"
184
/*
 * t1fv_12, non-FMA variant: this definition is the one compiled when neither
 * ARCH_PREFERS_FMA nor ISA_EXTENSION_PREFERS_FMA is defined. Per the generator
 * command line above it is a size-12 twiddle codelet ("-n 12").
 *
 * Each loop iteration loads the twelve elements x[WS(rs, 0)] .. x[WS(rs, 11)],
 * passes elements 1..11 through BYTWJ together with an entry of W (element 0
 * is used as loaded), combines them, and stores twelve results back to the
 * same twelve locations, i.e. the data is updated in place.
 *
 * Parameters:
 *   ri     - data pointer; x is initialised from it and is the only data
 *            pointer the body touches.
 *   ii     - not referenced anywhere in this body.
 *   W      - twiddle table; offset by mb * ((TWVL / VL) * 22) before the first
 *            iteration and advanced by TWVL * 22 after each one.
 *   rs     - stride handed to WS() to address element k of the transform.
 *   mb, me - the loop runs m = mb while m < me, stepping by VL.
 *   ms     - x advances by VL * ms per iteration; also forwarded to LD/ST.
 *
 * NOTE(review): DVK, LDK, LD, ST, BYTWJ, WS, VADD, VSUB, VMUL, VFNMS, VBYI,
 * MAKE_VOLATILE_STRIDE and VLEAVE are not defined in this file (presumably
 * they come from dft/simd/t1f.h); what is said about them below is inferred
 * from their names -- confirm against their definitions.
 */
185 static void t1fv_12(R *ri, R *ii, const R *W, stride rs, INT mb, INT me, INT ms)
186 {
/* 0.866025403784438... is numerically sqrt(3)/2; the second constant is 1/2. */
187 DVK(KP866025403, +0.866025403784438646763723170752936183471402627);
188 DVK(KP500000000, +0.500000000000000000000000000000000000000000000);
189 {
190 INT m;
191 R *x;
192 x = ri;
/* Input k (k = 1..11) is paired below with the W entry at offset TWVL * 2 * (k - 1). */
193 for (m = mb, W = W + (mb * ((TWVL / VL) * 22)); m < me; m = m + VL, x = x + (VL * ms), W = W + (TWVL * 22), MAKE_VOLATILE_STRIDE(12, rs)) {
194 V T1, TH, T6, TA, Tq, TE, Tv, TL, T9, TI, Te, TB, Ti, TD, Tn;
195 V TK;
/*
 * Inputs 0, 8, 4. T3/T5 are inputs 4/8 after BYTWJ; TH is their difference,
 * T6 their sum, and TA = VFNMS(1/2, T6, T1) (presumably T1 - T6/2 -- confirm
 * the VFNMS operand order).
 */
196 {
197 V T5, T3, T4, T2;
198 T1 = LD(&(x[0]), ms, &(x[0]));
199 T4 = LD(&(x[WS(rs, 8)]), ms, &(x[0]));
200 T5 = BYTWJ(&(W[TWVL * 14]), T4);
201 T2 = LD(&(x[WS(rs, 4)]), ms, &(x[0]));
202 T3 = BYTWJ(&(W[TWVL * 6]), T2);
203 TH = VSUB(T5, T3);
204 T6 = VADD(T3, T5);
205 TA = VFNMS(LDK(KP500000000), T6, T1);
206 }
/* Inputs 9, 5, 1: same three-term pattern, with Tq (input 9) in the role T1 had above. */
207 {
208 V Tu, Ts, Tp, Tt, Tr;
209 Tp = LD(&(x[WS(rs, 9)]), ms, &(x[WS(rs, 1)]));
210 Tq = BYTWJ(&(W[TWVL * 16]), Tp);
211 Tt = LD(&(x[WS(rs, 5)]), ms, &(x[WS(rs, 1)]));
212 Tu = BYTWJ(&(W[TWVL * 8]), Tt);
213 Tr = LD(&(x[WS(rs, 1)]), ms, &(x[WS(rs, 1)]));
214 Ts = BYTWJ(&(W[0]), Tr);
215 TE = VSUB(Tu, Ts);
216 Tv = VADD(Ts, Tu);
217 TL = VFNMS(LDK(KP500000000), Tv, Tq);
218 }
/* Inputs 6, 2, 10: T9 (input 6) is the term the half-sum is taken from. */
219 {
220 V Td, Tb, T8, Tc, Ta;
221 T8 = LD(&(x[WS(rs, 6)]), ms, &(x[0]));
222 T9 = BYTWJ(&(W[TWVL * 10]), T8);
223 Tc = LD(&(x[WS(rs, 2)]), ms, &(x[0]));
224 Td = BYTWJ(&(W[TWVL * 2]), Tc);
225 Ta = LD(&(x[WS(rs, 10)]), ms, &(x[0]));
226 Tb = BYTWJ(&(W[TWVL * 18]), Ta);
227 TI = VSUB(Td, Tb);
228 Te = VADD(Tb, Td);
229 TB = VFNMS(LDK(KP500000000), Te, T9);
230 }
/* Inputs 3, 11, 7: Ti (input 3) is the term the half-sum is taken from. */
231 {
232 V Tm, Tk, Th, Tl, Tj;
233 Th = LD(&(x[WS(rs, 3)]), ms, &(x[WS(rs, 1)]));
234 Ti = BYTWJ(&(W[TWVL * 4]), Th);
235 Tl = LD(&(x[WS(rs, 11)]), ms, &(x[WS(rs, 1)]));
236 Tm = BYTWJ(&(W[TWVL * 20]), Tl);
237 Tj = LD(&(x[WS(rs, 7)]), ms, &(x[WS(rs, 1)]));
238 Tk = BYTWJ(&(W[TWVL * 12]), Tj);
239 TD = VSUB(Tm, Tk);
240 Tn = VADD(Tk, Tm);
241 TK = VFNMS(LDK(KP500000000), Tn, Ti);
242 }
/* Outputs 9, 0, 3, 6: built only from the plain sums (T1 + T6, T9 + Te, Ti + Tn, Tq + Tv). */
243 {
244 V Tg, Ty, Tx, Tz;
245 {
246 V T7, Tf, To, Tw;
247 T7 = VADD(T1, T6);
248 Tf = VADD(T9, Te);
249 Tg = VSUB(T7, Tf);
250 Ty = VADD(T7, Tf);
251 To = VADD(Ti, Tn);
252 Tw = VADD(Tq, Tv);
253 Tx = VBYI(VSUB(To, Tw));
254 Tz = VADD(To, Tw);
255 }
256 ST(&(x[WS(rs, 9)]), VSUB(Tg, Tx), ms, &(x[WS(rs, 1)]));
257 ST(&(x[0]), VADD(Ty, Tz), ms, &(x[0]));
258 ST(&(x[WS(rs, 3)]), VADD(Tg, Tx), ms, &(x[WS(rs, 1)]));
259 ST(&(x[WS(rs, 6)]), VSUB(Ty, Tz), ms, &(x[0]));
260 }
/* Outputs 10, 4, 2, 8: half-sum terms combined with VBYI of KP866025403 times sums/differences of TD, TE, TH, TI. */
261 {
262 V TS, TW, TV, TX;
263 {
264 V TQ, TR, TT, TU;
265 TQ = VADD(TA, TB);
266 TR = VADD(TK, TL);
267 TS = VSUB(TQ, TR);
268 TW = VADD(TQ, TR);
269 TT = VADD(TD, TE);
270 TU = VADD(TH, TI);
271 TV = VBYI(VMUL(LDK(KP866025403), VSUB(TT, TU)));
272 TX = VBYI(VMUL(LDK(KP866025403), VADD(TU, TT)));
273 }
274 ST(&(x[WS(rs, 10)]), VSUB(TS, TV), ms, &(x[0]));
275 ST(&(x[WS(rs, 4)]), VADD(TW, TX), ms, &(x[0]));
276 ST(&(x[WS(rs, 2)]), VADD(TS, TV), ms, &(x[0]));
277 ST(&(x[WS(rs, 8)]), VSUB(TW, TX), ms, &(x[0]));
278 }
/* Outputs 5, 11, 7, 1: differences of the half-sum terms combined with KP866025403-scaled differences. */
279 {
280 V TG, TP, TN, TO;
281 {
282 V TC, TF, TJ, TM;
283 TC = VSUB(TA, TB);
284 TF = VMUL(LDK(KP866025403), VSUB(TD, TE));
285 TG = VSUB(TC, TF);
286 TP = VADD(TC, TF);
287 TJ = VMUL(LDK(KP866025403), VSUB(TH, TI));
288 TM = VSUB(TK, TL);
289 TN = VBYI(VADD(TJ, TM));
290 TO = VBYI(VSUB(TJ, TM));
291 }
292 ST(&(x[WS(rs, 5)]), VSUB(TG, TN), ms, &(x[WS(rs, 1)]));
293 ST(&(x[WS(rs, 11)]), VSUB(TP, TO), ms, &(x[WS(rs, 1)]));
294 ST(&(x[WS(rs, 7)]), VADD(TN, TG), ms, &(x[WS(rs, 1)]));
295 ST(&(x[WS(rs, 1)]), VADD(TO, TP), ms, &(x[WS(rs, 1)]));
296 }
297 }
298 }
299 VLEAVE();
300 }
301
/*
 * Twiddle instruction table for the non-FMA variant of t1fv_12: eleven
 * VTW(0, k) entries for k = 1..11, closed by a {TW_NEXT, VL, 0} entry.
 * NOTE(review): presumably one entry per input that t1fv_12 passes through
 * BYTWJ (inputs 1..11) -- confirm against the VTW / tw_instr definitions,
 * which are not part of this file.
 */
302 static const tw_instr twinstr[] = {
303 VTW(0, 1),
304 VTW(0, 2),
305 VTW(0, 3),
306 VTW(0, 4),
307 VTW(0, 5),
308 VTW(0, 6),
309 VTW(0, 7),
310 VTW(0, 8),
311 VTW(0, 9),
312 VTW(0, 10),
313 VTW(0, 11),
314 {TW_NEXT, VL, 0}
315 };
316
/*
 * Descriptor handed to the planner for the non-FMA variant. The leading 12
 * matches the "-n 12" generator flag, and {55, 26, 4, 0} repeats the counts
 * quoted in this branch's generator comment (55 additions, 26 multiplications,
 * 4 fused multiply/add). NOTE(review): the meaning of the remaining zero
 * fields is fixed by the ct_desc definition, which is not visible here.
 */
317 static const ct_desc desc = { 12, XSIMD_STRING("t1fv_12"), twinstr, &GENUS, {55, 26, 4, 0}, 0, 0, 0 };
318
/*
 * Registration entry point (non-FMA variant): passes t1fv_12 and its
 * descriptor to X(kdft_dit_register) for planner p.
 */
319 void XSIMD(codelet_t1fv_12) (planner *p) {
320 X(kdft_dit_register) (p, t1fv_12, &desc);
321 }
322 #endif