comparison src/fftw-3.3.8/dft/simd/common/q1fv_4.c @ 167:bd3cc4d1df30

Add FFTW 3.3.8 source, and a Linux build
author Chris Cannam <cannam@all-day-breakfast.com>
date Tue, 19 Nov 2019 14:52:55 +0000
comparing 166:cbd6d7e562c7 with 167:bd3cc4d1df30
/*
 * Copyright (c) 2003, 2007-14 Matteo Frigo
 * Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 *
 */

/* This file was automatically generated --- DO NOT EDIT */
/* Generated on Thu May 24 08:06:13 EDT 2018 */

#include "dft/codelet-dft.h"

#if defined(ARCH_PREFERS_FMA) || defined(ISA_EXTENSION_PREFERS_FMA)

/* Generated by: ../../../genfft/gen_twidsq_c.native -fma -simd -compact -variables 4 -pipeline-latency 8 -n 4 -dif -name q1fv_4 -include dft/simd/q1f.h */

/*
 * This function contains 44 FP additions, 32 FP multiplications,
 * (or, 36 additions, 24 multiplications, 8 fused multiply/add),
 * 22 stack variables, 0 constants, and 32 memory accesses
 */
#include "dft/simd/q1f.h"

static void q1fv_4(R *ri, R *ii, const R *W, stride rs, stride vs, INT mb, INT me, INT ms)
{
     {
          INT m;
          R *x;
          x = ri;
          for (m = mb, W = W + (mb * ((TWVL / VL) * 6)); m < me; m = m + VL, x = x + (VL * ms), W = W + (TWVL * 6), MAKE_VOLATILE_STRIDE(8, rs), MAKE_VOLATILE_STRIDE(8, vs)) {
               V T3, T9, TA, TG, TD, TH, T6, Ta, Te, Tk, Tp, Tv, Ts, Tw, Th;
               V Tl;
               {
                    V T1, T2, Ty, Tz;
                    T1 = LD(&(x[0]), ms, &(x[0]));
                    T2 = LD(&(x[WS(rs, 2)]), ms, &(x[0]));
                    T3 = VSUB(T1, T2);
                    T9 = VADD(T1, T2);
                    Ty = LD(&(x[WS(vs, 3)]), ms, &(x[WS(vs, 3)]));
                    Tz = LD(&(x[WS(vs, 3) + WS(rs, 2)]), ms, &(x[WS(vs, 3)]));
                    TA = VSUB(Ty, Tz);
                    TG = VADD(Ty, Tz);
               }
               {
                    V TB, TC, T4, T5;
                    TB = LD(&(x[WS(vs, 3) + WS(rs, 1)]), ms, &(x[WS(vs, 3) + WS(rs, 1)]));
                    TC = LD(&(x[WS(vs, 3) + WS(rs, 3)]), ms, &(x[WS(vs, 3) + WS(rs, 1)]));
                    TD = VSUB(TB, TC);
                    TH = VADD(TB, TC);
                    T4 = LD(&(x[WS(rs, 1)]), ms, &(x[WS(rs, 1)]));
                    T5 = LD(&(x[WS(rs, 3)]), ms, &(x[WS(rs, 1)]));
                    T6 = VSUB(T4, T5);
                    Ta = VADD(T4, T5);
               }
               {
                    V Tc, Td, Tn, To;
                    Tc = LD(&(x[WS(vs, 1)]), ms, &(x[WS(vs, 1)]));
                    Td = LD(&(x[WS(vs, 1) + WS(rs, 2)]), ms, &(x[WS(vs, 1)]));
                    Te = VSUB(Tc, Td);
                    Tk = VADD(Tc, Td);
                    Tn = LD(&(x[WS(vs, 2)]), ms, &(x[WS(vs, 2)]));
                    To = LD(&(x[WS(vs, 2) + WS(rs, 2)]), ms, &(x[WS(vs, 2)]));
                    Tp = VSUB(Tn, To);
                    Tv = VADD(Tn, To);
               }
               {
                    V Tq, Tr, Tf, Tg;
                    Tq = LD(&(x[WS(vs, 2) + WS(rs, 1)]), ms, &(x[WS(vs, 2) + WS(rs, 1)]));
                    Tr = LD(&(x[WS(vs, 2) + WS(rs, 3)]), ms, &(x[WS(vs, 2) + WS(rs, 1)]));
                    Ts = VSUB(Tq, Tr);
                    Tw = VADD(Tq, Tr);
                    Tf = LD(&(x[WS(vs, 1) + WS(rs, 1)]), ms, &(x[WS(vs, 1) + WS(rs, 1)]));
                    Tg = LD(&(x[WS(vs, 1) + WS(rs, 3)]), ms, &(x[WS(vs, 1) + WS(rs, 1)]));
                    Th = VSUB(Tf, Tg);
                    Tl = VADD(Tf, Tg);
               }
               ST(&(x[0]), VADD(T9, Ta), ms, &(x[0]));
               ST(&(x[WS(rs, 1)]), VADD(Tk, Tl), ms, &(x[WS(rs, 1)]));
               ST(&(x[WS(rs, 2)]), VADD(Tv, Tw), ms, &(x[0]));
               ST(&(x[WS(rs, 3)]), VADD(TG, TH), ms, &(x[WS(rs, 1)]));
               {
                    V T7, Ti, Tt, TE;
                    T7 = BYTWJ(&(W[0]), VFNMSI(T6, T3));
                    ST(&(x[WS(vs, 1)]), T7, ms, &(x[WS(vs, 1)]));
                    Ti = BYTWJ(&(W[0]), VFNMSI(Th, Te));
                    ST(&(x[WS(vs, 1) + WS(rs, 1)]), Ti, ms, &(x[WS(vs, 1) + WS(rs, 1)]));
                    Tt = BYTWJ(&(W[0]), VFNMSI(Ts, Tp));
                    ST(&(x[WS(vs, 1) + WS(rs, 2)]), Tt, ms, &(x[WS(vs, 1)]));
                    TE = BYTWJ(&(W[0]), VFNMSI(TD, TA));
                    ST(&(x[WS(vs, 1) + WS(rs, 3)]), TE, ms, &(x[WS(vs, 1) + WS(rs, 1)]));
               }
               {
                    V T8, Tj, Tu, TF;
                    T8 = BYTWJ(&(W[TWVL * 4]), VFMAI(T6, T3));
                    ST(&(x[WS(vs, 3)]), T8, ms, &(x[WS(vs, 3)]));
                    Tj = BYTWJ(&(W[TWVL * 4]), VFMAI(Th, Te));
                    ST(&(x[WS(vs, 3) + WS(rs, 1)]), Tj, ms, &(x[WS(vs, 3) + WS(rs, 1)]));
                    Tu = BYTWJ(&(W[TWVL * 4]), VFMAI(Ts, Tp));
                    ST(&(x[WS(vs, 3) + WS(rs, 2)]), Tu, ms, &(x[WS(vs, 3)]));
                    TF = BYTWJ(&(W[TWVL * 4]), VFMAI(TD, TA));
                    ST(&(x[WS(vs, 3) + WS(rs, 3)]), TF, ms, &(x[WS(vs, 3) + WS(rs, 1)]));
               }
               {
                    V Tb, Tm, Tx, TI;
                    Tb = BYTWJ(&(W[TWVL * 2]), VSUB(T9, Ta));
                    ST(&(x[WS(vs, 2)]), Tb, ms, &(x[WS(vs, 2)]));
                    Tm = BYTWJ(&(W[TWVL * 2]), VSUB(Tk, Tl));
                    ST(&(x[WS(vs, 2) + WS(rs, 1)]), Tm, ms, &(x[WS(vs, 2) + WS(rs, 1)]));
                    Tx = BYTWJ(&(W[TWVL * 2]), VSUB(Tv, Tw));
                    ST(&(x[WS(vs, 2) + WS(rs, 2)]), Tx, ms, &(x[WS(vs, 2)]));
                    TI = BYTWJ(&(W[TWVL * 2]), VSUB(TG, TH));
                    ST(&(x[WS(vs, 2) + WS(rs, 3)]), TI, ms, &(x[WS(vs, 2) + WS(rs, 1)]));
               }
          }
     }
     VLEAVE();
}

static const tw_instr twinstr[] = {
     VTW(0, 1),
     VTW(0, 2),
     VTW(0, 3),
     {TW_NEXT, VL, 0}
};

static const ct_desc desc = { 4, XSIMD_STRING("q1fv_4"), twinstr, &GENUS, {36, 24, 8, 0}, 0, 0, 0 };

void XSIMD(codelet_q1fv_4) (planner *p) {
     X(kdft_difsq_register) (p, q1fv_4, &desc);
}
#else

/* Generated by: ../../../genfft/gen_twidsq_c.native -simd -compact -variables 4 -pipeline-latency 8 -n 4 -dif -name q1fv_4 -include dft/simd/q1f.h */

/*
 * This function contains 44 FP additions, 24 FP multiplications,
 * (or, 44 additions, 24 multiplications, 0 fused multiply/add),
 * 22 stack variables, 0 constants, and 32 memory accesses
 */
#include "dft/simd/q1f.h"

static void q1fv_4(R *ri, R *ii, const R *W, stride rs, stride vs, INT mb, INT me, INT ms)
{
     {
          INT m;
          R *x;
          x = ri;
          for (m = mb, W = W + (mb * ((TWVL / VL) * 6)); m < me; m = m + VL, x = x + (VL * ms), W = W + (TWVL * 6), MAKE_VOLATILE_STRIDE(8, rs), MAKE_VOLATILE_STRIDE(8, vs)) {
               V T3, T9, TA, TG, TD, TH, T6, Ta, Te, Tk, Tp, Tv, Ts, Tw, Th;
               V Tl;
               {
                    V T1, T2, Ty, Tz;
                    T1 = LD(&(x[0]), ms, &(x[0]));
                    T2 = LD(&(x[WS(rs, 2)]), ms, &(x[0]));
                    T3 = VSUB(T1, T2);
                    T9 = VADD(T1, T2);
                    Ty = LD(&(x[WS(vs, 3)]), ms, &(x[WS(vs, 3)]));
                    Tz = LD(&(x[WS(vs, 3) + WS(rs, 2)]), ms, &(x[WS(vs, 3)]));
                    TA = VSUB(Ty, Tz);
                    TG = VADD(Ty, Tz);
               }
               {
                    V TB, TC, T4, T5;
                    TB = LD(&(x[WS(vs, 3) + WS(rs, 1)]), ms, &(x[WS(vs, 3) + WS(rs, 1)]));
                    TC = LD(&(x[WS(vs, 3) + WS(rs, 3)]), ms, &(x[WS(vs, 3) + WS(rs, 1)]));
                    TD = VBYI(VSUB(TB, TC));
                    TH = VADD(TB, TC);
                    T4 = LD(&(x[WS(rs, 1)]), ms, &(x[WS(rs, 1)]));
                    T5 = LD(&(x[WS(rs, 3)]), ms, &(x[WS(rs, 1)]));
                    T6 = VBYI(VSUB(T4, T5));
                    Ta = VADD(T4, T5);
               }
               {
                    V Tc, Td, Tn, To;
                    Tc = LD(&(x[WS(vs, 1)]), ms, &(x[WS(vs, 1)]));
                    Td = LD(&(x[WS(vs, 1) + WS(rs, 2)]), ms, &(x[WS(vs, 1)]));
                    Te = VSUB(Tc, Td);
                    Tk = VADD(Tc, Td);
                    Tn = LD(&(x[WS(vs, 2)]), ms, &(x[WS(vs, 2)]));
                    To = LD(&(x[WS(vs, 2) + WS(rs, 2)]), ms, &(x[WS(vs, 2)]));
                    Tp = VSUB(Tn, To);
                    Tv = VADD(Tn, To);
               }
               {
                    V Tq, Tr, Tf, Tg;
                    Tq = LD(&(x[WS(vs, 2) + WS(rs, 1)]), ms, &(x[WS(vs, 2) + WS(rs, 1)]));
                    Tr = LD(&(x[WS(vs, 2) + WS(rs, 3)]), ms, &(x[WS(vs, 2) + WS(rs, 1)]));
                    Ts = VBYI(VSUB(Tq, Tr));
                    Tw = VADD(Tq, Tr);
                    Tf = LD(&(x[WS(vs, 1) + WS(rs, 1)]), ms, &(x[WS(vs, 1) + WS(rs, 1)]));
                    Tg = LD(&(x[WS(vs, 1) + WS(rs, 3)]), ms, &(x[WS(vs, 1) + WS(rs, 1)]));
                    Th = VBYI(VSUB(Tf, Tg));
                    Tl = VADD(Tf, Tg);
               }
               ST(&(x[0]), VADD(T9, Ta), ms, &(x[0]));
               ST(&(x[WS(rs, 1)]), VADD(Tk, Tl), ms, &(x[WS(rs, 1)]));
               ST(&(x[WS(rs, 2)]), VADD(Tv, Tw), ms, &(x[0]));
               ST(&(x[WS(rs, 3)]), VADD(TG, TH), ms, &(x[WS(rs, 1)]));
               {
                    V T7, Ti, Tt, TE;
                    T7 = BYTWJ(&(W[0]), VSUB(T3, T6));
                    ST(&(x[WS(vs, 1)]), T7, ms, &(x[WS(vs, 1)]));
                    Ti = BYTWJ(&(W[0]), VSUB(Te, Th));
                    ST(&(x[WS(vs, 1) + WS(rs, 1)]), Ti, ms, &(x[WS(vs, 1) + WS(rs, 1)]));
                    Tt = BYTWJ(&(W[0]), VSUB(Tp, Ts));
                    ST(&(x[WS(vs, 1) + WS(rs, 2)]), Tt, ms, &(x[WS(vs, 1)]));
                    TE = BYTWJ(&(W[0]), VSUB(TA, TD));
                    ST(&(x[WS(vs, 1) + WS(rs, 3)]), TE, ms, &(x[WS(vs, 1) + WS(rs, 1)]));
               }
               {
                    V T8, Tj, Tu, TF;
                    T8 = BYTWJ(&(W[TWVL * 4]), VADD(T3, T6));
                    ST(&(x[WS(vs, 3)]), T8, ms, &(x[WS(vs, 3)]));
                    Tj = BYTWJ(&(W[TWVL * 4]), VADD(Te, Th));
                    ST(&(x[WS(vs, 3) + WS(rs, 1)]), Tj, ms, &(x[WS(vs, 3) + WS(rs, 1)]));
                    Tu = BYTWJ(&(W[TWVL * 4]), VADD(Tp, Ts));
                    ST(&(x[WS(vs, 3) + WS(rs, 2)]), Tu, ms, &(x[WS(vs, 3)]));
                    TF = BYTWJ(&(W[TWVL * 4]), VADD(TA, TD));
                    ST(&(x[WS(vs, 3) + WS(rs, 3)]), TF, ms, &(x[WS(vs, 3) + WS(rs, 1)]));
               }
               {
                    V Tb, Tm, Tx, TI;
                    Tb = BYTWJ(&(W[TWVL * 2]), VSUB(T9, Ta));
                    ST(&(x[WS(vs, 2)]), Tb, ms, &(x[WS(vs, 2)]));
                    Tm = BYTWJ(&(W[TWVL * 2]), VSUB(Tk, Tl));
                    ST(&(x[WS(vs, 2) + WS(rs, 1)]), Tm, ms, &(x[WS(vs, 2) + WS(rs, 1)]));
                    Tx = BYTWJ(&(W[TWVL * 2]), VSUB(Tv, Tw));
                    ST(&(x[WS(vs, 2) + WS(rs, 2)]), Tx, ms, &(x[WS(vs, 2)]));
                    TI = BYTWJ(&(W[TWVL * 2]), VSUB(TG, TH));
                    ST(&(x[WS(vs, 2) + WS(rs, 3)]), TI, ms, &(x[WS(vs, 2) + WS(rs, 1)]));
               }
          }
     }
     VLEAVE();
}

static const tw_instr twinstr[] = {
     VTW(0, 1),
     VTW(0, 2),
     VTW(0, 3),
     {TW_NEXT, VL, 0}
};

static const ct_desc desc = { 4, XSIMD_STRING("q1fv_4"), twinstr, &GENUS, {44, 24, 0, 0}, 0, 0, 0 };

void XSIMD(codelet_q1fv_4) (planner *p) {
     X(kdft_difsq_register) (p, q1fv_4, &desc);
}
#endif
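
For orientation (this note and sketch are not part of the imported FFTW sources): codelets such as q1fv_4 are never called directly. The registration function above hands the codelet to the planner, which decides whether to use it when assembling a plan. A minimal usage sketch against FFTW's public double-precision API follows; the transform size and planner flag are arbitrary illustrative choices, and whether this particular codelet ends up in the chosen plan depends on how the library was configured and what the planner selects.

/* usage-sketch.c -- illustrative only, not part of this changeset.
 * Build (assuming the Linux build added by this commit is installed):
 *   cc usage-sketch.c -lfftw3 -lm
 */
#include <fftw3.h>

int main(void)
{
     int n = 16;  /* arbitrary transform size */
     fftw_complex *in = fftw_malloc(sizeof(fftw_complex) * n);
     fftw_complex *out = fftw_malloc(sizeof(fftw_complex) * n);

     /* Plan first: FFTW_MEASURE times candidate plans and may overwrite
      * the arrays, so the input is filled in only afterwards. */
     fftw_plan p = fftw_plan_dft_1d(n, in, out, FFTW_FORWARD, FFTW_MEASURE);

     for (int i = 0; i < n; ++i) {
          in[i][0] = (double) i;  /* real part */
          in[i][1] = 0.0;         /* imaginary part */
     }

     fftw_execute(p);             /* runs whatever codelets the plan selected */

     fftw_destroy_plan(p);
     fftw_free(in);
     fftw_free(out);
     return 0;
}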