Mercurial > hg > sv-dependency-builds
comparison src/fftw-3.3.8/dft/scalar/codelets/t1_10.c @ 167:bd3cc4d1df30
Add FFTW 3.3.8 source, and a Linux build
author | Chris Cannam <cannam@all-day-breakfast.com> |
---|---|
date | Tue, 19 Nov 2019 14:52:55 +0000 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
166:cbd6d7e562c7 | 167:bd3cc4d1df30 |
---|---|
1 /* | |
2 * Copyright (c) 2003, 2007-14 Matteo Frigo | |
3 * Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology | |
4 * | |
5 * This program is free software; you can redistribute it and/or modify | |
6 * it under the terms of the GNU General Public License as published by | |
7 * the Free Software Foundation; either version 2 of the License, or | |
8 * (at your option) any later version. | |
9 * | |
10 * This program is distributed in the hope that it will be useful, | |
11 * but WITHOUT ANY WARRANTY; without even the implied warranty of | |
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | |
13 * GNU General Public License for more details. | |
14 * | |
15 * You should have received a copy of the GNU General Public License | |
16 * along with this program; if not, write to the Free Software | |
17 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA | |
18 * | |
19 */ | |
20 | |
21 /* This file was automatically generated --- DO NOT EDIT */ | |
22 /* Generated on Thu May 24 08:04:14 EDT 2018 */ | |
23 | |
24 #include "dft/codelet-dft.h" | |
25 | |
26 #if defined(ARCH_PREFERS_FMA) || defined(ISA_EXTENSION_PREFERS_FMA) | |
27 | |
28 /* Generated by: ../../../genfft/gen_twiddle.native -fma -compact -variables 4 -pipeline-latency 4 -n 10 -name t1_10 -include dft/scalar/t.h */ | |
29 | |
30 /* | |
31 * This function contains 102 FP additions, 72 FP multiplications, | |
32 * (or, 48 additions, 18 multiplications, 54 fused multiply/add), | |
33 * 47 stack variables, 4 constants, and 40 memory accesses | |
34 */ | |
35 #include "dft/scalar/t.h" | |
36 | |
/*
 * t1_10 (FMA-preferring variant): in-place pass over groups of ten points.
 *
 * ri, ii  - arrays read and then overwritten in place; point k of the current
 *           group lives at index WS(rs, k).  The arithmetic below treats
 *           (ri[k], ii[k]) as the real and imaginary parts of one value.
 * W       - table of multipliers; 18 values (nine pairs) are used per
 *           iteration, and the pointer is pre-advanced by mb * 18.
 * rs      - stride argument handed to WS() to locate the ten points.
 * mb, me  - the loop runs for m = mb, mb + 1, ..., me - 1.
 * ms      - added to ri and ii after every iteration.
 *
 * Per iteration: point 0 is taken as is, point k (k = 1..9) is combined with
 * the pair (W[2k-2], W[2k-1]), and the ten results are mixed and stored back
 * into the same ten slots.  All loads of a group happen before its stores.
 *
 * NOTE(review): the comments in this function assume FMA(a, b, c) = a*b + c
 * and FNMS(a, b, c) = c - a*b; the macro definitions are not visible in this
 * file - confirm against the included headers.
 */
37 static void t1_10(R *ri, R *ii, const R *W, stride rs, INT mb, INT me, INT ms) | |
38 { | |
/* Numerically: sin(2*pi/5), sqrt(5)/4, (sqrt(5)-1)/2 and 1/4. */
39 DK(KP951056516, +0.951056516295153572116439333379382143405698634); | |
40 DK(KP559016994, +0.559016994374947424102293417182819058860154590); | |
41 DK(KP618033988, +0.618033988749894848204586834365638117720309180); | |
42 DK(KP250000000, +0.250000000000000000000000000000000000000000000); | |
43 { | |
44 INT m; | |
/* MAKE_VOLATILE_STRIDE is defined elsewhere; its effect is not visible here. */
45 for (m = mb, W = W + (mb * 18); m < me; m = m + 1, ri = ri + ms, ii = ii + ms, W = W + 18, MAKE_VOLATILE_STRIDE(20, rs)) { | |
46 E T8, T23, T12, T1U, TM, TZ, T10, T1F, T1G, T1P, T16, T17, T18, T1s, T1x; | |
47 E T25, Tl, Ty, Tz, T1I, T1J, T1O, T13, T14, T15, T1h, T1m, T24; | |
/*
 * Points 0 and 5.  Point 5 is combined with (W[8], W[9]):
 *   T7  = W[8]*ri[5] + W[9]*ii[5],  T1S = W[8]*ii[5] - W[9]*ri[5].
 * The differences (T8, T23) feed the odd-indexed outputs, the sums
 * (T12, T1U) feed the even-indexed outputs.
 */
48 { | |
49 E T1, T1T, T3, T6, T4, T1R, T2, T7, T1S, T5; | |
50 T1 = ri[0]; | |
51 T1T = ii[0]; | |
52 T3 = ri[WS(rs, 5)]; | |
53 T6 = ii[WS(rs, 5)]; | |
54 T2 = W[8]; | |
55 T4 = T2 * T3; | |
56 T1R = T2 * T6; | |
57 T5 = W[9]; | |
58 T7 = FMA(T5, T6, T4); | |
59 T1S = FNMS(T5, T3, T1R); | |
60 T8 = T1 - T7; | |
61 T23 = T1T - T1S; | |
62 T12 = T1 + T7; | |
63 T1U = T1S + T1T; | |
64 } | |
/*
 * Points 4, 1, 9 and 6, each combined with its W pair in the same way as
 * point 5 above, followed by the pairwise sums and differences that the
 * output stages reuse.
 */
65 { | |
66 E TF, T1p, TY, T1w, TL, T1r, TS, T1u; | |
67 { | |
68 E TB, TE, TC, T1o, TA, TD; | |
69 TB = ri[WS(rs, 4)]; | |
70 TE = ii[WS(rs, 4)]; | |
71 TA = W[6]; | |
72 TC = TA * TB; | |
73 T1o = TA * TE; | |
74 TD = W[7]; | |
75 TF = FMA(TD, TE, TC); | |
76 T1p = FNMS(TD, TB, T1o); | |
77 } | |
78 { | |
79 E TU, TX, TV, T1v, TT, TW; | |
80 TU = ri[WS(rs, 1)]; | |
81 TX = ii[WS(rs, 1)]; | |
82 TT = W[0]; | |
83 TV = TT * TU; | |
84 T1v = TT * TX; | |
85 TW = W[1]; | |
86 TY = FMA(TW, TX, TV); | |
87 T1w = FNMS(TW, TU, T1v); | |
88 } | |
89 { | |
90 E TH, TK, TI, T1q, TG, TJ; | |
91 TH = ri[WS(rs, 9)]; | |
92 TK = ii[WS(rs, 9)]; | |
93 TG = W[16]; | |
94 TI = TG * TH; | |
95 T1q = TG * TK; | |
96 TJ = W[17]; | |
97 TL = FMA(TJ, TK, TI); | |
98 T1r = FNMS(TJ, TH, T1q); | |
99 } | |
100 { | |
101 E TO, TR, TP, T1t, TN, TQ; | |
102 TO = ri[WS(rs, 6)]; | |
103 TR = ii[WS(rs, 6)]; | |
104 TN = W[10]; | |
105 TP = TN * TO; | |
106 T1t = TN * TR; | |
107 TQ = W[11]; | |
108 TS = FMA(TQ, TR, TP); | |
109 T1u = FNMS(TQ, TO, T1t); | |
110 } | |
111 TM = TF - TL; | |
112 TZ = TS - TY; | |
113 T10 = TM + TZ; | |
114 T1F = T1p + T1r; | |
115 T1G = T1u + T1w; | |
116 T1P = T1F + T1G; | |
117 T16 = TF + TL; | |
118 T17 = TS + TY; | |
119 T18 = T16 + T17; | |
120 T1s = T1p - T1r; | |
121 T1x = T1u - T1w; | |
122 T25 = T1s + T1x; | |
123 } | |
/* Points 2, 3, 7 and 8: same treatment as the previous group. */
124 { | |
125 E Te, T1e, Tx, T1l, Tk, T1g, Tr, T1j; | |
126 { | |
127 E Ta, Td, Tb, T1d, T9, Tc; | |
128 Ta = ri[WS(rs, 2)]; | |
129 Td = ii[WS(rs, 2)]; | |
130 T9 = W[2]; | |
131 Tb = T9 * Ta; | |
132 T1d = T9 * Td; | |
133 Tc = W[3]; | |
134 Te = FMA(Tc, Td, Tb); | |
135 T1e = FNMS(Tc, Ta, T1d); | |
136 } | |
137 { | |
138 E Tt, Tw, Tu, T1k, Ts, Tv; | |
139 Tt = ri[WS(rs, 3)]; | |
140 Tw = ii[WS(rs, 3)]; | |
141 Ts = W[4]; | |
142 Tu = Ts * Tt; | |
143 T1k = Ts * Tw; | |
144 Tv = W[5]; | |
145 Tx = FMA(Tv, Tw, Tu); | |
146 T1l = FNMS(Tv, Tt, T1k); | |
147 } | |
148 { | |
149 E Tg, Tj, Th, T1f, Tf, Ti; | |
150 Tg = ri[WS(rs, 7)]; | |
151 Tj = ii[WS(rs, 7)]; | |
152 Tf = W[12]; | |
153 Th = Tf * Tg; | |
154 T1f = Tf * Tj; | |
155 Ti = W[13]; | |
156 Tk = FMA(Ti, Tj, Th); | |
157 T1g = FNMS(Ti, Tg, T1f); | |
158 } | |
159 { | |
160 E Tn, Tq, To, T1i, Tm, Tp; | |
161 Tn = ri[WS(rs, 8)]; | |
162 Tq = ii[WS(rs, 8)]; | |
163 Tm = W[14]; | |
164 To = Tm * Tn; | |
165 T1i = Tm * Tq; | |
166 Tp = W[15]; | |
167 Tr = FMA(Tp, Tq, To); | |
168 T1j = FNMS(Tp, Tn, T1i); | |
169 } | |
170 Tl = Te - Tk; | |
171 Ty = Tr - Tx; | |
172 Tz = Tl + Ty; | |
173 T1I = T1e + T1g; | |
174 T1J = T1j + T1l; | |
175 T1O = T1I + T1J; | |
176 T13 = Te + Tk; | |
177 T14 = Tr + Tx; | |
178 T15 = T13 + T14; | |
179 T1h = T1e - T1g; | |
180 T1m = T1j - T1l; | |
181 T24 = T1h + T1m; | |
182 } | |
/* ri outputs at odd indices 5, 7, 3, 9, 1 (built on the difference T8). */
183 { | |
184 E T1b, T11, T1a, T1z, T1B, T1n, T1y, T1A, T1c; | |
185 T1b = Tz - T10; | |
186 T11 = Tz + T10; | |
187 T1a = FNMS(KP250000000, T11, T8); | |
188 T1n = T1h - T1m; | |
189 T1y = T1s - T1x; | |
190 T1z = FMA(KP618033988, T1y, T1n); | |
191 T1B = FNMS(KP618033988, T1n, T1y); | |
192 ri[WS(rs, 5)] = T8 + T11; | |
193 T1A = FNMS(KP559016994, T1b, T1a); | |
194 ri[WS(rs, 7)] = FNMS(KP951056516, T1B, T1A); | |
195 ri[WS(rs, 3)] = FMA(KP951056516, T1B, T1A); | |
196 T1c = FMA(KP559016994, T1b, T1a); | |
197 ri[WS(rs, 9)] = FNMS(KP951056516, T1z, T1c); | |
198 ri[WS(rs, 1)] = FMA(KP951056516, T1z, T1c); | |
199 } | |
/* ii outputs at odd indices 5, 3, 7, 1, 9 (built on the difference T23). */
200 { | |
201 E T28, T26, T27, T2c, T2e, T2a, T2b, T2d, T29; | |
202 T28 = T24 - T25; | |
203 T26 = T24 + T25; | |
204 T27 = FNMS(KP250000000, T26, T23); | |
205 T2a = Tl - Ty; | |
206 T2b = TM - TZ; | |
207 T2c = FMA(KP618033988, T2b, T2a); | |
208 T2e = FNMS(KP618033988, T2a, T2b); | |
209 ii[WS(rs, 5)] = T26 + T23; | |
210 T2d = FNMS(KP559016994, T28, T27); | |
211 ii[WS(rs, 3)] = FNMS(KP951056516, T2e, T2d); | |
212 ii[WS(rs, 7)] = FMA(KP951056516, T2e, T2d); | |
213 T29 = FMA(KP559016994, T28, T27); | |
214 ii[WS(rs, 1)] = FNMS(KP951056516, T2c, T29); | |
215 ii[WS(rs, 9)] = FMA(KP951056516, T2c, T29); | |
216 } | |
/* ri outputs at even indices 0, 4, 6, 2, 8 (built on the sum T12). */
217 { | |
218 E T1D, T19, T1C, T1L, T1N, T1H, T1K, T1M, T1E; | |
219 T1D = T15 - T18; | |
220 T19 = T15 + T18; | |
221 T1C = FNMS(KP250000000, T19, T12); | |
222 T1H = T1F - T1G; | |
223 T1K = T1I - T1J; | |
224 T1L = FNMS(KP618033988, T1K, T1H); | |
225 T1N = FMA(KP618033988, T1H, T1K); | |
226 ri[0] = T12 + T19; | |
227 T1M = FMA(KP559016994, T1D, T1C); | |
228 ri[WS(rs, 4)] = FNMS(KP951056516, T1N, T1M); | |
229 ri[WS(rs, 6)] = FMA(KP951056516, T1N, T1M); | |
230 T1E = FNMS(KP559016994, T1D, T1C); | |
231 ri[WS(rs, 2)] = FNMS(KP951056516, T1L, T1E); | |
232 ri[WS(rs, 8)] = FMA(KP951056516, T1L, T1E); | |
233 } | |
/* ii outputs at even indices 0, 4, 6, 2, 8 (built on the sum T1U). */
234 { | |
235 E T1W, T1Q, T1V, T20, T22, T1Y, T1Z, T21, T1X; | |
236 T1W = T1O - T1P; | |
237 T1Q = T1O + T1P; | |
238 T1V = FNMS(KP250000000, T1Q, T1U); | |
239 T1Y = T16 - T17; | |
240 T1Z = T13 - T14; | |
241 T20 = FNMS(KP618033988, T1Z, T1Y); | |
242 T22 = FMA(KP618033988, T1Y, T1Z); | |
243 ii[0] = T1Q + T1U; | |
244 T21 = FMA(KP559016994, T1W, T1V); | |
245 ii[WS(rs, 4)] = FMA(KP951056516, T22, T21); | |
246 ii[WS(rs, 6)] = FNMS(KP951056516, T22, T21); | |
247 T1X = FNMS(KP559016994, T1W, T1V); | |
248 ii[WS(rs, 2)] = FMA(KP951056516, T20, T1X); | |
249 ii[WS(rs, 8)] = FNMS(KP951056516, T20, T1X); | |
250 } | |
251 } | |
252 } | |
253 } | |
254 | |
/*
 * Two-entry tw_instr table referenced by desc below.
 * NOTE(review): TW_FULL / TW_NEXT are defined elsewhere; presumably the first
 * entry requests the full multiplier set for n = 10 and the second terminates
 * the list - confirm against the tw_instr definition.
 */
255 static const tw_instr twinstr[] = { | |
256 {TW_FULL, 0, 10}, | |
257 {TW_NEXT, 1, 0} | |
258 }; | |
259 | |
/*
 * Descriptor handed to the registration call: 10, the name "t1_10", the
 * twinstr table, &GENUS, and the tuple {48, 18, 54, 0}, whose first three
 * values equal the addition / multiplication / fused multiply-add counts
 * quoted in this branch's generated header comment.  The meaning of the
 * three trailing zeros is not visible in this file.
 */
260 static const ct_desc desc = { 10, "t1_10", twinstr, &GENUS, {48, 18, 54, 0}, 0, 0, 0 }; | |
261 | |
/*
 * Entry point of this file: passes the t1_10 kernel and its descriptor to
 * X(kdft_dit_register) for planner p.  X() is a macro defined elsewhere
 * (presumably a symbol-prefixing macro - confirm).
 */
262 void X(codelet_t1_10) (planner *p) { | |
263 X(kdft_dit_register) (p, t1_10, &desc); | |
264 } | |
265 #else | |
266 | |
267 /* Generated by: ../../../genfft/gen_twiddle.native -compact -variables 4 -pipeline-latency 4 -n 10 -name t1_10 -include dft/scalar/t.h */ | |
268 | |
269 /* | |
270 * This function contains 102 FP additions, 60 FP multiplications, | |
271 * (or, 72 additions, 30 multiplications, 30 fused multiply/add), | |
272 * 45 stack variables, 4 constants, and 40 memory accesses | |
273 */ | |
274 #include "dft/scalar/t.h" | |
275 | |
/*
 * t1_10 (variant compiled when no FMA preference macro is defined): in-place
 * pass over groups of ten points.
 *
 * ri, ii  - arrays read and then overwritten in place; point k of the current
 *           group lives at index WS(rs, k).  The arithmetic below treats
 *           (ri[k], ii[k]) as the real and imaginary parts of one value.
 * W       - table of multipliers; 18 values (nine pairs) are used per
 *           iteration, and the pointer is pre-advanced by mb * 18.
 * rs      - stride argument handed to WS() to locate the ten points.
 * mb, me  - the loop runs for m = mb, mb + 1, ..., me - 1.
 * ms      - added to ri and ii after every iteration.
 *
 * Per iteration: point 0 is taken as is, point k (k = 1..9) is combined with
 * the pair (W[2k-2], W[2k-1]), and the ten results are mixed and stored back
 * into the same ten slots.  All loads of a group happen before its stores.
 *
 * NOTE(review): the comments in this function assume FMA(a, b, c) = a*b + c
 * and FNMS(a, b, c) = c - a*b; the macro definitions are not visible in this
 * file - confirm against the included headers.
 */
276 static void t1_10(R *ri, R *ii, const R *W, stride rs, INT mb, INT me, INT ms) | |
277 { | |
/* Numerically: sin(pi/5), sin(2*pi/5), 1/4 and sqrt(5)/4. */
278 DK(KP587785252, +0.587785252292473129168705954639072768597652438); | |
279 DK(KP951056516, +0.951056516295153572116439333379382143405698634); | |
280 DK(KP250000000, +0.250000000000000000000000000000000000000000000); | |
281 DK(KP559016994, +0.559016994374947424102293417182819058860154590); | |
282 { | |
283 INT m; | |
/* MAKE_VOLATILE_STRIDE is defined elsewhere; its effect is not visible here. */
284 for (m = mb, W = W + (mb * 18); m < me; m = m + 1, ri = ri + ms, ii = ii + ms, W = W + 18, MAKE_VOLATILE_STRIDE(20, rs)) { | |
285 E T7, T1O, TT, T1C, TF, TQ, TR, T1o, T1p, T1y, TX, TY, TZ, T1d, T1g; | |
286 E T1M, Ti, Tt, Tu, T1r, T1s, T1x, TU, TV, TW, T16, T19, T1L; | |
/*
 * Points 0 and 5.  Point 5 is combined with (W[8], W[9]):
 *   T6  = W[8]*ri[5] + W[9]*ii[5],  T1A = W[8]*ii[5] - W[9]*ri[5].
 * The differences (T7, T1O) feed the odd-indexed outputs, the sums
 * (TT, T1C) feed the even-indexed outputs.
 */
287 { | |
288 E T1, T1B, T6, T1A; | |
289 T1 = ri[0]; | |
290 T1B = ii[0]; | |
291 { | |
292 E T3, T5, T2, T4; | |
293 T3 = ri[WS(rs, 5)]; | |
294 T5 = ii[WS(rs, 5)]; | |
295 T2 = W[8]; | |
296 T4 = W[9]; | |
297 T6 = FMA(T2, T3, T4 * T5); | |
298 T1A = FNMS(T4, T3, T2 * T5); | |
299 } | |
300 T7 = T1 - T6; | |
301 T1O = T1B - T1A; | |
302 TT = T1 + T6; | |
303 T1C = T1A + T1B; | |
304 } | |
/*
 * Points 4, 1, 9 and 6, each combined with its W pair in the same way as
 * point 5 above, followed by the pairwise sums and differences that the
 * output stages reuse.
 */
305 { | |
306 E Tz, T1b, TP, T1f, TE, T1c, TK, T1e; | |
307 { | |
308 E Tw, Ty, Tv, Tx; | |
309 Tw = ri[WS(rs, 4)]; | |
310 Ty = ii[WS(rs, 4)]; | |
311 Tv = W[6]; | |
312 Tx = W[7]; | |
313 Tz = FMA(Tv, Tw, Tx * Ty); | |
314 T1b = FNMS(Tx, Tw, Tv * Ty); | |
315 } | |
316 { | |
317 E TM, TO, TL, TN; | |
318 TM = ri[WS(rs, 1)]; | |
319 TO = ii[WS(rs, 1)]; | |
320 TL = W[0]; | |
321 TN = W[1]; | |
322 TP = FMA(TL, TM, TN * TO); | |
323 T1f = FNMS(TN, TM, TL * TO); | |
324 } | |
325 { | |
326 E TB, TD, TA, TC; | |
327 TB = ri[WS(rs, 9)]; | |
328 TD = ii[WS(rs, 9)]; | |
329 TA = W[16]; | |
330 TC = W[17]; | |
331 TE = FMA(TA, TB, TC * TD); | |
332 T1c = FNMS(TC, TB, TA * TD); | |
333 } | |
334 { | |
335 E TH, TJ, TG, TI; | |
336 TH = ri[WS(rs, 6)]; | |
337 TJ = ii[WS(rs, 6)]; | |
338 TG = W[10]; | |
339 TI = W[11]; | |
340 TK = FMA(TG, TH, TI * TJ); | |
341 T1e = FNMS(TI, TH, TG * TJ); | |
342 } | |
343 TF = Tz - TE; | |
344 TQ = TK - TP; | |
345 TR = TF + TQ; | |
346 T1o = T1b + T1c; | |
347 T1p = T1e + T1f; | |
348 T1y = T1o + T1p; | |
349 TX = Tz + TE; | |
350 TY = TK + TP; | |
351 TZ = TX + TY; | |
352 T1d = T1b - T1c; | |
353 T1g = T1e - T1f; | |
354 T1M = T1d + T1g; | |
355 } | |
/* Points 2, 3, 7 and 8: same treatment as the previous group. */
356 { | |
357 E Tc, T14, Ts, T18, Th, T15, Tn, T17; | |
358 { | |
359 E T9, Tb, T8, Ta; | |
360 T9 = ri[WS(rs, 2)]; | |
361 Tb = ii[WS(rs, 2)]; | |
362 T8 = W[2]; | |
363 Ta = W[3]; | |
364 Tc = FMA(T8, T9, Ta * Tb); | |
365 T14 = FNMS(Ta, T9, T8 * Tb); | |
366 } | |
367 { | |
368 E Tp, Tr, To, Tq; | |
369 Tp = ri[WS(rs, 3)]; | |
370 Tr = ii[WS(rs, 3)]; | |
371 To = W[4]; | |
372 Tq = W[5]; | |
373 Ts = FMA(To, Tp, Tq * Tr); | |
374 T18 = FNMS(Tq, Tp, To * Tr); | |
375 } | |
376 { | |
377 E Te, Tg, Td, Tf; | |
378 Te = ri[WS(rs, 7)]; | |
379 Tg = ii[WS(rs, 7)]; | |
380 Td = W[12]; | |
381 Tf = W[13]; | |
382 Th = FMA(Td, Te, Tf * Tg); | |
383 T15 = FNMS(Tf, Te, Td * Tg); | |
384 } | |
385 { | |
386 E Tk, Tm, Tj, Tl; | |
387 Tk = ri[WS(rs, 8)]; | |
388 Tm = ii[WS(rs, 8)]; | |
389 Tj = W[14]; | |
390 Tl = W[15]; | |
391 Tn = FMA(Tj, Tk, Tl * Tm); | |
392 T17 = FNMS(Tl, Tk, Tj * Tm); | |
393 } | |
394 Ti = Tc - Th; | |
395 Tt = Tn - Ts; | |
396 Tu = Ti + Tt; | |
397 T1r = T14 + T15; | |
398 T1s = T17 + T18; | |
399 T1x = T1r + T1s; | |
400 TU = Tc + Th; | |
401 TV = Tn + Ts; | |
402 TW = TU + TV; | |
403 T16 = T14 - T15; | |
404 T19 = T17 - T18; | |
405 T1L = T16 + T19; | |
406 } | |
/* ri outputs at odd indices 5, 7, 3, 9, 1 (built on the difference T7). */
407 { | |
408 E T11, TS, T12, T1i, T1k, T1a, T1h, T1j, T13; | |
409 T11 = KP559016994 * (Tu - TR); | |
410 TS = Tu + TR; | |
411 T12 = FNMS(KP250000000, TS, T7); | |
412 T1a = T16 - T19; | |
413 T1h = T1d - T1g; | |
414 T1i = FMA(KP951056516, T1a, KP587785252 * T1h); | |
415 T1k = FNMS(KP587785252, T1a, KP951056516 * T1h); | |
416 ri[WS(rs, 5)] = T7 + TS; | |
417 T1j = T12 - T11; | |
418 ri[WS(rs, 7)] = T1j - T1k; | |
419 ri[WS(rs, 3)] = T1j + T1k; | |
420 T13 = T11 + T12; | |
421 ri[WS(rs, 9)] = T13 - T1i; | |
422 ri[WS(rs, 1)] = T13 + T1i; | |
423 } | |
/* ii outputs at odd indices 5, 3, 7, 1, 9 (built on the difference T1O). */
424 { | |
425 E T1N, T1P, T1Q, T1U, T1W, T1S, T1T, T1V, T1R; | |
426 T1N = KP559016994 * (T1L - T1M); | |
427 T1P = T1L + T1M; | |
428 T1Q = FNMS(KP250000000, T1P, T1O); | |
429 T1S = Ti - Tt; | |
430 T1T = TF - TQ; | |
431 T1U = FMA(KP951056516, T1S, KP587785252 * T1T); | |
432 T1W = FNMS(KP587785252, T1S, KP951056516 * T1T); | |
433 ii[WS(rs, 5)] = T1P + T1O; | |
434 T1V = T1Q - T1N; | |
435 ii[WS(rs, 3)] = T1V - T1W; | |
436 ii[WS(rs, 7)] = T1W + T1V; | |
437 T1R = T1N + T1Q; | |
438 ii[WS(rs, 1)] = T1R - T1U; | |
439 ii[WS(rs, 9)] = T1U + T1R; | |
440 } | |
/* ri outputs at even indices 0, 4, 6, 2, 8 (built on the sum TT). */
441 { | |
442 E T1m, T10, T1l, T1u, T1w, T1q, T1t, T1v, T1n; | |
443 T1m = KP559016994 * (TW - TZ); | |
444 T10 = TW + TZ; | |
445 T1l = FNMS(KP250000000, T10, TT); | |
446 T1q = T1o - T1p; | |
447 T1t = T1r - T1s; | |
448 T1u = FNMS(KP587785252, T1t, KP951056516 * T1q); | |
449 T1w = FMA(KP951056516, T1t, KP587785252 * T1q); | |
450 ri[0] = TT + T10; | |
451 T1v = T1m + T1l; | |
452 ri[WS(rs, 4)] = T1v - T1w; | |
453 ri[WS(rs, 6)] = T1v + T1w; | |
454 T1n = T1l - T1m; | |
455 ri[WS(rs, 2)] = T1n - T1u; | |
456 ri[WS(rs, 8)] = T1n + T1u; | |
457 } | |
/* ii outputs at even indices 0, 4, 6, 2, 8 (built on the sum T1C). */
458 { | |
459 E T1H, T1z, T1G, T1F, T1J, T1D, T1E, T1K, T1I; | |
460 T1H = KP559016994 * (T1x - T1y); | |
461 T1z = T1x + T1y; | |
462 T1G = FNMS(KP250000000, T1z, T1C); | |
463 T1D = TX - TY; | |
464 T1E = TU - TV; | |
465 T1F = FNMS(KP587785252, T1E, KP951056516 * T1D); | |
466 T1J = FMA(KP951056516, T1E, KP587785252 * T1D); | |
467 ii[0] = T1z + T1C; | |
468 T1K = T1H + T1G; | |
469 ii[WS(rs, 4)] = T1J + T1K; | |
470 ii[WS(rs, 6)] = T1K - T1J; | |
471 T1I = T1G - T1H; | |
472 ii[WS(rs, 2)] = T1F + T1I; | |
473 ii[WS(rs, 8)] = T1I - T1F; | |
474 } | |
475 } | |
476 } | |
477 } | |
478 | |
/*
 * Two-entry tw_instr table referenced by desc below.
 * NOTE(review): TW_FULL / TW_NEXT are defined elsewhere; presumably the first
 * entry requests the full multiplier set for n = 10 and the second terminates
 * the list - confirm against the tw_instr definition.
 */
479 static const tw_instr twinstr[] = { | |
480 {TW_FULL, 0, 10}, | |
481 {TW_NEXT, 1, 0} | |
482 }; | |
483 | |
/*
 * Descriptor handed to the registration call: 10, the name "t1_10", the
 * twinstr table, &GENUS, and the tuple {72, 30, 30, 0}, whose first three
 * values equal the addition / multiplication / fused multiply-add counts
 * quoted in this branch's generated header comment.  The meaning of the
 * three trailing zeros is not visible in this file.
 */
484 static const ct_desc desc = { 10, "t1_10", twinstr, &GENUS, {72, 30, 30, 0}, 0, 0, 0 }; | |
485 | |
/*
 * Entry point of this file: passes the t1_10 kernel and its descriptor to
 * X(kdft_dit_register) for planner p.  X() is a macro defined elsewhere
 * (presumably a symbol-prefixing macro - confirm).
 */
486 void X(codelet_t1_10) (planner *p) { | |
487 X(kdft_dit_register) (p, t1_10, &desc); | |
488 } | |
489 #endif |