comparison src/fftw-3.3.3/dft/simd/common/t2fv_16.c @ 10:37bf6b4a2645

Add FFTW3
author Chris Cannam
date Wed, 20 Mar 2013 15:35:50 +0000
parents
children
comparison
equal deleted inserted replaced
9:c0fb53affa76 10:37bf6b4a2645
1 /*
2 * Copyright (c) 2003, 2007-11 Matteo Frigo
3 * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
4 *
5 * This program is free software; you can redistribute it and/or modify
6 * it under the terms of the GNU General Public License as published by
7 * the Free Software Foundation; either version 2 of the License, or
8 * (at your option) any later version.
9 *
10 * This program is distributed in the hope that it will be useful,
11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 * GNU General Public License for more details.
14 *
15 * You should have received a copy of the GNU General Public License
16 * along with this program; if not, write to the Free Software
17 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
18 *
19 */
20
21 /* This file was automatically generated --- DO NOT EDIT */
22 /* Generated on Sun Nov 25 07:38:35 EST 2012 */
23
24 #include "codelet-dft.h"
25
26 #ifdef HAVE_FMA
27
28 /* Generated by: ../../../genfft/gen_twiddle_c.native -fma -reorder-insns -schedule-for-pipeline -simd -compact -variables 4 -pipeline-latency 8 -n 16 -name t2fv_16 -include t2f.h */
29
30 /*
31 * This function contains 87 FP additions, 64 FP multiplications,
32 * (or, 53 additions, 30 multiplications, 34 fused multiply/add),
33 * 61 stack variables, 3 constants, and 32 memory accesses
34 */
35 #include "t2f.h"
36
/*
 * t2fv_16, HAVE_FMA build: generated size-16 twiddle codelet (the
 * "Generated by" comment earlier in this branch lists the genfft options).
 *
 * Visible behaviour, per loop iteration:
 *   - loads the 16 vectors x[WS(rs, k)], k = 0..15;
 *   - passes every input except k = 0 through BYTWJ, with the twiddle
 *     operand &W[TWVL * 2 * (k - 1)];
 *   - combines them with VADD/VSUB/VFMA/VFNMS and the three DVK constants;
 *   - stores 16 results back to the same x[WS(rs, k)] slots, the final
 *     combination being done with VADD/VSUB/VFMAI/VFNMSI.
 *
 * Parameters:
 *   ri      base pointer of the data; copied into x.
 *   ii      not referenced by name in this body.
 *   W       twiddle table; first advanced by mb * ((TWVL / VL) * 30), then
 *           by TWVL * 30 after each iteration.
 *   rs      stride handed to WS() to address the 16 points.
 *   mb, me  loop bounds: m starts at mb and runs while m < me, step VL.
 *   ms      x advances by VL * ms per iteration; also passed to LD and ST.
 *
 * NOTE(review): the exact semantics of LD, ST, BYTWJ, WS, VL, TWVL, VLEAVE
 * and MAKE_VOLATILE_STRIDE come from headers not visible here - confirm
 * against those definitions before relying on anything beyond the above.
 */
37 static void t2fv_16(R *ri, R *ii, const R *W, stride rs, INT mb, INT me, INT ms)
38 {
/* Numerically: cos(pi/8), tan(pi/8) = sqrt(2) - 1, and sqrt(1/2). */
39 DVK(KP923879532, +0.923879532511286756128183189396788286822416626);
40 DVK(KP414213562, +0.414213562373095048801688724209698078569671875);
41 DVK(KP707106781, +0.707106781186547524400844362104849039284835938);
42 {
43 INT m;
44 R *x;
45 x = ri;
/* The loop header also repositions W for the starting index mb. */
46 for (m = mb, W = W + (mb * ((TWVL / VL) * 30)); m < me; m = m + VL, x = x + (VL * ms), W = W + (TWVL * 30), MAKE_VOLATILE_STRIDE(16, rs)) {
47 V TO, Ta, TJ, TP, T14, Tq, T1i, T10, T1b, T1l, T13, T1c, TR, Tl, T15;
48 V Tv;
49 {
50 V Tc, TW, T4, T19, T9, TD, TI, Tj, TZ, T1a, Te, Th, Tn, Tr, Tu;
51 V Tp;
52 {
53 V T1, T2, T5, T7;
/* Even-indexed inputs (0, 8, 4, 12, 14, 10, 6, 2); LD's last argument is &(x[0]) for all of them. */
54 T1 = LD(&(x[0]), ms, &(x[0]));
55 T2 = LD(&(x[WS(rs, 8)]), ms, &(x[0]));
56 T5 = LD(&(x[WS(rs, 4)]), ms, &(x[0]));
57 T7 = LD(&(x[WS(rs, 12)]), ms, &(x[0]));
58 {
59 V Tz, TG, TB, TE;
60 Tz = LD(&(x[WS(rs, 14)]), ms, &(x[0]));
61 TG = LD(&(x[WS(rs, 10)]), ms, &(x[0]));
62 TB = LD(&(x[WS(rs, 6)]), ms, &(x[0]));
63 TE = LD(&(x[WS(rs, 2)]), ms, &(x[0]));
64 {
65 V Ti, TY, TX, Td, Tg, Tm, Tt, To;
66 {
67 V T3, T6, T8, TA, TH, TC, TF, Tb;
/* Odd-indexed inputs start here; LD's last argument is &(x[WS(rs, 1)]) for those. */
68 Tb = LD(&(x[WS(rs, 1)]), ms, &(x[WS(rs, 1)]));
/* Twiddle step: input k uses &W[TWVL * 2 * (k - 1)]; input 0 (T1) is used as loaded. */
69 T3 = BYTWJ(&(W[TWVL * 14]), T2);
70 T6 = BYTWJ(&(W[TWVL * 6]), T5);
71 T8 = BYTWJ(&(W[TWVL * 22]), T7);
72 TA = BYTWJ(&(W[TWVL * 26]), Tz);
73 TH = BYTWJ(&(W[TWVL * 18]), TG);
74 TC = BYTWJ(&(W[TWVL * 10]), TB);
75 TF = BYTWJ(&(W[TWVL * 2]), TE);
76 Tc = BYTWJ(&(W[0]), Tb);
/* Sum/difference pairs of the twiddled even-indexed inputs. */
77 TW = VSUB(T1, T3);
78 T4 = VADD(T1, T3);
79 T19 = VSUB(T6, T8);
80 T9 = VADD(T6, T8);
81 Ti = LD(&(x[WS(rs, 13)]), ms, &(x[WS(rs, 1)]));
82 TD = VADD(TA, TC);
83 TY = VSUB(TA, TC);
84 TI = VADD(TF, TH);
85 TX = VSUB(TF, TH);
86 }
87 Td = LD(&(x[WS(rs, 9)]), ms, &(x[WS(rs, 1)]));
88 Tg = LD(&(x[WS(rs, 5)]), ms, &(x[WS(rs, 1)]));
89 Tm = LD(&(x[WS(rs, 15)]), ms, &(x[WS(rs, 1)]));
90 Tj = BYTWJ(&(W[TWVL * 24]), Ti);
91 Tt = LD(&(x[WS(rs, 11)]), ms, &(x[WS(rs, 1)]));
92 To = LD(&(x[WS(rs, 7)]), ms, &(x[WS(rs, 1)]));
93 TZ = VADD(TX, TY);
94 T1a = VSUB(TY, TX);
95 Te = BYTWJ(&(W[TWVL * 16]), Td);
96 Th = BYTWJ(&(W[TWVL * 8]), Tg);
97 Tn = BYTWJ(&(W[TWVL * 28]), Tm);
98 Tr = LD(&(x[WS(rs, 3)]), ms, &(x[WS(rs, 1)]));
99 Tu = BYTWJ(&(W[TWVL * 20]), Tt);
100 Tp = BYTWJ(&(W[TWVL * 12]), To);
101 }
102 }
103 }
104 {
105 V Tf, T11, Tk, T12, Ts;
/* Intermediate combinations; the last twiddle (input 3, &W[TWVL * 4]) is applied in this scope. */
106 TO = VADD(T4, T9);
107 Ta = VSUB(T4, T9);
108 TJ = VSUB(TD, TI);
109 TP = VADD(TI, TD);
110 Tf = VADD(Tc, Te);
111 T11 = VSUB(Tc, Te);
112 Tk = VADD(Th, Tj);
113 T12 = VSUB(Th, Tj);
114 Ts = BYTWJ(&(W[TWVL * 4]), Tr);
115 T14 = VSUB(Tn, Tp);
116 Tq = VADD(Tn, Tp);
117 T1i = VFNMS(LDK(KP707106781), TZ, TW);
118 T10 = VFMA(LDK(KP707106781), TZ, TW);
119 T1b = VFNMS(LDK(KP707106781), T1a, T19);
120 T1l = VFMA(LDK(KP707106781), T1a, T19);
121 T13 = VFNMS(LDK(KP414213562), T12, T11);
122 T1c = VFMA(LDK(KP414213562), T11, T12);
123 TR = VADD(Tf, Tk);
124 Tl = VSUB(Tf, Tk);
125 T15 = VSUB(Tu, Ts);
126 Tv = VADD(Ts, Tu);
127 }
128 }
129 {
130 V T1d, T16, TS, Tw, TU, TQ;
131 T1d = VFMA(LDK(KP414213562), T14, T15);
132 T16 = VFNMS(LDK(KP414213562), T15, T14);
133 TS = VADD(Tq, Tv);
134 Tw = VSUB(Tq, Tv);
135 TU = VSUB(TO, TP);
136 TQ = VADD(TO, TP);
137 {
138 V T1e, T1j, T17, T1m;
139 T1e = VSUB(T1c, T1d);
140 T1j = VADD(T1c, T1d);
141 T17 = VADD(T13, T16);
142 T1m = VSUB(T16, T13);
143 {
144 V TV, TT, TK, Tx;
145 TV = VSUB(TS, TR);
146 TT = VADD(TR, TS);
147 TK = VSUB(Tw, Tl);
148 Tx = VADD(Tl, Tw);
149 {
150 V T1h, T1f, T1o, T1k;
151 T1h = VFMA(LDK(KP923879532), T1e, T1b);
152 T1f = VFNMS(LDK(KP923879532), T1e, T1b);
153 T1o = VFMA(LDK(KP923879532), T1j, T1i);
154 T1k = VFNMS(LDK(KP923879532), T1j, T1i);
155 {
156 V T1g, T18, T1p, T1n;
157 T1g = VFMA(LDK(KP923879532), T17, T10);
158 T18 = VFNMS(LDK(KP923879532), T17, T10);
159 T1p = VFMA(LDK(KP923879532), T1m, T1l);
160 T1n = VFNMS(LDK(KP923879532), T1m, T1l);
/* Outputs 12, 4, 0, 8 are written back over the input slots. */
161 ST(&(x[WS(rs, 12)]), VFNMSI(TV, TU), ms, &(x[0]));
162 ST(&(x[WS(rs, 4)]), VFMAI(TV, TU), ms, &(x[0]));
163 ST(&(x[0]), VADD(TQ, TT), ms, &(x[0]));
164 ST(&(x[WS(rs, 8)]), VSUB(TQ, TT), ms, &(x[0]));
165 {
166 V TN, TL, TM, Ty;
167 TN = VFMA(LDK(KP707106781), TK, TJ);
168 TL = VFNMS(LDK(KP707106781), TK, TJ);
169 TM = VFMA(LDK(KP707106781), Tx, Ta);
170 Ty = VFNMS(LDK(KP707106781), Tx, Ta);
/* Odd-indexed outputs (1, 15, 7, 9, 3, 13, 11, 5), then even outputs 14, 2, 10, 6. */
171 ST(&(x[WS(rs, 1)]), VFNMSI(T1h, T1g), ms, &(x[WS(rs, 1)]));
172 ST(&(x[WS(rs, 15)]), VFMAI(T1h, T1g), ms, &(x[WS(rs, 1)]));
173 ST(&(x[WS(rs, 7)]), VFMAI(T1f, T18), ms, &(x[WS(rs, 1)]));
174 ST(&(x[WS(rs, 9)]), VFNMSI(T1f, T18), ms, &(x[WS(rs, 1)]));
175 ST(&(x[WS(rs, 3)]), VFMAI(T1p, T1o), ms, &(x[WS(rs, 1)]));
176 ST(&(x[WS(rs, 13)]), VFNMSI(T1p, T1o), ms, &(x[WS(rs, 1)]));
177 ST(&(x[WS(rs, 11)]), VFMAI(T1n, T1k), ms, &(x[WS(rs, 1)]));
178 ST(&(x[WS(rs, 5)]), VFNMSI(T1n, T1k), ms, &(x[WS(rs, 1)]));
179 ST(&(x[WS(rs, 14)]), VFNMSI(TN, TM), ms, &(x[0]));
180 ST(&(x[WS(rs, 2)]), VFMAI(TN, TM), ms, &(x[0]));
181 ST(&(x[WS(rs, 10)]), VFMAI(TL, Ty), ms, &(x[0]));
182 ST(&(x[WS(rs, 6)]), VFNMSI(TL, Ty), ms, &(x[0]));
183 }
184 }
185 }
186 }
187 }
188 }
189 }
190 }
/* Runs once, after the loop has finished. */
191 VLEAVE();
192 }
193
/*
 * Twiddle instruction list referenced by desc (HAVE_FMA branch):
 * VTW(0, k) for k = 1..15, closed by a {TW_NEXT, VL, 0} entry.
 * NOTE(review): presumably one VTW entry per twiddled input of t2fv_16
 * (which makes 15 BYTWJ calls); the encoding of VTW and TW_NEXT is defined
 * in headers not visible here - confirm there.
 */
194 static const tw_instr twinstr[] = {
195 VTW(0, 1),
196 VTW(0, 2),
197 VTW(0, 3),
198 VTW(0, 4),
199 VTW(0, 5),
200 VTW(0, 6),
201 VTW(0, 7),
202 VTW(0, 8),
203 VTW(0, 9),
204 VTW(0, 10),
205 VTW(0, 11),
206 VTW(0, 12),
207 VTW(0, 13),
208 VTW(0, 14),
209 VTW(0, 15),
210 {TW_NEXT, VL, 0}
211 };
212
/*
 * Descriptor handed to kdft_dit_register for the HAVE_FMA build: 16, the
 * codelet name string, the twinstr table, &GENUS, and the counts
 * {53, 30, 34, 0}. The first three counts equal the "53 additions,
 * 30 multiplications, 34 fused multiply/add" figures in this branch's
 * generator comment.
 * NOTE(review): the leading 16 presumably is the transform size (genfft was
 * run with -n 16); the meaning of the fourth count and of the trailing
 * zeros is fixed by the ct_desc definition, not visible here.
 */
213 static const ct_desc desc = { 16, XSIMD_STRING("t2fv_16"), twinstr, &GENUS, {53, 30, 34, 0}, 0, 0, 0 };
214
/*
 * Registration entry point (HAVE_FMA build): passes the planner p, the
 * t2fv_16 kernel and its descriptor desc to X(kdft_dit_register).
 */
215 void XSIMD(codelet_t2fv_16) (planner *p) {
216 X(kdft_dit_register) (p, t2fv_16, &desc);
217 }
218 #else /* HAVE_FMA */
219
220 /* Generated by: ../../../genfft/gen_twiddle_c.native -simd -compact -variables 4 -pipeline-latency 8 -n 16 -name t2fv_16 -include t2f.h */
221
222 /*
223 * This function contains 87 FP additions, 42 FP multiplications,
224 * (or, 83 additions, 38 multiplications, 4 fused multiply/add),
225 * 36 stack variables, 3 constants, and 32 memory accesses
226 */
227 #include "t2f.h"
228
/*
 * t2fv_16, build without HAVE_FMA: generated size-16 twiddle codelet (the
 * "Generated by" comment earlier in this branch lists the genfft options).
 *
 * Visible behaviour, per loop iteration:
 *   - loads the 16 vectors x[WS(rs, k)], k = 0..15;
 *   - passes every input except k = 0 through BYTWJ, with the twiddle
 *     operand &W[TWVL * 2 * (k - 1)];
 *   - combines them with VADD/VSUB/VMUL/VFMA/VFNMS/VBYI and the three DVK
 *     constants;
 *   - stores 16 results back to the same x[WS(rs, k)] slots.
 *
 * Parameters:
 *   ri      base pointer of the data; copied into x.
 *   ii      not referenced by name in this body.
 *   W       twiddle table; first advanced by mb * ((TWVL / VL) * 30), then
 *           by TWVL * 30 after each iteration.
 *   rs      stride handed to WS() to address the 16 points.
 *   mb, me  loop bounds: m starts at mb and runs while m < me, step VL.
 *   ms      x advances by VL * ms per iteration; also passed to LD and ST.
 *
 * NOTE(review): the exact semantics of LD, ST, BYTWJ, WS, VL, TWVL, VLEAVE
 * and MAKE_VOLATILE_STRIDE come from headers not visible here - confirm
 * against those definitions before relying on anything beyond the above.
 */
229 static void t2fv_16(R *ri, R *ii, const R *W, stride rs, INT mb, INT me, INT ms)
230 {
/* Numerically: cos(pi/8), sin(pi/8), and sqrt(1/2). */
231 DVK(KP923879532, +0.923879532511286756128183189396788286822416626);
232 DVK(KP382683432, +0.382683432365089771728459984030398866761344562);
233 DVK(KP707106781, +0.707106781186547524400844362104849039284835938);
234 {
235 INT m;
236 R *x;
237 x = ri;
/* The loop header also repositions W for the starting index mb. */
238 for (m = mb, W = W + (mb * ((TWVL / VL) * 30)); m < me; m = m + VL, x = x + (VL * ms), W = W + (TWVL * 30), MAKE_VOLATILE_STRIDE(16, rs)) {
239 V TJ, T10, TD, T11, T1b, T1c, Ty, TK, T16, T17, T18, Tb, TN, T13, T14;
240 V T15, Tm, TM, TG, TI, TH;
/* Inputs 0 and 8: input 0 is used as loaded, input 8 is twiddled. */
241 TG = LD(&(x[0]), ms, &(x[0]));
242 TH = LD(&(x[WS(rs, 8)]), ms, &(x[0]));
243 TI = BYTWJ(&(W[TWVL * 14]), TH);
244 TJ = VSUB(TG, TI);
245 T10 = VADD(TG, TI);
246 {
247 V TA, TC, Tz, TB;
/* Inputs 4 and 12. */
248 Tz = LD(&(x[WS(rs, 4)]), ms, &(x[0]));
249 TA = BYTWJ(&(W[TWVL * 6]), Tz);
250 TB = LD(&(x[WS(rs, 12)]), ms, &(x[0]));
251 TC = BYTWJ(&(W[TWVL * 22]), TB);
252 TD = VSUB(TA, TC);
253 T11 = VADD(TA, TC);
254 }
255 {
256 V Tp, Tw, Tr, Tu, Ts, Tx;
257 {
258 V To, Tv, Tq, Tt;
/* Inputs 14, 10, 6, 2. */
259 To = LD(&(x[WS(rs, 14)]), ms, &(x[0]));
260 Tp = BYTWJ(&(W[TWVL * 26]), To);
261 Tv = LD(&(x[WS(rs, 10)]), ms, &(x[0]));
262 Tw = BYTWJ(&(W[TWVL * 18]), Tv);
263 Tq = LD(&(x[WS(rs, 6)]), ms, &(x[0]));
264 Tr = BYTWJ(&(W[TWVL * 10]), Tq);
265 Tt = LD(&(x[WS(rs, 2)]), ms, &(x[0]));
266 Tu = BYTWJ(&(W[TWVL * 2]), Tt);
267 }
268 T1b = VADD(Tp, Tr);
269 T1c = VADD(Tu, Tw);
270 Ts = VSUB(Tp, Tr);
271 Tx = VSUB(Tu, Tw);
272 Ty = VMUL(LDK(KP707106781), VSUB(Ts, Tx));
273 TK = VMUL(LDK(KP707106781), VADD(Tx, Ts));
274 }
275 {
276 V T2, T9, T4, T7, T5, Ta;
277 {
278 V T1, T8, T3, T6;
/* Inputs 15, 11, 7, 3; LD's last argument is &(x[WS(rs, 1)]) for odd indices. */
279 T1 = LD(&(x[WS(rs, 15)]), ms, &(x[WS(rs, 1)]));
280 T2 = BYTWJ(&(W[TWVL * 28]), T1);
281 T8 = LD(&(x[WS(rs, 11)]), ms, &(x[WS(rs, 1)]));
282 T9 = BYTWJ(&(W[TWVL * 20]), T8);
283 T3 = LD(&(x[WS(rs, 7)]), ms, &(x[WS(rs, 1)]));
284 T4 = BYTWJ(&(W[TWVL * 12]), T3);
285 T6 = LD(&(x[WS(rs, 3)]), ms, &(x[WS(rs, 1)]));
286 T7 = BYTWJ(&(W[TWVL * 4]), T6);
287 }
288 T16 = VADD(T2, T4);
289 T17 = VADD(T7, T9);
290 T18 = VSUB(T16, T17);
291 T5 = VSUB(T2, T4);
292 Ta = VSUB(T7, T9);
293 Tb = VFNMS(LDK(KP923879532), Ta, VMUL(LDK(KP382683432), T5));
294 TN = VFMA(LDK(KP923879532), T5, VMUL(LDK(KP382683432), Ta));
295 }
296 {
297 V Td, Tk, Tf, Ti, Tg, Tl;
298 {
299 V Tc, Tj, Te, Th;
/* Inputs 1, 13, 9, 5. */
300 Tc = LD(&(x[WS(rs, 1)]), ms, &(x[WS(rs, 1)]));
301 Td = BYTWJ(&(W[0]), Tc);
302 Tj = LD(&(x[WS(rs, 13)]), ms, &(x[WS(rs, 1)]));
303 Tk = BYTWJ(&(W[TWVL * 24]), Tj);
304 Te = LD(&(x[WS(rs, 9)]), ms, &(x[WS(rs, 1)]));
305 Tf = BYTWJ(&(W[TWVL * 16]), Te);
306 Th = LD(&(x[WS(rs, 5)]), ms, &(x[WS(rs, 1)]));
307 Ti = BYTWJ(&(W[TWVL * 8]), Th);
308 }
309 T13 = VADD(Td, Tf);
310 T14 = VADD(Ti, Tk);
311 T15 = VSUB(T13, T14);
312 Tg = VSUB(Td, Tf);
313 Tl = VSUB(Ti, Tk);
314 Tm = VFMA(LDK(KP382683432), Tg, VMUL(LDK(KP923879532), Tl));
315 TM = VFNMS(LDK(KP382683432), Tl, VMUL(LDK(KP923879532), Tg));
316 }
317 {
318 V T1a, T1g, T1f, T1h;
319 {
320 V T12, T19, T1d, T1e;
321 T12 = VSUB(T10, T11);
322 T19 = VMUL(LDK(KP707106781), VADD(T15, T18));
323 T1a = VADD(T12, T19);
324 T1g = VSUB(T12, T19);
325 T1d = VSUB(T1b, T1c);
326 T1e = VMUL(LDK(KP707106781), VSUB(T18, T15));
327 T1f = VBYI(VADD(T1d, T1e));
328 T1h = VBYI(VSUB(T1e, T1d));
329 }
/* Outputs 14, 6, 2, 10, written back over the input slots. */
330 ST(&(x[WS(rs, 14)]), VSUB(T1a, T1f), ms, &(x[0]));
331 ST(&(x[WS(rs, 6)]), VADD(T1g, T1h), ms, &(x[0]));
332 ST(&(x[WS(rs, 2)]), VADD(T1a, T1f), ms, &(x[0]));
333 ST(&(x[WS(rs, 10)]), VSUB(T1g, T1h), ms, &(x[0]));
334 }
335 {
336 V T1k, T1o, T1n, T1p;
337 {
338 V T1i, T1j, T1l, T1m;
339 T1i = VADD(T10, T11);
340 T1j = VADD(T1c, T1b);
341 T1k = VADD(T1i, T1j);
342 T1o = VSUB(T1i, T1j);
343 T1l = VADD(T13, T14);
344 T1m = VADD(T16, T17);
345 T1n = VADD(T1l, T1m);
346 T1p = VBYI(VSUB(T1m, T1l));
347 }
/* Outputs 8, 4, 0, 12. */
348 ST(&(x[WS(rs, 8)]), VSUB(T1k, T1n), ms, &(x[0]));
349 ST(&(x[WS(rs, 4)]), VADD(T1o, T1p), ms, &(x[0]));
350 ST(&(x[0]), VADD(T1k, T1n), ms, &(x[0]));
351 ST(&(x[WS(rs, 12)]), VSUB(T1o, T1p), ms, &(x[0]));
352 }
353 {
354 V TF, TQ, TP, TR;
355 {
356 V Tn, TE, TL, TO;
357 Tn = VSUB(Tb, Tm);
358 TE = VSUB(Ty, TD);
359 TF = VBYI(VSUB(Tn, TE));
360 TQ = VBYI(VADD(TE, Tn));
361 TL = VADD(TJ, TK);
362 TO = VADD(TM, TN);
363 TP = VSUB(TL, TO);
364 TR = VADD(TL, TO);
365 }
/* Outputs 7, 15, 9, 1. */
366 ST(&(x[WS(rs, 7)]), VADD(TF, TP), ms, &(x[WS(rs, 1)]));
367 ST(&(x[WS(rs, 15)]), VSUB(TR, TQ), ms, &(x[WS(rs, 1)]));
368 ST(&(x[WS(rs, 9)]), VSUB(TP, TF), ms, &(x[WS(rs, 1)]));
369 ST(&(x[WS(rs, 1)]), VADD(TQ, TR), ms, &(x[WS(rs, 1)]));
370 }
371 {
372 V TU, TY, TX, TZ;
373 {
374 V TS, TT, TV, TW;
375 TS = VSUB(TJ, TK);
376 TT = VADD(Tm, Tb);
377 TU = VADD(TS, TT);
378 TY = VSUB(TS, TT);
379 TV = VADD(TD, Ty);
380 TW = VSUB(TN, TM);
381 TX = VBYI(VADD(TV, TW));
382 TZ = VBYI(VSUB(TW, TV));
383 }
/* Outputs 13, 5, 3, 11. */
384 ST(&(x[WS(rs, 13)]), VSUB(TU, TX), ms, &(x[WS(rs, 1)]));
385 ST(&(x[WS(rs, 5)]), VADD(TY, TZ), ms, &(x[WS(rs, 1)]));
386 ST(&(x[WS(rs, 3)]), VADD(TU, TX), ms, &(x[WS(rs, 1)]));
387 ST(&(x[WS(rs, 11)]), VSUB(TY, TZ), ms, &(x[WS(rs, 1)]));
388 }
389 }
390 }
/* Runs once, after the loop has finished. */
391 VLEAVE();
392 }
393
/*
 * Twiddle instruction list referenced by desc (non-FMA branch):
 * VTW(0, k) for k = 1..15, closed by a {TW_NEXT, VL, 0} entry.
 * NOTE(review): presumably one VTW entry per twiddled input of t2fv_16
 * (which makes 15 BYTWJ calls); the encoding of VTW and TW_NEXT is defined
 * in headers not visible here - confirm there.
 */
394 static const tw_instr twinstr[] = {
395 VTW(0, 1),
396 VTW(0, 2),
397 VTW(0, 3),
398 VTW(0, 4),
399 VTW(0, 5),
400 VTW(0, 6),
401 VTW(0, 7),
402 VTW(0, 8),
403 VTW(0, 9),
404 VTW(0, 10),
405 VTW(0, 11),
406 VTW(0, 12),
407 VTW(0, 13),
408 VTW(0, 14),
409 VTW(0, 15),
410 {TW_NEXT, VL, 0}
411 };
412
/*
 * Descriptor handed to kdft_dit_register for the non-FMA build: 16, the
 * codelet name string, the twinstr table, &GENUS, and the counts
 * {83, 38, 4, 0}. The first three counts equal the "83 additions,
 * 38 multiplications, 4 fused multiply/add" figures in this branch's
 * generator comment.
 * NOTE(review): the leading 16 presumably is the transform size (genfft was
 * run with -n 16); the meaning of the fourth count and of the trailing
 * zeros is fixed by the ct_desc definition, not visible here.
 */
413 static const ct_desc desc = { 16, XSIMD_STRING("t2fv_16"), twinstr, &GENUS, {83, 38, 4, 0}, 0, 0, 0 };
414
/*
 * Registration entry point (non-FMA build): passes the planner p, the
 * t2fv_16 kernel and its descriptor desc to X(kdft_dit_register).
 */
415 void XSIMD(codelet_t2fv_16) (planner *p) {
416 X(kdft_dit_register) (p, t2fv_16, &desc);
417 }
418 #endif /* HAVE_FMA */