comparison src/fftw-3.3.3/rdft/simd/common/hc2cfdftv_12.c @ 10:37bf6b4a2645

Add FFTW3
author Chris Cannam
date Wed, 20 Mar 2013 15:35:50 +0000
parents
children
comparison
equal deleted inserted replaced
9:c0fb53affa76 10:37bf6b4a2645
1 /*
2 * Copyright (c) 2003, 2007-11 Matteo Frigo
3 * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
4 *
5 * This program is free software; you can redistribute it and/or modify
6 * it under the terms of the GNU General Public License as published by
7 * the Free Software Foundation; either version 2 of the License, or
8 * (at your option) any later version.
9 *
10 * This program is distributed in the hope that it will be useful,
11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 * GNU General Public License for more details.
14 *
15 * You should have received a copy of the GNU General Public License
16 * along with this program; if not, write to the Free Software
17 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
18 *
19 */
20
21 /* This file was automatically generated --- DO NOT EDIT */
22 /* Generated on Sun Nov 25 07:42:29 EST 2012 */
23
24 #include "codelet-rdft.h"
25
26 #ifdef HAVE_FMA
27
28 /* Generated by: ../../../genfft/gen_hc2cdft_c.native -fma -reorder-insns -schedule-for-pipeline -simd -compact -variables 4 -pipeline-latency 8 -trivial-stores -variables 32 -no-generate-bytw -n 12 -dit -name hc2cfdftv_12 -include hc2cfv.h */
29
30 /*
31 * This function contains 71 FP additions, 66 FP multiplications,
32 * (or, 41 additions, 36 multiplications, 30 fused multiply/add),
33 * 86 stack variables, 2 constants, and 24 memory accesses
34 */
35 #include "hc2cfv.h"
36
/*
 * hc2cfdftv_12, HAVE_FMA variant: the SIMD kernel that is registered at the
 * bottom of this variant with the HC2C_VIA_DFT flag.
 *
 * Each loop iteration
 *   - loads six vectors from Rp and six from Rm (offsets WS(rs, 0..5)),
 *   - loads eleven twiddle vectors from W (offsets TWVL * 0, 2, ..., 20),
 *   - combines every Rp/Rm pair with VFMACONJ and VFNMSCONJ; all of the
 *     combined values except T3 are then passed through VZMULJ / VZMULIJ
 *     together with one twiddle vector,
 *   - runs an add/subtract network that uses the constants 0.5 and
 *     0.866025403... (numerically sqrt(3)/2),
 *   - multiplies every result by 0.5 and stores six vectors to Rp and six
 *     to Rm; every value stored to Rm is wrapped in VCONJ.
 *
 * Ip and Im are only advanced in the loop header; the body never
 * dereferences them.
 *
 * LD, ST, LDW, LDK, DVK, the V* arithmetic helpers, WS, VL, TWVL,
 * MAKE_VOLATILE_STRIDE and VLEAVE are defined outside this file.
 */
37 static void hc2cfdftv_12(R *Rp, R *Ip, R *Rm, R *Im, const R *W, stride rs, INT mb, INT me, INT ms)
38 {
/* Constants used by the kernel: 0.866025403... and 0.5. */
39 DVK(KP866025403, +0.866025403784438646763723170752936183471402627);
40 DVK(KP500000000, +0.500000000000000000000000000000000000000000000);
41 {
42 INT m;
/*
 * W is first offset by (mb - 1) * ((TWVL / VL) * 22). Per iteration m
 * advances by VL, Rp/Ip move forward and Rm/Im move backward by VL * ms,
 * and W advances by TWVL * 22.
 */
43 for (m = mb, W = W + ((mb - 1) * ((TWVL / VL) * 22)); m < me; m = m + VL, Rp = Rp + (VL * ms), Ip = Ip + (VL * ms), Rm = Rm - (VL * ms), Im = Im - (VL * ms), W = W + (TWVL * 22), MAKE_VOLATILE_STRIDE(48, rs)) {
44 V T3, T7, TH, TE, Th, TC, Tq, T11, TU, Tx, Tb, Tz, Tu, Tw, Tp;
45 V Tl, T9, Ta, T8, Ty, Tn, To, Tm, TG, T1, T2, Tt, T5, T6, T4;
46 V Tv, Tj, Tk, Ti, TD, Tf, Tg, Te, TB, TT, TF, TR, Tr;
/* Input pairs 0, 2 and 1, interleaved with the twiddle loads they need. */
47 T1 = LD(&(Rp[0]), ms, &(Rp[0]));
48 T2 = LD(&(Rm[0]), -ms, &(Rm[0]));
49 Tt = LDW(&(W[0]));
50 T5 = LD(&(Rp[WS(rs, 2)]), ms, &(Rp[0]));
51 T6 = LD(&(Rm[WS(rs, 2)]), -ms, &(Rm[0]));
52 T4 = LDW(&(W[TWVL * 6]));
53 Tv = LDW(&(W[TWVL * 8]));
54 Tn = LD(&(Rp[WS(rs, 1)]), ms, &(Rp[WS(rs, 1)]));
55 To = LD(&(Rm[WS(rs, 1)]), -ms, &(Rm[WS(rs, 1)]));
/* T3 is the only combined value that is not multiplied by a twiddle. */
56 T3 = VFMACONJ(T2, T1);
57 Tu = VZMULIJ(Tt, VFNMSCONJ(T2, T1));
58 Tm = LDW(&(W[TWVL * 2]));
59 TG = LDW(&(W[TWVL * 4]));
60 T7 = VZMULJ(T4, VFMACONJ(T6, T5));
61 Tw = VZMULIJ(Tv, VFNMSCONJ(T6, T5));
/* Input pair 5. */
62 Tj = LD(&(Rp[WS(rs, 5)]), ms, &(Rp[WS(rs, 1)]));
63 Tk = LD(&(Rm[WS(rs, 5)]), -ms, &(Rm[WS(rs, 1)]));
64 Ti = LDW(&(W[TWVL * 18]));
65 TD = LDW(&(W[TWVL * 20]));
66 Tp = VZMULJ(Tm, VFMACONJ(To, Tn));
67 TH = VZMULIJ(TG, VFNMSCONJ(To, Tn));
/* Input pair 3. */
68 Tf = LD(&(Rp[WS(rs, 3)]), ms, &(Rp[WS(rs, 1)]));
69 Tg = LD(&(Rm[WS(rs, 3)]), -ms, &(Rm[WS(rs, 1)]));
70 Te = LDW(&(W[TWVL * 10]));
71 TB = LDW(&(W[TWVL * 12]));
72 Tl = VZMULJ(Ti, VFMACONJ(Tk, Tj));
73 TE = VZMULIJ(TD, VFNMSCONJ(Tk, Tj));
/* Input pair 4. */
74 T9 = LD(&(Rp[WS(rs, 4)]), ms, &(Rp[0]));
75 Ta = LD(&(Rm[WS(rs, 4)]), -ms, &(Rm[0]));
76 T8 = LDW(&(W[TWVL * 14]));
77 Ty = LDW(&(W[TWVL * 16]));
78 Th = VZMULJ(Te, VFMACONJ(Tg, Tf));
79 TC = VZMULIJ(TB, VFNMSCONJ(Tg, Tf));
/* First layer of sums and differences over the twiddled values. */
80 Tq = VADD(Tl, Tp);
81 T11 = VSUB(Tp, Tl);
82 TU = VSUB(Tu, Tw);
83 Tx = VADD(Tu, Tw);
84 Tb = VZMULJ(T8, VFMACONJ(Ta, T9));
85 Tz = VZMULIJ(Ty, VFNMSCONJ(Ta, T9));
86 TT = VSUB(TC, TE);
87 TF = VADD(TC, TE);
88 TR = VFNMS(LDK(KP500000000), Tq, Th);
89 Tr = VADD(Th, Tq);
90 {
91 V TX, TA, T1d, TV, TY, TI, T1e, T12, TQ, Td, T10, Tc, T1a, TN, TJ;
92 V T1j, T1f, T1b, TS, TM, Ts, T17, T13, TZ, T1i, T1c, T16, TW, TP, TO;
93 V TL, TK, T1k, T1l, T1h, T1g, T18, T19, T15, T14;
/* Remaining add/subtract layers; the 0.866025403... constant enters here. */
94 T10 = VSUB(Tb, T7);
95 Tc = VADD(T7, Tb);
96 TX = VFNMS(LDK(KP500000000), Tx, Tz);
97 TA = VADD(Tx, Tz);
98 T1d = VADD(TU, TT);
99 TV = VSUB(TT, TU);
100 TY = VFNMS(LDK(KP500000000), TF, TH);
101 TI = VADD(TF, TH);
102 T1e = VADD(T10, T11);
103 T12 = VSUB(T10, T11);
104 TQ = VFNMS(LDK(KP500000000), Tc, T3);
105 Td = VADD(T3, Tc);
106 T1a = VADD(TX, TY);
107 TZ = VSUB(TX, TY);
108 TN = VADD(TA, TI);
109 TJ = VSUB(TA, TI);
110 T1j = VMUL(LDK(KP866025403), VADD(T1d, T1e));
111 T1f = VMUL(LDK(KP866025403), VSUB(T1d, T1e));
112 T1b = VADD(TQ, TR);
113 TS = VSUB(TQ, TR);
114 TM = VADD(Td, Tr);
115 Ts = VSUB(Td, Tr);
116 T17 = VFMA(LDK(KP866025403), T12, TZ);
117 T13 = VFNMS(LDK(KP866025403), T12, TZ);
118 T1i = VSUB(T1b, T1a);
119 T1c = VADD(T1a, T1b);
120 T16 = VFNMS(LDK(KP866025403), TV, TS);
121 TW = VFMA(LDK(KP866025403), TV, TS);
/*
 * Final values: each one is multiplied by 0.5, and the ones destined for
 * Rm are additionally wrapped in VCONJ.
 */
122 TP = VCONJ(VMUL(LDK(KP500000000), VADD(TN, TM)));
123 TO = VMUL(LDK(KP500000000), VSUB(TM, TN));
124 TL = VCONJ(VMUL(LDK(KP500000000), VFNMSI(TJ, Ts)));
125 TK = VMUL(LDK(KP500000000), VFMAI(TJ, Ts));
126 T1k = VCONJ(VMUL(LDK(KP500000000), VFNMSI(T1j, T1i)));
127 T1l = VMUL(LDK(KP500000000), VFMAI(T1j, T1i));
128 T1h = VMUL(LDK(KP500000000), VFMAI(T1f, T1c));
129 T1g = VCONJ(VMUL(LDK(KP500000000), VFNMSI(T1f, T1c)));
130 T18 = VMUL(LDK(KP500000000), VFNMSI(T17, T16));
131 T19 = VCONJ(VMUL(LDK(KP500000000), VFMAI(T17, T16)));
132 T15 = VCONJ(VMUL(LDK(KP500000000), VFMAI(T13, TW)));
133 T14 = VMUL(LDK(KP500000000), VFNMSI(T13, TW));
/* Twelve stores: Rp[WS(rs, 0..5)] and Rm[WS(rs, 0..5)], each written once. */
134 ST(&(Rm[WS(rs, 5)]), TP, -ms, &(Rm[WS(rs, 1)]));
135 ST(&(Rp[0]), TO, ms, &(Rp[0]));
136 ST(&(Rm[WS(rs, 2)]), TL, -ms, &(Rm[0]));
137 ST(&(Rp[WS(rs, 3)]), TK, ms, &(Rp[WS(rs, 1)]));
138 ST(&(Rm[WS(rs, 3)]), T1k, -ms, &(Rm[WS(rs, 1)]));
139 ST(&(Rp[WS(rs, 4)]), T1l, ms, &(Rp[0]));
140 ST(&(Rp[WS(rs, 2)]), T1h, ms, &(Rp[0]));
141 ST(&(Rm[WS(rs, 1)]), T1g, -ms, &(Rm[WS(rs, 1)]));
142 ST(&(Rp[WS(rs, 5)]), T18, ms, &(Rp[WS(rs, 1)]));
143 ST(&(Rm[WS(rs, 4)]), T19, -ms, &(Rm[0]));
144 ST(&(Rm[0]), T15, -ms, &(Rm[0]));
145 ST(&(Rp[WS(rs, 1)]), T14, ms, &(Rp[WS(rs, 1)]));
146 }
147 }
148 }
149 VLEAVE();
150 }
151
/*
 * Twiddle instruction table for the HAVE_FMA variant: eleven VTW(1, k)
 * entries for k = 1..11, terminated by a {TW_NEXT, VL, 0} record.
 * It is referenced by the desc initializer of this variant.
 */
152 static const tw_instr twinstr[] = {
153 VTW(1, 1),
154 VTW(1, 2),
155 VTW(1, 3),
156 VTW(1, 4),
157 VTW(1, 5),
158 VTW(1, 6),
159 VTW(1, 7),
160 VTW(1, 8),
161 VTW(1, 9),
162 VTW(1, 10),
163 VTW(1, 11),
164 {TW_NEXT, VL, 0}
165 };
166
/*
 * Codelet descriptor for the HAVE_FMA variant: the value 12 (presumably the
 * transform size, matching "-n 12" in the generator command line), the
 * codelet name string, the twiddle table, &GENUS, and the counts
 * {41, 36, 30, 0}, which repeat the "41 additions, 36 multiplications,
 * 30 fused multiply/add" figures from this variant's header comment.
 */
167 static const hc2c_desc desc = { 12, XSIMD_STRING("hc2cfdftv_12"), twinstr, &GENUS, {41, 36, 30, 0} };
168
/*
 * Registration entry point (HAVE_FMA variant): hands the kernel
 * hc2cfdftv_12, its descriptor and the HC2C_VIA_DFT flag to
 * X(khc2c_register) for the planner p.
 */
169 void XSIMD(codelet_hc2cfdftv_12) (planner *p) {
170 X(khc2c_register) (p, hc2cfdftv_12, &desc, HC2C_VIA_DFT);
171 }
172 #else /* HAVE_FMA */
173
174 /* Generated by: ../../../genfft/gen_hc2cdft_c.native -simd -compact -variables 4 -pipeline-latency 8 -trivial-stores -variables 32 -no-generate-bytw -n 12 -dit -name hc2cfdftv_12 -include hc2cfv.h */
175
176 /*
177 * This function contains 71 FP additions, 41 FP multiplications,
178 * (or, 67 additions, 37 multiplications, 4 fused multiply/add),
179 * 58 stack variables, 4 constants, and 24 memory accesses
180 */
181 #include "hc2cfv.h"
182
/*
 * hc2cfdftv_12, non-FMA variant: same signature, loads and stores as the
 * HAVE_FMA kernel above, but written with explicit VCONJ / VADD / VSUB /
 * VBYI steps.
 *
 * Each loop iteration
 *   - loads six vectors from Rp and six from Rm (offsets WS(rs, 0..5)),
 *   - conjugates each Rm vector with VCONJ, then forms the sum
 *     Rp + conj(Rm) and the difference conj(Rm) - Rp of every pair,
 *   - passes the sums (except T4, the sum of pair 0) through VZMULJ and the
 *     differences through VZMULIJ, each with one of eleven twiddle vectors
 *     loaded from W (offsets TWVL * 0, 2, ..., 20),
 *   - combines the results and stores six vectors to Rp and six to Rm;
 *     every value stored to Rm is wrapped in VCONJ.
 *
 * Besides 0.5 and 0.866025403..., this variant uses 0.25 (= 0.5 * 0.5) and
 * 0.433012701... (= 0.5 * 0.866025403...), so some of the halving is
 * applied to intermediate terms instead of to each final value.
 *
 * Ip and Im are only advanced in the loop header; the body never
 * dereferences them.
 */
183 static void hc2cfdftv_12(R *Rp, R *Ip, R *Rm, R *Im, const R *W, stride rs, INT mb, INT me, INT ms)
184 {
185 DVK(KP433012701, +0.433012701892219323381861585376468091735701313);
186 DVK(KP866025403, +0.866025403784438646763723170752936183471402627);
187 DVK(KP250000000, +0.250000000000000000000000000000000000000000000);
188 DVK(KP500000000, +0.500000000000000000000000000000000000000000000);
189 {
190 INT m;
/*
 * W is first offset by (mb - 1) * ((TWVL / VL) * 22). Per iteration m
 * advances by VL, Rp/Ip move forward and Rm/Im move backward by VL * ms,
 * and W advances by TWVL * 22.
 */
191 for (m = mb, W = W + ((mb - 1) * ((TWVL / VL) * 22)); m < me; m = m + VL, Rp = Rp + (VL * ms), Ip = Ip + (VL * ms), Rm = Rm - (VL * ms), Im = Im - (VL * ms), W = W + (TWVL * 22), MAKE_VOLATILE_STRIDE(48, rs)) {
/* Values that are produced by the first inner block and used by the second. */
192 V TX, T13, T4, Tf, TZ, TD, TF, T17, TW, T14, Tw, Tl, T10, TL, TN;
193 V T16;
/* First inner block: loads, conjugation, twiddle multiplies, first sums. */
194 {
195 V T1, T3, TA, Tb, Td, Te, T9, TC, T2, Tz, Tc, Ta, T6, T8, T7;
196 V T5, TB, TE, Ti, Tk, TI, Ts, Tu, Tv, Tq, TK, Tj, TH, Tt, Tr;
197 V Tn, Tp, To, Tm, TJ, Th, TM;
/* Pair 0: only the difference is twiddled here; the sum T4 is formed below. */
198 T1 = LD(&(Rp[0]), ms, &(Rp[0]));
199 T2 = LD(&(Rm[0]), -ms, &(Rm[0]));
200 T3 = VCONJ(T2);
201 Tz = LDW(&(W[0]));
202 TA = VZMULIJ(Tz, VSUB(T3, T1));
/* Pair 4: twiddled sum. */
203 Tb = LD(&(Rp[WS(rs, 4)]), ms, &(Rp[0]));
204 Tc = LD(&(Rm[WS(rs, 4)]), -ms, &(Rm[0]));
205 Td = VCONJ(Tc);
206 Ta = LDW(&(W[TWVL * 14]));
207 Te = VZMULJ(Ta, VADD(Tb, Td));
/* Pair 2: twiddled sum and twiddled difference. */
208 T6 = LD(&(Rp[WS(rs, 2)]), ms, &(Rp[0]));
209 T7 = LD(&(Rm[WS(rs, 2)]), -ms, &(Rm[0]));
210 T8 = VCONJ(T7);
211 T5 = LDW(&(W[TWVL * 6]));
212 T9 = VZMULJ(T5, VADD(T6, T8));
213 TB = LDW(&(W[TWVL * 8]));
214 TC = VZMULIJ(TB, VSUB(T8, T6));
215 TX = VSUB(TC, TA);
216 T13 = VSUB(Te, T9);
217 T4 = VADD(T1, T3);
218 Tf = VADD(T9, Te);
/* TZ = 0.5 * T4 - 0.25 * Tf, assuming VFNMS(a, b, c) computes c - a * b (defined outside this file). */
219 TZ = VFNMS(LDK(KP250000000), Tf, VMUL(LDK(KP500000000), T4));
220 TD = VADD(TA, TC);
/* Pair 4: twiddled difference. */
221 TE = LDW(&(W[TWVL * 16]));
222 TF = VZMULIJ(TE, VSUB(Td, Tb));
223 T17 = VFNMS(LDK(KP500000000), TD, TF);
/* Pair 3: twiddled difference; its twiddled sum Tl is formed further down. */
224 Ti = LD(&(Rp[WS(rs, 3)]), ms, &(Rp[WS(rs, 1)]));
225 Tj = LD(&(Rm[WS(rs, 3)]), -ms, &(Rm[WS(rs, 1)]));
226 Tk = VCONJ(Tj);
227 TH = LDW(&(W[TWVL * 12]));
228 TI = VZMULIJ(TH, VSUB(Tk, Ti));
/* Pair 1: twiddled sum; its twiddled difference TN is formed further down. */
229 Ts = LD(&(Rp[WS(rs, 1)]), ms, &(Rp[WS(rs, 1)]));
230 Tt = LD(&(Rm[WS(rs, 1)]), -ms, &(Rm[WS(rs, 1)]));
231 Tu = VCONJ(Tt);
232 Tr = LDW(&(W[TWVL * 2]));
233 Tv = VZMULJ(Tr, VADD(Ts, Tu));
/* Pair 5: twiddled sum and twiddled difference. */
234 Tn = LD(&(Rp[WS(rs, 5)]), ms, &(Rp[WS(rs, 1)]));
235 To = LD(&(Rm[WS(rs, 5)]), -ms, &(Rm[WS(rs, 1)]));
236 Tp = VCONJ(To);
237 Tm = LDW(&(W[TWVL * 18]));
238 Tq = VZMULJ(Tm, VADD(Tn, Tp));
239 TJ = LDW(&(W[TWVL * 20]));
240 TK = VZMULIJ(TJ, VSUB(Tp, Tn));
241 TW = VSUB(TK, TI);
242 T14 = VSUB(Tv, Tq);
243 Tw = VADD(Tq, Tv);
244 Th = LDW(&(W[TWVL * 10]));
245 Tl = VZMULJ(Th, VADD(Ti, Tk));
246 T10 = VFNMS(LDK(KP250000000), Tw, VMUL(LDK(KP500000000), Tl));
247 TL = VADD(TI, TK);
248 TM = LDW(&(W[TWVL * 4]));
249 TN = VZMULIJ(TM, VSUB(Tu, Ts));
250 T16 = VFNMS(LDK(KP500000000), TL, TN);
251 }
/* Second inner block: final combinations, interleaved with the twelve stores. */
252 {
253 V Ty, TS, TP, TT, Tg, Tx, TG, TO, TQ, TV, TR, TU, T1i, T1o, T1l;
254 V T1p, T1g, T1h, T1j, T1k, T1m, T1r, T1n, T1q, T12, T1c, T19, T1d, TY, T11;
255 V T1a, T1f, T1b, T1e;
/* Outputs Rm[5], Rp[3], Rp[0], Rm[2]: built from plain sums, each halved explicitly. */
256 Tg = VADD(T4, Tf);
257 Tx = VADD(Tl, Tw);
258 Ty = VADD(Tg, Tx);
259 TS = VSUB(Tg, Tx);
260 TG = VADD(TD, TF);
261 TO = VADD(TL, TN);
262 TP = VADD(TG, TO);
263 TT = VBYI(VSUB(TO, TG));
264 TQ = VCONJ(VMUL(LDK(KP500000000), VSUB(Ty, TP)));
265 ST(&(Rm[WS(rs, 5)]), TQ, -ms, &(Rm[WS(rs, 1)]));
266 TV = VMUL(LDK(KP500000000), VADD(TS, TT));
267 ST(&(Rp[WS(rs, 3)]), TV, ms, &(Rp[WS(rs, 1)]));
268 TR = VMUL(LDK(KP500000000), VADD(Ty, TP));
269 ST(&(Rp[0]), TR, ms, &(Rp[0]));
270 TU = VCONJ(VMUL(LDK(KP500000000), VSUB(TS, TT)));
271 ST(&(Rm[WS(rs, 2)]), TU, -ms, &(Rm[0]));
/* Outputs Rp[2], Rm[3], Rm[1], Rp[4]: use the pre-halved TZ and T10 terms. */
272 T1g = VADD(TX, TW);
273 T1h = VADD(T13, T14);
274 T1i = VMUL(LDK(KP500000000), VBYI(VMUL(LDK(KP866025403), VSUB(T1g, T1h))));
275 T1o = VMUL(LDK(KP500000000), VBYI(VMUL(LDK(KP866025403), VADD(T1g, T1h))));
276 T1j = VADD(TZ, T10);
277 T1k = VMUL(LDK(KP500000000), VADD(T17, T16));
278 T1l = VSUB(T1j, T1k);
279 T1p = VADD(T1j, T1k);
280 T1m = VADD(T1i, T1l);
281 ST(&(Rp[WS(rs, 2)]), T1m, ms, &(Rp[0]));
282 T1r = VCONJ(VSUB(T1p, T1o));
283 ST(&(Rm[WS(rs, 3)]), T1r, -ms, &(Rm[WS(rs, 1)]));
284 T1n = VCONJ(VSUB(T1l, T1i));
285 ST(&(Rm[WS(rs, 1)]), T1n, -ms, &(Rm[WS(rs, 1)]));
286 T1q = VADD(T1o, T1p);
287 ST(&(Rp[WS(rs, 4)]), T1q, ms, &(Rp[0]));
/* Outputs Rm[0], Rm[4], Rp[1], Rp[5]: the 0.433012701... constant enters here. */
288 TY = VMUL(LDK(KP433012701), VSUB(TW, TX));
289 T11 = VSUB(TZ, T10);
290 T12 = VADD(TY, T11);
291 T1c = VSUB(T11, TY);
292 T15 = VMUL(LDK(KP866025403), VSUB(T13, T14));
293 T18 = VSUB(T16, T17);
294 T19 = VMUL(LDK(KP500000000), VBYI(VSUB(T15, T18)));
295 T1d = VMUL(LDK(KP500000000), VBYI(VADD(T15, T18)));
296 T1a = VCONJ(VSUB(T12, T19));
297 ST(&(Rm[0]), T1a, -ms, &(Rm[0]));
298 T1f = VCONJ(VADD(T1c, T1d));
299 ST(&(Rm[WS(rs, 4)]), T1f, -ms, &(Rm[0]));
300 T1b = VADD(T12, T19);
301 ST(&(Rp[WS(rs, 1)]), T1b, ms, &(Rp[WS(rs, 1)]));
302 T1e = VSUB(T1c, T1d);
303 ST(&(Rp[WS(rs, 5)]), T1e, ms, &(Rp[WS(rs, 1)]));
304 }
305 }
306 }
307 VLEAVE();
308 }
309
/*
 * Twiddle instruction table for the non-FMA variant: eleven VTW(1, k)
 * entries for k = 1..11, terminated by a {TW_NEXT, VL, 0} record.
 * It is referenced by the desc initializer of this variant.
 */
310 static const tw_instr twinstr[] = {
311 VTW(1, 1),
312 VTW(1, 2),
313 VTW(1, 3),
314 VTW(1, 4),
315 VTW(1, 5),
316 VTW(1, 6),
317 VTW(1, 7),
318 VTW(1, 8),
319 VTW(1, 9),
320 VTW(1, 10),
321 VTW(1, 11),
322 {TW_NEXT, VL, 0}
323 };
324
/*
 * Codelet descriptor for the non-FMA variant: the value 12 (presumably the
 * transform size, matching "-n 12" in the generator command line), the
 * codelet name string, the twiddle table, &GENUS, and the counts
 * {67, 37, 4, 0}, which repeat the "67 additions, 37 multiplications,
 * 4 fused multiply/add" figures from this variant's header comment.
 */
325 static const hc2c_desc desc = { 12, XSIMD_STRING("hc2cfdftv_12"), twinstr, &GENUS, {67, 37, 4, 0} };
326
/*
 * Registration entry point (non-FMA variant): hands the kernel
 * hc2cfdftv_12, its descriptor and the HC2C_VIA_DFT flag to
 * X(khc2c_register) for the planner p.
 */
327 void XSIMD(codelet_hc2cfdftv_12) (planner *p) {
328 X(khc2c_register) (p, hc2cfdftv_12, &desc, HC2C_VIA_DFT);
329 }
330 #endif /* HAVE_FMA */