Mercurial > hg > sv-dependency-builds
comparison src/fftw-3.3.3/rdft/simd/common/hc2cfdftv_12.c @ 10:37bf6b4a2645
Add FFTW3
author | Chris Cannam |
---|---|
date | Wed, 20 Mar 2013 15:35:50 +0000 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
9:c0fb53affa76 | 10:37bf6b4a2645 |
---|---|
1 /* | |
2 * Copyright (c) 2003, 2007-11 Matteo Frigo | |
3 * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology | |
4 * | |
5 * This program is free software; you can redistribute it and/or modify | |
6 * it under the terms of the GNU General Public License as published by | |
7 * the Free Software Foundation; either version 2 of the License, or | |
8 * (at your option) any later version. | |
9 * | |
10 * This program is distributed in the hope that it will be useful, | |
11 * but WITHOUT ANY WARRANTY; without even the implied warranty of | |
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | |
13 * GNU General Public License for more details. | |
14 * | |
15 * You should have received a copy of the GNU General Public License | |
16 * along with this program; if not, write to the Free Software | |
17 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA | |
18 * | |
19 */ | |
20 | |
21 /* This file was automatically generated --- DO NOT EDIT */ | |
22 /* Generated on Sun Nov 25 07:42:29 EST 2012 */ | |
23 | |
24 #include "codelet-rdft.h" | |
25 | |
26 #ifdef HAVE_FMA | |
27 | |
28 /* Generated by: ../../../genfft/gen_hc2cdft_c.native -fma -reorder-insns -schedule-for-pipeline -simd -compact -variables 4 -pipeline-latency 8 -trivial-stores -variables 32 -no-generate-bytw -n 12 -dit -name hc2cfdftv_12 -include hc2cfv.h */ | |
29 | |
30 /* | |
31 * This function contains 71 FP additions, 66 FP multiplications, | |
32 * (or, 41 additions, 36 multiplications, 30 fused multiply/add), | |
33 * 86 stack variables, 2 constants, and 24 memory accesses | |
34 */ | |
35 #include "hc2cfv.h" | |
36 | |
/*
 * hc2cfdftv_12 (HAVE_FMA variant): generated SIMD codelet of size 12
 * (genfft options "-n 12 -dit", see the "Generated by" comment above).
 *
 * Parameters, as used by the code below:
 *   Rp, Rm - data arrays; six vectors are loaded from each (index 0 and
 *            WS(rs, 1..5)) and six vectors are stored back to the same
 *            indices of each.
 *   Ip, Im - stepped by the loop header but never dereferenced here.
 *   W      - twiddle table, read through LDW at offsets TWVL * {0, 2, ..., 20}.
 *   rs     - stride handed to WS() to form the element indices.
 *   mb, me - the loop runs m = mb, mb + VL, ... while m < me.
 *   ms     - per-iteration pointer step (scaled by VL) and the stride
 *            argument passed to LD / ST (negated for Rm).
 */
37 static void hc2cfdftv_12(R *Rp, R *Ip, R *Rm, R *Im, const R *W, stride rs, INT mb, INT me, INT ms) | |
38 { | |
/* Vector constants: 0.866025... (numerically sqrt(3)/2) and 0.5. */
39 DVK(KP866025403, +0.866025403784438646763723170752936183471402627); | |
40 DVK(KP500000000, +0.500000000000000000000000000000000000000000000); | |
41 { | |
42 INT m; | |
/* Skip (mb - 1) * ((TWVL / VL) * 22) entries of W up front; then, per
 * iteration, move Rp/Ip forward and Rm/Im backward by VL * ms and advance
 * W by TWVL * 22.  MAKE_VOLATILE_STRIDE(48, rs) is evaluated as part of
 * every loop step. */
43 for (m = mb, W = W + ((mb - 1) * ((TWVL / VL) * 22)); m < me; m = m + VL, Rp = Rp + (VL * ms), Ip = Ip + (VL * ms), Rm = Rm - (VL * ms), Im = Im - (VL * ms), W = W + (TWVL * 22), MAKE_VOLATILE_STRIDE(48, rs)) { | |
44 V T3, T7, TH, TE, Th, TC, Tq, T11, TU, Tx, Tb, Tz, Tu, Tw, Tp; | |
45 V Tl, T9, Ta, T8, Ty, Tn, To, Tm, TG, T1, T2, Tt, T5, T6, T4; | |
46 V Tv, Tj, Tk, Ti, TD, Tf, Tg, Te, TB, TT, TF, TR, Tr; | |
/* Load phase: for each k in 0..5 read Rp[k] and Rm[k], combine them with
 * VFMACONJ and VFNMSCONJ, and multiply the results by twiddles loaded with
 * LDW (VZMULJ for the VFMACONJ term, VZMULIJ for the VFNMSCONJ term).
 * Only T3 (k = 0, VFMACONJ) is used without a twiddle factor.
 * NOTE(review): the non-FMA variant of this codelet spells the same
 * combinations as VADD(a, VCONJ(b)) and VSUB(VCONJ(b), a); these macros
 * presumably compute the same values - confirm in the SIMD headers. */
47 T1 = LD(&(Rp[0]), ms, &(Rp[0])); | |
48 T2 = LD(&(Rm[0]), -ms, &(Rm[0])); | |
49 Tt = LDW(&(W[0])); | |
50 T5 = LD(&(Rp[WS(rs, 2)]), ms, &(Rp[0])); | |
51 T6 = LD(&(Rm[WS(rs, 2)]), -ms, &(Rm[0])); | |
52 T4 = LDW(&(W[TWVL * 6])); | |
53 Tv = LDW(&(W[TWVL * 8])); | |
54 Tn = LD(&(Rp[WS(rs, 1)]), ms, &(Rp[WS(rs, 1)])); | |
55 To = LD(&(Rm[WS(rs, 1)]), -ms, &(Rm[WS(rs, 1)])); | |
56 T3 = VFMACONJ(T2, T1); | |
57 Tu = VZMULIJ(Tt, VFNMSCONJ(T2, T1)); | |
58 Tm = LDW(&(W[TWVL * 2])); | |
59 TG = LDW(&(W[TWVL * 4])); | |
60 T7 = VZMULJ(T4, VFMACONJ(T6, T5)); | |
61 Tw = VZMULIJ(Tv, VFNMSCONJ(T6, T5)); | |
62 Tj = LD(&(Rp[WS(rs, 5)]), ms, &(Rp[WS(rs, 1)])); | |
63 Tk = LD(&(Rm[WS(rs, 5)]), -ms, &(Rm[WS(rs, 1)])); | |
64 Ti = LDW(&(W[TWVL * 18])); | |
65 TD = LDW(&(W[TWVL * 20])); | |
66 Tp = VZMULJ(Tm, VFMACONJ(To, Tn)); | |
67 TH = VZMULIJ(TG, VFNMSCONJ(To, Tn)); | |
68 Tf = LD(&(Rp[WS(rs, 3)]), ms, &(Rp[WS(rs, 1)])); | |
69 Tg = LD(&(Rm[WS(rs, 3)]), -ms, &(Rm[WS(rs, 1)])); | |
70 Te = LDW(&(W[TWVL * 10])); | |
71 TB = LDW(&(W[TWVL * 12])); | |
72 Tl = VZMULJ(Ti, VFMACONJ(Tk, Tj)); | |
73 TE = VZMULIJ(TD, VFNMSCONJ(Tk, Tj)); | |
74 T9 = LD(&(Rp[WS(rs, 4)]), ms, &(Rp[0])); | |
75 Ta = LD(&(Rm[WS(rs, 4)]), -ms, &(Rm[0])); | |
76 T8 = LDW(&(W[TWVL * 14])); | |
77 Ty = LDW(&(W[TWVL * 16])); | |
78 Th = VZMULJ(Te, VFMACONJ(Tg, Tf)); | |
79 TC = VZMULIJ(TB, VFNMSCONJ(Tg, Tf)); | |
/* Add/subtract network over the twiddled terms (VADD, VSUB, and
 * VFMA / VFNMS / VMUL with the two constants declared above). */
80 Tq = VADD(Tl, Tp); | |
81 T11 = VSUB(Tp, Tl); | |
82 TU = VSUB(Tu, Tw); | |
83 Tx = VADD(Tu, Tw); | |
84 Tb = VZMULJ(T8, VFMACONJ(Ta, T9)); | |
85 Tz = VZMULIJ(Ty, VFNMSCONJ(Ta, T9)); | |
86 TT = VSUB(TC, TE); | |
87 TF = VADD(TC, TE); | |
88 TR = VFNMS(LDK(KP500000000), Tq, Th); | |
89 Tr = VADD(Th, Tq); | |
90 { | |
91 V TX, TA, T1d, TV, TY, TI, T1e, T12, TQ, Td, T10, Tc, T1a, TN, TJ; | |
92 V T1j, T1f, T1b, TS, TM, Ts, T17, T13, TZ, T1i, T1c, T16, TW, TP, TO; | |
93 V TL, TK, T1k, T1l, T1h, T1g, T18, T19, T15, T14; | |
94 T10 = VSUB(Tb, T7); | |
95 Tc = VADD(T7, Tb); | |
96 TX = VFNMS(LDK(KP500000000), Tx, Tz); | |
97 TA = VADD(Tx, Tz); | |
98 T1d = VADD(TU, TT); | |
99 TV = VSUB(TT, TU); | |
100 TY = VFNMS(LDK(KP500000000), TF, TH); | |
101 TI = VADD(TF, TH); | |
102 T1e = VADD(T10, T11); | |
103 T12 = VSUB(T10, T11); | |
104 TQ = VFNMS(LDK(KP500000000), Tc, T3); | |
105 Td = VADD(T3, Tc); | |
106 T1a = VADD(TX, TY); | |
107 TZ = VSUB(TX, TY); | |
108 TN = VADD(TA, TI); | |
109 TJ = VSUB(TA, TI); | |
110 T1j = VMUL(LDK(KP866025403), VADD(T1d, T1e)); | |
111 T1f = VMUL(LDK(KP866025403), VSUB(T1d, T1e)); | |
112 T1b = VADD(TQ, TR); | |
113 TS = VSUB(TQ, TR); | |
114 TM = VADD(Td, Tr); | |
115 Ts = VSUB(Td, Tr); | |
116 T17 = VFMA(LDK(KP866025403), T12, TZ); | |
117 T13 = VFNMS(LDK(KP866025403), T12, TZ); | |
118 T1i = VSUB(T1b, T1a); | |
119 T1c = VADD(T1a, T1b); | |
120 T16 = VFNMS(LDK(KP866025403), TV, TS); | |
121 TW = VFMA(LDK(KP866025403), TV, TS); | |
/* Form the twelve output vectors.  Each one is multiplied by KP500000000;
 * the six destined for Rm are additionally wrapped in VCONJ. */
122 TP = VCONJ(VMUL(LDK(KP500000000), VADD(TN, TM))); | |
123 TO = VMUL(LDK(KP500000000), VSUB(TM, TN)); | |
124 TL = VCONJ(VMUL(LDK(KP500000000), VFNMSI(TJ, Ts))); | |
125 TK = VMUL(LDK(KP500000000), VFMAI(TJ, Ts)); | |
126 T1k = VCONJ(VMUL(LDK(KP500000000), VFNMSI(T1j, T1i))); | |
127 T1l = VMUL(LDK(KP500000000), VFMAI(T1j, T1i)); | |
128 T1h = VMUL(LDK(KP500000000), VFMAI(T1f, T1c)); | |
129 T1g = VCONJ(VMUL(LDK(KP500000000), VFNMSI(T1f, T1c))); | |
130 T18 = VMUL(LDK(KP500000000), VFNMSI(T17, T16)); | |
131 T19 = VCONJ(VMUL(LDK(KP500000000), VFMAI(T17, T16))); | |
132 T15 = VCONJ(VMUL(LDK(KP500000000), VFMAI(T13, TW))); | |
133 T14 = VMUL(LDK(KP500000000), VFNMSI(T13, TW)); | |
/* Store phase: six vectors to Rp (stride ms) and six to Rm (stride -ms),
 * overwriting the locations that were loaded at the top of the iteration. */
134 ST(&(Rm[WS(rs, 5)]), TP, -ms, &(Rm[WS(rs, 1)])); | |
135 ST(&(Rp[0]), TO, ms, &(Rp[0])); | |
136 ST(&(Rm[WS(rs, 2)]), TL, -ms, &(Rm[0])); | |
137 ST(&(Rp[WS(rs, 3)]), TK, ms, &(Rp[WS(rs, 1)])); | |
138 ST(&(Rm[WS(rs, 3)]), T1k, -ms, &(Rm[WS(rs, 1)])); | |
139 ST(&(Rp[WS(rs, 4)]), T1l, ms, &(Rp[0])); | |
140 ST(&(Rp[WS(rs, 2)]), T1h, ms, &(Rp[0])); | |
141 ST(&(Rm[WS(rs, 1)]), T1g, -ms, &(Rm[WS(rs, 1)])); | |
142 ST(&(Rp[WS(rs, 5)]), T18, ms, &(Rp[WS(rs, 1)])); | |
143 ST(&(Rm[WS(rs, 4)]), T19, -ms, &(Rm[0])); | |
144 ST(&(Rm[0]), T15, -ms, &(Rm[0])); | |
145 ST(&(Rp[WS(rs, 1)]), T14, ms, &(Rp[WS(rs, 1)])); | |
146 } | |
147 } | |
148 } | |
/* VLEAVE() is invoked once, after the loop has finished. */
149 VLEAVE(); | |
150 } | |
151 | |
/*
 * Twiddle instruction table for the HAVE_FMA build of hc2cfdftv_12:
 * eleven VTW(1, k) entries for k = 1..11 followed by a {TW_NEXT, VL, 0}
 * entry.
 * NOTE(review): the final entry presumably marks the end of the list, and
 * the eleven entries presumably correspond to the "22" used in the
 * codelet's W stride - confirm against the tw_instr / VTW definitions.
 */
152 static const tw_instr twinstr[] = { | |
153 VTW(1, 1), | |
154 VTW(1, 2), | |
155 VTW(1, 3), | |
156 VTW(1, 4), | |
157 VTW(1, 5), | |
158 VTW(1, 6), | |
159 VTW(1, 7), | |
160 VTW(1, 8), | |
161 VTW(1, 9), | |
162 VTW(1, 10), | |
163 VTW(1, 11), | |
164 {TW_NEXT, VL, 0} | |
165 }; | |
166 | |
/*
 * Codelet descriptor (HAVE_FMA build): size 12, the codelet name string,
 * the twiddle instruction table, &GENUS, and the counts {41, 36, 30, 0}.
 * The first three counts match the generator's header comment for this
 * variant (41 additions, 36 multiplications, 30 fused multiply/add).
 * NOTE(review): the meaning of the fourth count (0) is not visible here.
 */
167 static const hc2c_desc desc = { 12, XSIMD_STRING("hc2cfdftv_12"), twinstr, &GENUS, {41, 36, 30, 0} }; | |
168 | |
/*
 * Registration entry point (HAVE_FMA build): passes the hc2cfdftv_12
 * function and its descriptor to X(khc2c_register) for planner p, with
 * HC2C_VIA_DFT as the last argument.
 */
169 void XSIMD(codelet_hc2cfdftv_12) (planner *p) { | |
170 X(khc2c_register) (p, hc2cfdftv_12, &desc, HC2C_VIA_DFT); | |
171 } | |
172 #else /* HAVE_FMA */ | |
173 | |
174 /* Generated by: ../../../genfft/gen_hc2cdft_c.native -simd -compact -variables 4 -pipeline-latency 8 -trivial-stores -variables 32 -no-generate-bytw -n 12 -dit -name hc2cfdftv_12 -include hc2cfv.h */ | |
175 | |
176 /* | |
177 * This function contains 71 FP additions, 41 FP multiplications, | |
178 * (or, 67 additions, 37 multiplications, 4 fused multiply/add), | |
179 * 58 stack variables, 4 constants, and 24 memory accesses | |
180 */ | |
181 #include "hc2cfv.h" | |
182 | |
/*
 * hc2cfdftv_12 (non-FMA variant): generated SIMD codelet of size 12
 * (genfft options "-n 12 -dit", see the "Generated by" comment above).
 *
 * Parameters, as used by the code below:
 *   Rp, Rm - data arrays; six vectors are loaded from each (index 0 and
 *            WS(rs, 1..5)) and six vectors are stored back to the same
 *            indices of each.
 *   Ip, Im - stepped by the loop header but never dereferenced here.
 *   W      - twiddle table, read through LDW at offsets TWVL * {0, 2, ..., 20}.
 *   rs     - stride handed to WS() to form the element indices.
 *   mb, me - the loop runs m = mb, mb + VL, ... while m < me.
 *   ms     - per-iteration pointer step (scaled by VL) and the stride
 *            argument passed to LD / ST (negated for Rm).
 */
183 static void hc2cfdftv_12(R *Rp, R *Ip, R *Rm, R *Im, const R *W, stride rs, INT mb, INT me, INT ms) | |
184 { | |
/* Vector constants: 0.433012... (half of 0.866025...), 0.866025...
 * (numerically sqrt(3)/2), 0.25 and 0.5. */
185 DVK(KP433012701, +0.433012701892219323381861585376468091735701313); | |
186 DVK(KP866025403, +0.866025403784438646763723170752936183471402627); | |
187 DVK(KP250000000, +0.250000000000000000000000000000000000000000000); | |
188 DVK(KP500000000, +0.500000000000000000000000000000000000000000000); | |
189 { | |
190 INT m; | |
/* Skip (mb - 1) * ((TWVL / VL) * 22) entries of W up front; then, per
 * iteration, move Rp/Ip forward and Rm/Im backward by VL * ms and advance
 * W by TWVL * 22.  MAKE_VOLATILE_STRIDE(48, rs) is evaluated as part of
 * every loop step. */
191 for (m = mb, W = W + ((mb - 1) * ((TWVL / VL) * 22)); m < me; m = m + VL, Rp = Rp + (VL * ms), Ip = Ip + (VL * ms), Rm = Rm - (VL * ms), Im = Im - (VL * ms), W = W + (TWVL * 22), MAKE_VOLATILE_STRIDE(48, rs)) { | |
/* Values produced by the first inner block and consumed by the second. */
192 V TX, T13, T4, Tf, TZ, TD, TF, T17, TW, T14, Tw, Tl, T10, TL, TN; | |
193 V T16; | |
/* First inner block - load and twiddle.  For each k in 0..5 it reads Rp[k]
 * and Rm[k], conjugates the Rm value with VCONJ, and forms
 * VADD(Rp[k], conj) and VSUB(conj, Rp[k]); these are multiplied by twiddles
 * loaded with LDW (VZMULJ for the sum, VZMULIJ for the difference).  Only
 * T4 (the k = 0 sum) is used without a twiddle factor.  Some first-stage
 * sums and differences are computed here as well. */
194 { | |
195 V T1, T3, TA, Tb, Td, Te, T9, TC, T2, Tz, Tc, Ta, T6, T8, T7; | |
196 V T5, TB, TE, Ti, Tk, TI, Ts, Tu, Tv, Tq, TK, Tj, TH, Tt, Tr; | |
197 V Tn, Tp, To, Tm, TJ, Th, TM; | |
198 T1 = LD(&(Rp[0]), ms, &(Rp[0])); | |
199 T2 = LD(&(Rm[0]), -ms, &(Rm[0])); | |
200 T3 = VCONJ(T2); | |
201 Tz = LDW(&(W[0])); | |
202 TA = VZMULIJ(Tz, VSUB(T3, T1)); | |
203 Tb = LD(&(Rp[WS(rs, 4)]), ms, &(Rp[0])); | |
204 Tc = LD(&(Rm[WS(rs, 4)]), -ms, &(Rm[0])); | |
205 Td = VCONJ(Tc); | |
206 Ta = LDW(&(W[TWVL * 14])); | |
207 Te = VZMULJ(Ta, VADD(Tb, Td)); | |
208 T6 = LD(&(Rp[WS(rs, 2)]), ms, &(Rp[0])); | |
209 T7 = LD(&(Rm[WS(rs, 2)]), -ms, &(Rm[0])); | |
210 T8 = VCONJ(T7); | |
211 T5 = LDW(&(W[TWVL * 6])); | |
212 T9 = VZMULJ(T5, VADD(T6, T8)); | |
213 TB = LDW(&(W[TWVL * 8])); | |
214 TC = VZMULIJ(TB, VSUB(T8, T6)); | |
215 TX = VSUB(TC, TA); | |
216 T13 = VSUB(Te, T9); | |
217 T4 = VADD(T1, T3); | |
218 Tf = VADD(T9, Te); | |
/* TZ combines T4 and Tf using the 0.5 and 0.25 constants.
 * NOTE(review): this looks like the output scaling by 0.5 being applied
 * early - confirm against the VFNMS definition. */
219 TZ = VFNMS(LDK(KP250000000), Tf, VMUL(LDK(KP500000000), T4)); | |
220 TD = VADD(TA, TC); | |
221 TE = LDW(&(W[TWVL * 16])); | |
222 TF = VZMULIJ(TE, VSUB(Td, Tb)); | |
223 T17 = VFNMS(LDK(KP500000000), TD, TF); | |
224 Ti = LD(&(Rp[WS(rs, 3)]), ms, &(Rp[WS(rs, 1)])); | |
225 Tj = LD(&(Rm[WS(rs, 3)]), -ms, &(Rm[WS(rs, 1)])); | |
226 Tk = VCONJ(Tj); | |
227 TH = LDW(&(W[TWVL * 12])); | |
228 TI = VZMULIJ(TH, VSUB(Tk, Ti)); | |
229 Ts = LD(&(Rp[WS(rs, 1)]), ms, &(Rp[WS(rs, 1)])); | |
230 Tt = LD(&(Rm[WS(rs, 1)]), -ms, &(Rm[WS(rs, 1)])); | |
231 Tu = VCONJ(Tt); | |
232 Tr = LDW(&(W[TWVL * 2])); | |
233 Tv = VZMULJ(Tr, VADD(Ts, Tu)); | |
234 Tn = LD(&(Rp[WS(rs, 5)]), ms, &(Rp[WS(rs, 1)])); | |
235 To = LD(&(Rm[WS(rs, 5)]), -ms, &(Rm[WS(rs, 1)])); | |
236 Tp = VCONJ(To); | |
237 Tm = LDW(&(W[TWVL * 18])); | |
238 Tq = VZMULJ(Tm, VADD(Tn, Tp)); | |
239 TJ = LDW(&(W[TWVL * 20])); | |
240 TK = VZMULIJ(TJ, VSUB(Tp, Tn)); | |
241 TW = VSUB(TK, TI); | |
242 T14 = VSUB(Tv, Tq); | |
243 Tw = VADD(Tq, Tv); | |
244 Th = LDW(&(W[TWVL * 10])); | |
245 Tl = VZMULJ(Th, VADD(Ti, Tk)); | |
246 T10 = VFNMS(LDK(KP250000000), Tw, VMUL(LDK(KP500000000), Tl)); | |
247 TL = VADD(TI, TK); | |
248 TM = LDW(&(W[TWVL * 4])); | |
249 TN = VZMULIJ(TM, VSUB(Tu, Ts)); | |
250 T16 = VFNMS(LDK(KP500000000), TL, TN); | |
251 } | |
/* Second inner block - combine and store.  Each output is written with ST
 * as soon as it has been computed: six vectors go to Rp (stride ms) and
 * six, each wrapped in VCONJ, go to Rm (stride -ms). */
252 { | |
253 V Ty, TS, TP, TT, Tg, Tx, TG, TO, TQ, TV, TR, TU, T1i, T1o, T1l; | |
254 V T1p, T1g, T1h, T1j, T1k, T1m, T1r, T1n, T1q, T12, T1c, T19, T1d, TY, T11; | |
255 V T15, T18, T1a, T1f, T1b, T1e; | |
/* Outputs Rm[5], Rp[3], Rp[0], Rm[2]: built from the untwiddled-scale sums
 * and multiplied by KP500000000 explicitly. */
256 Tg = VADD(T4, Tf); | |
257 Tx = VADD(Tl, Tw); | |
258 Ty = VADD(Tg, Tx); | |
259 TS = VSUB(Tg, Tx); | |
260 TG = VADD(TD, TF); | |
261 TO = VADD(TL, TN); | |
262 TP = VADD(TG, TO); | |
263 TT = VBYI(VSUB(TO, TG)); | |
264 TQ = VCONJ(VMUL(LDK(KP500000000), VSUB(Ty, TP))); | |
265 ST(&(Rm[WS(rs, 5)]), TQ, -ms, &(Rm[WS(rs, 1)])); | |
266 TV = VMUL(LDK(KP500000000), VADD(TS, TT)); | |
267 ST(&(Rp[WS(rs, 3)]), TV, ms, &(Rp[WS(rs, 1)])); | |
268 TR = VMUL(LDK(KP500000000), VADD(Ty, TP)); | |
269 ST(&(Rp[0]), TR, ms, &(Rp[0])); | |
270 TU = VCONJ(VMUL(LDK(KP500000000), VSUB(TS, TT))); | |
271 ST(&(Rm[WS(rs, 2)]), TU, -ms, &(Rm[0])); | |
/* Outputs Rp[2], Rm[3], Rm[1], Rp[4]: no multiply by KP500000000 at the
 * final add/subtract; the 0.5 factors appear in T1i, T1o, T1k and (via
 * TZ, T10) in T1j instead. */
272 T1g = VADD(TX, TW); | |
273 T1h = VADD(T13, T14); | |
274 T1i = VMUL(LDK(KP500000000), VBYI(VMUL(LDK(KP866025403), VSUB(T1g, T1h)))); | |
275 T1o = VMUL(LDK(KP500000000), VBYI(VMUL(LDK(KP866025403), VADD(T1g, T1h)))); | |
276 T1j = VADD(TZ, T10); | |
277 T1k = VMUL(LDK(KP500000000), VADD(T17, T16)); | |
278 T1l = VSUB(T1j, T1k); | |
279 T1p = VADD(T1j, T1k); | |
280 T1m = VADD(T1i, T1l); | |
281 ST(&(Rp[WS(rs, 2)]), T1m, ms, &(Rp[0])); | |
282 T1r = VCONJ(VSUB(T1p, T1o)); | |
283 ST(&(Rm[WS(rs, 3)]), T1r, -ms, &(Rm[WS(rs, 1)])); | |
284 T1n = VCONJ(VSUB(T1l, T1i)); | |
285 ST(&(Rm[WS(rs, 1)]), T1n, -ms, &(Rm[WS(rs, 1)])); | |
286 T1q = VADD(T1o, T1p); | |
287 ST(&(Rp[WS(rs, 4)]), T1q, ms, &(Rp[0])); | |
/* Outputs Rm[0], Rm[4], Rp[1], Rp[5]: TY uses KP433012701 (0.5 * 0.866...)
 * where the previous group used separate 0.5 and 0.866... multiplies. */
288 TY = VMUL(LDK(KP433012701), VSUB(TW, TX)); | |
289 T11 = VSUB(TZ, T10); | |
290 T12 = VADD(TY, T11); | |
291 T1c = VSUB(T11, TY); | |
292 T15 = VMUL(LDK(KP866025403), VSUB(T13, T14)); | |
293 T18 = VSUB(T16, T17); | |
294 T19 = VMUL(LDK(KP500000000), VBYI(VSUB(T15, T18))); | |
295 T1d = VMUL(LDK(KP500000000), VBYI(VADD(T15, T18))); | |
296 T1a = VCONJ(VSUB(T12, T19)); | |
297 ST(&(Rm[0]), T1a, -ms, &(Rm[0])); | |
298 T1f = VCONJ(VADD(T1c, T1d)); | |
299 ST(&(Rm[WS(rs, 4)]), T1f, -ms, &(Rm[0])); | |
300 T1b = VADD(T12, T19); | |
301 ST(&(Rp[WS(rs, 1)]), T1b, ms, &(Rp[WS(rs, 1)])); | |
302 T1e = VSUB(T1c, T1d); | |
303 ST(&(Rp[WS(rs, 5)]), T1e, ms, &(Rp[WS(rs, 1)])); | |
304 } | |
305 } | |
306 } | |
/* VLEAVE() is invoked once, after the loop has finished. */
307 VLEAVE(); | |
308 } | |
309 | |
/*
 * Twiddle instruction table for the non-FMA build of hc2cfdftv_12:
 * eleven VTW(1, k) entries for k = 1..11 followed by a {TW_NEXT, VL, 0}
 * entry.  The contents are the same as in the HAVE_FMA branch.
 * NOTE(review): the final entry presumably marks the end of the list, and
 * the eleven entries presumably correspond to the "22" used in the
 * codelet's W stride - confirm against the tw_instr / VTW definitions.
 */
310 static const tw_instr twinstr[] = { | |
311 VTW(1, 1), | |
312 VTW(1, 2), | |
313 VTW(1, 3), | |
314 VTW(1, 4), | |
315 VTW(1, 5), | |
316 VTW(1, 6), | |
317 VTW(1, 7), | |
318 VTW(1, 8), | |
319 VTW(1, 9), | |
320 VTW(1, 10), | |
321 VTW(1, 11), | |
322 {TW_NEXT, VL, 0} | |
323 }; | |
324 | |
/*
 * Codelet descriptor (non-FMA build): size 12, the codelet name string,
 * the twiddle instruction table, &GENUS, and the counts {67, 37, 4, 0}.
 * The first three counts match the generator's header comment for this
 * variant (67 additions, 37 multiplications, 4 fused multiply/add).
 * NOTE(review): the meaning of the fourth count (0) is not visible here.
 */
325 static const hc2c_desc desc = { 12, XSIMD_STRING("hc2cfdftv_12"), twinstr, &GENUS, {67, 37, 4, 0} }; | |
326 | |
/*
 * Registration entry point (non-FMA build): passes the hc2cfdftv_12
 * function and its descriptor to X(khc2c_register) for planner p, with
 * HC2C_VIA_DFT as the last argument.
 */
327 void XSIMD(codelet_hc2cfdftv_12) (planner *p) { | |
328 X(khc2c_register) (p, hc2cfdftv_12, &desc, HC2C_VIA_DFT); | |
329 } | |
330 #endif /* HAVE_FMA */ |