Mercurial > hg > sv-dependency-builds
comparison src/fftw-3.3.3/dft/simd/common/n2bv_12.c @ 10:37bf6b4a2645
Add FFTW3
author | Chris Cannam |
---|---|
date | Wed, 20 Mar 2013 15:35:50 +0000 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
9:c0fb53affa76 | 10:37bf6b4a2645 |
---|---|
1 /* | |
2 * Copyright (c) 2003, 2007-11 Matteo Frigo | |
3 * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology | |
4 * | |
5 * This program is free software; you can redistribute it and/or modify | |
6 * it under the terms of the GNU General Public License as published by | |
7 * the Free Software Foundation; either version 2 of the License, or | |
8 * (at your option) any later version. | |
9 * | |
10 * This program is distributed in the hope that it will be useful, | |
11 * but WITHOUT ANY WARRANTY; without even the implied warranty of | |
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | |
13 * GNU General Public License for more details. | |
14 * | |
15 * You should have received a copy of the GNU General Public License | |
16 * along with this program; if not, write to the Free Software | |
17 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA | |
18 * | |
19 */ | |
20 | |
21 /* This file was automatically generated --- DO NOT EDIT */ | |
22 /* Generated on Sun Nov 25 07:37:30 EST 2012 */ | |
23 | |
24 #include "codelet-dft.h" | |
25 | |
26 #ifdef HAVE_FMA | |
27 | |
28 /* Generated by: ../../../genfft/gen_notw_c.native -fma -reorder-insns -schedule-for-pipeline -simd -compact -variables 4 -pipeline-latency 8 -sign 1 -n 12 -name n2bv_12 -with-ostride 2 -include n2b.h -store-multiple 2 */ | |
29 | |
30 /* | |
31 * This function contains 48 FP additions, 20 FP multiplications, | |
32 * (or, 30 additions, 2 multiplications, 18 fused multiply/add), | |
33 * 61 stack variables, 2 constants, and 30 memory accesses | |
34 */ | |
35 #include "n2b.h" | |
36 | |
/*
 * n2bv_12 (HAVE_FMA build): genfft-generated SIMD codelet.  According to
 * the generator command line recorded above this function it was produced
 * with "-n 12 -sign 1 -with-ostride 2 -store-multiple 2 -fma".
 *
 * Parameters, as used by the body below:
 *   ri, ro : not referenced in this body.
 *   ii     : input base pointer (copied to xi); each pass reads the twelve
 *            elements xi[WS(is, k)], k = 0..11.
 *   io     : output base pointer (copied to xo); each pass passes the
 *            twelve results to STM2 at xo[0], xo[2], ..., xo[22].
 *   is     : input stride, applied through WS(is, k).
 *   os     : only handed to MAKE_VOLATILE_STRIDE; the output offsets in the
 *            body are literal constants.
 *   v      : loop count; decremented by VL per pass, loop runs while i > 0.
 *   ivs    : passed to every LD and used (times VL) to advance xi.
 *   ovs    : passed to every STM2/STN2 and used (times VL) to advance xo.
 *
 * NOTE(review): V, LD, LDK, DVK, STM2, STN2, VADD, VSUB, VMUL, VFMA, VFNMS,
 * VFMAI, VFNMSI, WS, VL, MAKE_VOLATILE_STRIDE and VLEAVE are defined in
 * headers that are not visible here (presumably via n2b.h) -- confirm their
 * exact semantics there before relying on any description of them.
 */
37 static void n2bv_12(const R *ri, const R *ii, R *ro, R *io, stride is, stride os, INT v, INT ivs, INT ovs) | |
38 { | |
/* The two constants used below: 0.8660254037... (numerically sqrt(3)/2) and 0.5. */
39 DVK(KP866025403, +0.866025403784438646763723170752936183471402627); | |
40 DVK(KP500000000, +0.500000000000000000000000000000000000000000000); | |
41 { | |
42 INT i; | |
43 const R *xi; | |
44 R *xo; | |
/* Only the ii/io pointers are used as bases; ri/ro are left untouched. */
45 xi = ii; | |
46 xo = io; | |
/* Each pass consumes VL from the count and advances xi/xo by VL*ivs / VL*ovs. */
47 for (i = v; i > 0; i = i - VL, xi = xi + (VL * ivs), xo = xo + (VL * ovs), MAKE_VOLATILE_STRIDE(24, is), MAKE_VOLATILE_STRIDE(24, os)) { | |
48 V T1, T6, Tc, Th, Td, Te, Ti, Tz, T4, TA, T9, Tj, Tf, Tw; | |
49 { | |
50 V T2, T3, T7, T8; | |
/* Loads of the even-indexed inputs (0, 6, 4, 8, 10, 2); third LD argument is &xi[0]. */
51 T1 = LD(&(xi[0]), ivs, &(xi[0])); | |
52 T6 = LD(&(xi[WS(is, 6)]), ivs, &(xi[0])); | |
53 T2 = LD(&(xi[WS(is, 4)]), ivs, &(xi[0])); | |
54 T3 = LD(&(xi[WS(is, 8)]), ivs, &(xi[0])); | |
55 T7 = LD(&(xi[WS(is, 10)]), ivs, &(xi[0])); | |
56 T8 = LD(&(xi[WS(is, 2)]), ivs, &(xi[0])); | |
/* Loads of the odd-indexed inputs (3, 9, 7, 11, 1, and 5 further down); third LD argument is &xi[WS(is, 1)]. */
57 Tc = LD(&(xi[WS(is, 3)]), ivs, &(xi[WS(is, 1)])); | |
58 Th = LD(&(xi[WS(is, 9)]), ivs, &(xi[WS(is, 1)])); | |
59 Td = LD(&(xi[WS(is, 7)]), ivs, &(xi[WS(is, 1)])); | |
60 Te = LD(&(xi[WS(is, 11)]), ivs, &(xi[WS(is, 1)])); | |
61 Ti = LD(&(xi[WS(is, 1)]), ivs, &(xi[WS(is, 1)])); | |
/* Sums and differences of the input pairs (4, 8) and (10, 2). */
62 Tz = VSUB(T2, T3); | |
63 T4 = VADD(T2, T3); | |
64 TA = VSUB(T7, T8); | |
65 T9 = VADD(T7, T8); | |
66 Tj = LD(&(xi[WS(is, 5)]), ivs, &(xi[WS(is, 1)])); | |
67 } | |
/* Sum and difference of the input pair (7, 11). */
68 Tf = VADD(Td, Te); | |
69 Tw = VSUB(Td, Te); | |
70 { | |
71 V T5, Tp, TJ, TB, Ta, Tq, Tk, Tx, Tg, Ts; | |
72 T5 = VADD(T1, T4); | |
73 Tp = VFNMS(LDK(KP500000000), T4, T1); | |
74 TJ = VSUB(Tz, TA); | |
75 TB = VADD(Tz, TA); | |
76 Ta = VADD(T6, T9); | |
77 Tq = VFNMS(LDK(KP500000000), T9, T6); | |
78 Tk = VADD(Ti, Tj); | |
79 Tx = VSUB(Tj, Ti); | |
80 Tg = VADD(Tc, Tf); | |
81 Ts = VFNMS(LDK(KP500000000), Tf, Tc); | |
82 { | |
83 V Tr, TF, Tb, Tn, TG, Ty, Tl, Tt; | |
84 Tr = VADD(Tp, Tq); | |
85 TF = VSUB(Tp, Tq); | |
86 Tb = VSUB(T5, Ta); | |
87 Tn = VADD(T5, Ta); | |
88 TG = VADD(Tw, Tx); | |
89 Ty = VSUB(Tw, Tx); | |
90 Tl = VADD(Th, Tk); | |
91 Tt = VFNMS(LDK(KP500000000), Tk, Th); | |
92 { | |
93 V TC, TE, TH, TL, Tu, TI, Tm, To; | |
/* The only two plain multiplications in this variant; the other uses of KP866025403 are fused (VFMA/VFNMS). */
94 TC = VMUL(LDK(KP866025403), VSUB(Ty, TB)); | |
95 TE = VMUL(LDK(KP866025403), VADD(TB, Ty)); | |
96 TH = VFNMS(LDK(KP866025403), TG, TF); | |
97 TL = VFMA(LDK(KP866025403), TG, TF); | |
98 Tu = VADD(Ts, Tt); | |
99 TI = VSUB(Ts, Tt); | |
100 Tm = VSUB(Tg, Tl); | |
101 To = VADD(Tg, Tl); | |
102 { | |
103 V TK, TM, Tv, TD; | |
104 TK = VFMA(LDK(KP866025403), TJ, TI); | |
105 TM = VFNMS(LDK(KP866025403), TJ, TI); | |
106 Tv = VSUB(Tr, Tu); | |
107 TD = VADD(Tr, Tu); | |
/*
 * Output stage.  Every result is first handed to STM2 at its own slot
 * (fourth argument &xo[0] for offsets 0, 4, ..., 20 and &xo[2] for offsets
 * 2, 6, ..., 22); once both members of an adjacent pair (xo[4k], xo[4k+2])
 * are available, STN2 is called once for that pair.
 * NOTE(review): which of STM2/STN2 performs the actual memory write depends
 * on the macro definitions for the target SIMD flavour -- confirm there.
 */
108 { | |
109 V TN, TO, TP, TQ; | |
110 TN = VADD(Tn, To); | |
111 STM2(&(xo[0]), TN, ovs, &(xo[0])); | |
112 TO = VSUB(Tn, To); | |
113 STM2(&(xo[12]), TO, ovs, &(xo[0])); | |
114 TP = VFMAI(Tm, Tb); | |
115 STM2(&(xo[18]), TP, ovs, &(xo[2])); | |
116 TQ = VFNMSI(Tm, Tb); | |
117 STM2(&(xo[6]), TQ, ovs, &(xo[2])); | |
118 { | |
119 V TR, TS, TT, TU; | |
120 TR = VFMAI(TM, TL); | |
121 STM2(&(xo[10]), TR, ovs, &(xo[2])); | |
122 TS = VFNMSI(TM, TL); | |
123 STM2(&(xo[14]), TS, ovs, &(xo[2])); | |
/* Pair (12, 14) complete. */
124 STN2(&(xo[12]), TO, TS, ovs); | |
125 TT = VFNMSI(TK, TH); | |
126 STM2(&(xo[22]), TT, ovs, &(xo[2])); | |
127 TU = VFMAI(TK, TH); | |
128 STM2(&(xo[2]), TU, ovs, &(xo[2])); | |
/* Pair (0, 2) complete. */
129 STN2(&(xo[0]), TN, TU, ovs); | |
130 { | |
131 V TV, TW, TX, TY; | |
/* Remaining pairs, in order: (16, 18), (8, 10), (4, 6), (20, 22). */
132 TV = VFNMSI(TE, TD); | |
133 STM2(&(xo[16]), TV, ovs, &(xo[0])); | |
134 STN2(&(xo[16]), TV, TP, ovs); | |
135 TW = VFMAI(TE, TD); | |
136 STM2(&(xo[8]), TW, ovs, &(xo[0])); | |
137 STN2(&(xo[8]), TW, TR, ovs); | |
138 TX = VFMAI(TC, Tv); | |
139 STM2(&(xo[4]), TX, ovs, &(xo[0])); | |
140 STN2(&(xo[4]), TX, TQ, ovs); | |
141 TY = VFNMSI(TC, Tv); | |
142 STM2(&(xo[20]), TY, ovs, &(xo[0])); | |
143 STN2(&(xo[20]), TY, TT, ovs); | |
144 } | |
145 } | |
146 } | |
147 } | |
148 } | |
149 } | |
150 } | |
151 } | |
152 } | |
/* NOTE(review): VLEAVE is an opaque macro here; presumably a SIMD epilogue hook -- confirm in the SIMD support headers. */
153 VLEAVE(); | |
154 } | |
155 | |
/*
 * Descriptor handed to kdft_register for the HAVE_FMA variant.  The leading
 * 12 matches the generator's "-n 12"; the name string is "n2bv_12"; the
 * tuple {30, 2, 18, 0} repeats the counts from the generated header comment
 * (30 additions, 2 multiplications, 18 fused multiply/add).
 * NOTE(review): the meaning of the trailing fields (&GENUS, 0, 2, 0, 0) is
 * fixed by the kdft_desc declaration, which is not visible here; the 2
 * presumably mirrors "-with-ostride 2" -- confirm against codelet-dft.h.
 */
156 static const kdft_desc desc = { 12, XSIMD_STRING("n2bv_12"), {30, 2, 18, 0}, &GENUS, 0, 2, 0, 0 }; | |
157 | |
/*
 * Registration entry point (HAVE_FMA variant): passes the planner p, the
 * n2bv_12 function and the address of its descriptor to X(kdft_register).
 * The exported symbol name is whatever XSIMD(codelet_n2bv_12) expands to.
 */
158 void XSIMD(codelet_n2bv_12) (planner *p) { | |
159 X(kdft_register) (p, n2bv_12, &desc); | |
160 } | |
161 | |
162 #else /* HAVE_FMA */ | |
163 | |
164 /* Generated by: ../../../genfft/gen_notw_c.native -simd -compact -variables 4 -pipeline-latency 8 -sign 1 -n 12 -name n2bv_12 -with-ostride 2 -include n2b.h -store-multiple 2 */ | |
165 | |
166 /* | |
167 * This function contains 48 FP additions, 8 FP multiplications, | |
168 * (or, 44 additions, 4 multiplications, 4 fused multiply/add), | |
169 * 33 stack variables, 2 constants, and 30 memory accesses | |
170 */ | |
171 #include "n2b.h" | |
172 | |
/*
 * n2bv_12 (build without HAVE_FMA): genfft-generated SIMD codelet.
 * According to the generator command line recorded above this function it
 * was produced with "-n 12 -sign 1 -with-ostride 2 -store-multiple 2"
 * (no -fma flag).  It reads the same twelve inputs and fills the same
 * twelve output slots as the HAVE_FMA variant, but applies VBYI explicitly
 * and then uses VADD/VSUB where the other variant uses VFMAI/VFNMSI.
 *
 * Parameters, as used by the body below:
 *   ri, ro : not referenced in this body.
 *   ii     : input base pointer (copied to xi); each pass reads the twelve
 *            elements xi[WS(is, k)], k = 0..11.
 *   io     : output base pointer (copied to xo); each pass passes the
 *            twelve results to STM2 at xo[0], xo[2], ..., xo[22].
 *   is     : input stride, applied through WS(is, k).
 *   os     : only handed to MAKE_VOLATILE_STRIDE; the output offsets in the
 *            body are literal constants.
 *   v      : loop count; decremented by VL per pass, loop runs while i > 0.
 *   ivs    : passed to every LD and used (times VL) to advance xi.
 *   ovs    : passed to every STM2/STN2 and used (times VL) to advance xo.
 *
 * NOTE(review): V, LD, LDK, DVK, STM2, STN2, VADD, VSUB, VMUL, VFNMS, VBYI,
 * WS, VL, MAKE_VOLATILE_STRIDE and VLEAVE are defined in headers that are
 * not visible here (presumably via n2b.h) -- confirm their exact semantics
 * there before relying on any description of them.
 */
173 static void n2bv_12(const R *ri, const R *ii, R *ro, R *io, stride is, stride os, INT v, INT ivs, INT ovs) | |
174 { | |
/* The two constants used below: 0.8660254037... (numerically sqrt(3)/2) and 0.5. */
175 DVK(KP866025403, +0.866025403784438646763723170752936183471402627); | |
176 DVK(KP500000000, +0.500000000000000000000000000000000000000000000); | |
177 { | |
178 INT i; | |
179 const R *xi; | |
180 R *xo; | |
/* Only the ii/io pointers are used as bases; ri/ro are left untouched. */
181 xi = ii; | |
182 xo = io; | |
/* Each pass consumes VL from the count and advances xi/xo by VL*ivs / VL*ovs. */
183 for (i = v; i > 0; i = i - VL, xi = xi + (VL * ivs), xo = xo + (VL * ovs), MAKE_VOLATILE_STRIDE(24, is), MAKE_VOLATILE_STRIDE(24, os)) { | |
184 V T5, Ta, TG, TF, Ty, Tm, Ti, Tp, TJ, TI, Tx, Ts; | |
/* First group: even-indexed inputs (0, 6, 4, 8, 10, 2) and the values derived from them. */
185 { | |
186 V T1, T6, T4, Tk, T9, Tl; | |
187 T1 = LD(&(xi[0]), ivs, &(xi[0])); | |
188 T6 = LD(&(xi[WS(is, 6)]), ivs, &(xi[0])); | |
189 { | |
190 V T2, T3, T7, T8; | |
191 T2 = LD(&(xi[WS(is, 4)]), ivs, &(xi[0])); | |
192 T3 = LD(&(xi[WS(is, 8)]), ivs, &(xi[0])); | |
193 T4 = VADD(T2, T3); | |
194 Tk = VSUB(T2, T3); | |
195 T7 = LD(&(xi[WS(is, 10)]), ivs, &(xi[0])); | |
196 T8 = LD(&(xi[WS(is, 2)]), ivs, &(xi[0])); | |
197 T9 = VADD(T7, T8); | |
198 Tl = VSUB(T7, T8); | |
199 } | |
200 T5 = VFNMS(LDK(KP500000000), T4, T1); | |
201 Ta = VFNMS(LDK(KP500000000), T9, T6); | |
202 TG = VADD(T6, T9); | |
203 TF = VADD(T1, T4); | |
204 Ty = VADD(Tk, Tl); | |
205 Tm = VMUL(LDK(KP866025403), VSUB(Tk, Tl)); | |
206 } | |
/* Second group: odd-indexed inputs (3, 9, 7, 11, 1, 5) and the values derived from them. */
207 { | |
208 V Tn, Tq, Te, To, Th, Tr; | |
209 Tn = LD(&(xi[WS(is, 3)]), ivs, &(xi[WS(is, 1)])); | |
210 Tq = LD(&(xi[WS(is, 9)]), ivs, &(xi[WS(is, 1)])); | |
211 { | |
212 V Tc, Td, Tf, Tg; | |
213 Tc = LD(&(xi[WS(is, 7)]), ivs, &(xi[WS(is, 1)])); | |
214 Td = LD(&(xi[WS(is, 11)]), ivs, &(xi[WS(is, 1)])); | |
215 Te = VSUB(Tc, Td); | |
216 To = VADD(Tc, Td); | |
217 Tf = LD(&(xi[WS(is, 1)]), ivs, &(xi[WS(is, 1)])); | |
218 Tg = LD(&(xi[WS(is, 5)]), ivs, &(xi[WS(is, 1)])); | |
219 Th = VSUB(Tf, Tg); | |
220 Tr = VADD(Tf, Tg); | |
221 } | |
222 Ti = VMUL(LDK(KP866025403), VSUB(Te, Th)); | |
223 Tp = VFNMS(LDK(KP500000000), To, Tn); | |
224 TJ = VADD(Tq, Tr); | |
225 TI = VADD(Tn, To); | |
226 Tx = VADD(Te, Th); | |
227 Ts = VFNMS(LDK(KP500000000), Tr, Tq); | |
228 } | |
/*
 * Output stage.  Every result is first handed to STM2 at its own slot
 * (fourth argument &xo[0] for offsets 0, 4, ..., 20 and &xo[2] for offsets
 * 2, 6, ..., 22); once both members of an adjacent pair (xo[4k], xo[4k+2])
 * are available, STN2 is called once for that pair.  TN..TS are declared in
 * this outer scope because they are paired with values computed in a later
 * inner block.
 * NOTE(review): which of STM2/STN2 performs the actual memory write depends
 * on the macro definitions for the target SIMD flavour -- confirm there.
 */
229 { | |
230 V TN, TO, TP, TQ, TR, TS; | |
/* Results for slots 6, 18, 12 and 0. */
231 { | |
232 V TH, TK, TL, TM; | |
233 TH = VSUB(TF, TG); | |
234 TK = VBYI(VSUB(TI, TJ)); | |
235 TN = VSUB(TH, TK); | |
236 STM2(&(xo[6]), TN, ovs, &(xo[2])); | |
237 TO = VADD(TH, TK); | |
238 STM2(&(xo[18]), TO, ovs, &(xo[2])); | |
239 TL = VADD(TF, TG); | |
240 TM = VADD(TI, TJ); | |
241 TP = VSUB(TL, TM); | |
242 STM2(&(xo[12]), TP, ovs, &(xo[0])); | |
243 TQ = VADD(TL, TM); | |
244 STM2(&(xo[0]), TQ, ovs, &(xo[0])); | |
245 } | |
/* Results for slots 22, 10, 2 and 14; completes pairs (0, 2) and (12, 14). */
246 { | |
247 V Tj, Tv, Tu, Tw, Tb, Tt, TT, TU; | |
248 Tb = VSUB(T5, Ta); | |
249 Tj = VSUB(Tb, Ti); | |
250 Tv = VADD(Tb, Ti); | |
251 Tt = VSUB(Tp, Ts); | |
252 Tu = VBYI(VADD(Tm, Tt)); | |
253 Tw = VBYI(VSUB(Tt, Tm)); | |
254 TR = VSUB(Tj, Tu); | |
255 STM2(&(xo[22]), TR, ovs, &(xo[2])); | |
256 TS = VADD(Tv, Tw); | |
257 STM2(&(xo[10]), TS, ovs, &(xo[2])); | |
258 TT = VADD(Tj, Tu); | |
259 STM2(&(xo[2]), TT, ovs, &(xo[2])); | |
260 STN2(&(xo[0]), TQ, TT, ovs); | |
261 TU = VSUB(Tv, Tw); | |
262 STM2(&(xo[14]), TU, ovs, &(xo[2])); | |
263 STN2(&(xo[12]), TP, TU, ovs); | |
264 } | |
/* Results for slots 4, 16, 20 and 8; completes pairs (4, 6), (16, 18), (20, 22) and (8, 10). */
265 { | |
266 V Tz, TD, TC, TE, TA, TB; | |
267 Tz = VBYI(VMUL(LDK(KP866025403), VSUB(Tx, Ty))); | |
268 TD = VBYI(VMUL(LDK(KP866025403), VADD(Ty, Tx))); | |
269 TA = VADD(T5, Ta); | |
270 TB = VADD(Tp, Ts); | |
271 TC = VSUB(TA, TB); | |
272 TE = VADD(TA, TB); | |
273 { | |
274 V TV, TW, TX, TY; | |
275 TV = VADD(Tz, TC); | |
276 STM2(&(xo[4]), TV, ovs, &(xo[0])); | |
277 STN2(&(xo[4]), TV, TN, ovs); | |
278 TW = VSUB(TE, TD); | |
279 STM2(&(xo[16]), TW, ovs, &(xo[0])); | |
280 STN2(&(xo[16]), TW, TO, ovs); | |
281 TX = VSUB(TC, Tz); | |
282 STM2(&(xo[20]), TX, ovs, &(xo[0])); | |
283 STN2(&(xo[20]), TX, TR, ovs); | |
284 TY = VADD(TD, TE); | |
285 STM2(&(xo[8]), TY, ovs, &(xo[0])); | |
286 STN2(&(xo[8]), TY, TS, ovs); | |
287 } | |
288 } | |
289 } | |
290 } | |
291 } | |
/* NOTE(review): VLEAVE is an opaque macro here; presumably a SIMD epilogue hook -- confirm in the SIMD support headers. */
292 VLEAVE(); | |
293 } | |
294 | |
/*
 * Descriptor handed to kdft_register for the variant built without
 * HAVE_FMA.  The leading 12 matches the generator's "-n 12"; the name
 * string is "n2bv_12"; the tuple {44, 4, 4, 0} repeats the counts from the
 * generated header comment (44 additions, 4 multiplications, 4 fused
 * multiply/add).
 * NOTE(review): the meaning of the trailing fields (&GENUS, 0, 2, 0, 0) is
 * fixed by the kdft_desc declaration, which is not visible here; the 2
 * presumably mirrors "-with-ostride 2" -- confirm against codelet-dft.h.
 */
295 static const kdft_desc desc = { 12, XSIMD_STRING("n2bv_12"), {44, 4, 4, 0}, &GENUS, 0, 2, 0, 0 }; | |
296 | |
/*
 * Registration entry point (variant built without HAVE_FMA): passes the
 * planner p, the n2bv_12 function and the address of its descriptor to
 * X(kdft_register).  The exported symbol name is whatever
 * XSIMD(codelet_n2bv_12) expands to.
 */
297 void XSIMD(codelet_n2bv_12) (planner *p) { | |
298 X(kdft_register) (p, n2bv_12, &desc); | |
299 } | |
300 | |
301 #endif /* HAVE_FMA */ |