Mercurial > hg > sv-dependency-builds
comparison src/fftw-3.3.3/dft/scalar/codelets/n1_8.c @ 10:37bf6b4a2645
Add FFTW3
author | Chris Cannam |
---|---|
date | Wed, 20 Mar 2013 15:35:50 +0000 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
9:c0fb53affa76 | 10:37bf6b4a2645 |
---|---|
1 /* | |
2 * Copyright (c) 2003, 2007-11 Matteo Frigo | |
3 * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology | |
4 * | |
5 * This program is free software; you can redistribute it and/or modify | |
6 * it under the terms of the GNU General Public License as published by | |
7 * the Free Software Foundation; either version 2 of the License, or | |
8 * (at your option) any later version. | |
9 * | |
10 * This program is distributed in the hope that it will be useful, | |
11 * but WITHOUT ANY WARRANTY; without even the implied warranty of | |
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | |
13 * GNU General Public License for more details. | |
14 * | |
15 * You should have received a copy of the GNU General Public License | |
16 * along with this program; if not, write to the Free Software | |
17 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA | |
18 * | |
19 */ | |
20 | |
21 /* This file was automatically generated --- DO NOT EDIT */ | |
22 /* Generated on Sun Nov 25 07:35:42 EST 2012 */ | |
23 | |
24 #include "codelet-dft.h" | |
25 | |
26 #ifdef HAVE_FMA | |
27 | |
28 /* Generated by: ../../../genfft/gen_notw.native -fma -reorder-insns -schedule-for-pipeline -compact -variables 4 -pipeline-latency 4 -n 8 -name n1_8 -include n.h */ | |
29 | |
30 /* | |
31 * This function contains 52 FP additions, 8 FP multiplications, | |
32 * (or, 44 additions, 0 multiplications, 8 fused multiply/add), | |
33 * 36 stack variables, 1 constants, and 32 memory accesses | |
34 */ | |
35 #include "n.h" | |
36 | |
37 static void n1_8(const R *ri, const R *ii, R *ro, R *io, stride is, stride os, INT v, INT ivs, INT ovs) /* FMA variant of the 8-point kernel: each loop iteration reads ri[] and ii[] at indices WS(is, 0..7) and writes ro[] and io[] at indices WS(os, 0..7). NOTE(review): ri/ii and ro/io are presumably the real/imaginary parts of the input and output - confirm. */ | |
38 { | |
39 DK(KP707106781, +0.707106781186547524400844362104849039284835938); /* names the constant 0.70710678..., numerically 1/sqrt(2); it scales the terms feeding outputs 1, 3, 5 and 7 */ | |
40 { | |
41 INT i; | |
42 for (i = v; i > 0; i = i - 1, ri = ri + ivs, ii = ii + ivs, ro = ro + ovs, io = io + ovs, MAKE_VOLATILE_STRIDE(32, is), MAKE_VOLATILE_STRIDE(32, os)) { /* v iterations; after each one the input pointers advance by ivs and the output pointers by ovs. MAKE_VOLATILE_STRIDE is a project macro not visible here. */ | |
43 E TF, TE, TD, TI; /* declared at loop scope because outputs 3 and 7 are written from them after the inner block closes */ | |
44 { | |
45 E Tn, T3, TC, Ti, TB, T6, To, Tl, Td, TN, Tz, TH, Ta, Tq, Tt; | |
46 E TM; | |
47 { | |
48 E T4, T5, Tj, Tk; | |
49 { | |
50 E T1, T2, Tg, Th; | |
51 T1 = ri[0]; /* stage 1: load inputs in pairs that are 4 apart (0/4, 2/6, 7/3, 1/5) and form their sums and differences */ | |
52 T2 = ri[WS(is, 4)]; | |
53 Tg = ii[0]; | |
54 Th = ii[WS(is, 4)]; | |
55 T4 = ri[WS(is, 2)]; | |
56 Tn = T1 - T2; /* ri[0] - ri[4] */ | |
57 T3 = T1 + T2; /* ri[0] + ri[4] */ | |
58 TC = Tg - Th; /* ii[0] - ii[4] */ | |
59 Ti = Tg + Th; /* ii[0] + ii[4] */ | |
60 T5 = ri[WS(is, 6)]; | |
61 } | |
62 Tj = ii[WS(is, 2)]; | |
63 Tk = ii[WS(is, 6)]; | |
64 { | |
65 E Tb, Tc, Tw, Tx; | |
66 Tb = ri[WS(is, 7)]; | |
67 TB = T4 - T5; /* ri[2] - ri[6] */ | |
68 T6 = T4 + T5; /* ri[2] + ri[6] */ | |
69 To = Tj - Tk; /* ii[2] - ii[6] */ | |
70 Tl = Tj + Tk; /* ii[2] + ii[6] */ | |
71 Tc = ri[WS(is, 3)]; | |
72 Tw = ii[WS(is, 7)]; | |
73 Tx = ii[WS(is, 3)]; | |
74 { | |
75 E T8, Tv, Ty, T9, Tr, Ts; | |
76 T8 = ri[WS(is, 1)]; | |
77 Td = Tb + Tc; /* ri[7] + ri[3] */ | |
78 Tv = Tb - Tc; /* ri[7] - ri[3] */ | |
79 TN = Tw + Tx; /* ii[7] + ii[3] */ | |
80 Ty = Tw - Tx; /* ii[7] - ii[3] */ | |
81 T9 = ri[WS(is, 5)]; | |
82 Tr = ii[WS(is, 1)]; | |
83 Ts = ii[WS(is, 5)]; | |
84 Tz = Tv - Ty; /* combines the real and imaginary 7/3 differences; later scaled by KP707106781 */ | |
85 TH = Tv + Ty; | |
86 Ta = T8 + T9; /* ri[1] + ri[5] */ | |
87 Tq = T8 - T9; /* ri[1] - ri[5] */ | |
88 Tt = Tr - Ts; /* ii[1] - ii[5] */ | |
89 TM = Tr + Ts; /* ii[1] + ii[5] */ | |
90 } | |
91 } | |
92 } | |
93 { | |
94 E TL, TG, Tu, Tf, Tm, TO; | |
95 { | |
96 E T7, Te, TP, TQ; | |
97 TL = T3 - T6; | |
98 T7 = T3 + T6; /* sum of the even-index ri inputs (0, 2, 4, 6) */ | |
99 TG = Tt - Tq; | |
100 Tu = Tq + Tt; | |
101 Te = Ta + Td; /* sum of the odd-index ri inputs (1, 3, 5, 7) */ | |
102 Tf = Td - Ta; | |
103 Tm = Ti - Tl; | |
104 TP = Ti + Tl; /* sum of the even-index ii inputs */ | |
105 TQ = TM + TN; /* sum of the odd-index ii inputs */ | |
106 TO = TM - TN; | |
107 ro[0] = T7 + Te; /* output 0: sum of all eight ri inputs */ | |
108 ro[WS(os, 4)] = T7 - Te; /* output 4: even-index sum minus odd-index sum */ | |
109 io[0] = TP + TQ; /* same pattern for the ii inputs */ | |
110 io[WS(os, 4)] = TP - TQ; | |
111 } | |
112 { | |
113 E Tp, TA, TJ, TK; | |
114 TF = Tn - To; | |
115 Tp = Tn + To; | |
116 io[WS(os, 6)] = Tm - Tf; /* outputs 2 and 6 use only additions and subtractions, no scaling */ | |
117 io[WS(os, 2)] = Tf + Tm; | |
118 ro[WS(os, 2)] = TL + TO; | |
119 ro[WS(os, 6)] = TL - TO; | |
120 TA = Tu + Tz; | |
121 TE = Tz - Tu; | |
122 TD = TB + TC; | |
123 TJ = TC - TB; | |
124 TK = TG + TH; | |
125 TI = TG - TH; | |
126 ro[WS(os, 1)] = FMA(KP707106781, TA, Tp); /* NOTE(review): FMA(a, b, c) presumably evaluates a*b + c; the non-FMA variant in this file computes Tp + KP707106781*(Tu + Tz) here - confirm against the macro definition */ | |
127 ro[WS(os, 5)] = FNMS(KP707106781, TA, Tp); /* NOTE(review): FNMS(a, b, c) presumably evaluates c - a*b; the non-FMA variant computes Tp - KP707106781*(Tu + Tz) here - confirm */ | |
128 io[WS(os, 1)] = FMA(KP707106781, TK, TJ); | |
129 io[WS(os, 5)] = FNMS(KP707106781, TK, TJ); | |
130 } | |
131 } | |
132 } | |
133 io[WS(os, 3)] = FMA(KP707106781, TE, TD); /* outputs 3 and 7 are stored last, from the four temporaries declared at loop scope */ | |
134 io[WS(os, 7)] = FNMS(KP707106781, TE, TD); | |
135 ro[WS(os, 3)] = FMA(KP707106781, TI, TF); | |
136 ro[WS(os, 7)] = FNMS(KP707106781, TI, TF); | |
137 } | |
138 } | |
139 } | |
140 | |
141 static const kdft_desc desc = { 8, "n1_8", {44, 0, 8, 0}, &GENUS, 0, 0, 0, 0 }; /* Descriptor passed to kdft_register together with the FMA kernel. The values 44, 0, 8 equal the additions / multiplications / fused multiply-adds quoted in this variant's operation-count comment. NOTE(review): the kdft_desc field layout is not visible here; the leading 8 is presumably the transform size - confirm. */ | |
142 | |
143 void X(codelet_n1_8) (planner *p) { /* Registration entry point (HAVE_FMA build): hands the n1_8 kernel and its descriptor to planner p. */ | |
144 X(kdft_register) (p, n1_8, &desc); /* X(...) is a project macro defined elsewhere. NOTE(review): presumably it decorates the symbol name - confirm. */ | |
145 } | |
146 | |
147 #else /* HAVE_FMA */ | |
148 | |
149 /* Generated by: ../../../genfft/gen_notw.native -compact -variables 4 -pipeline-latency 4 -n 8 -name n1_8 -include n.h */ | |
150 | |
151 /* | |
152 * This function contains 52 FP additions, 4 FP multiplications, | |
153 * (or, 52 additions, 4 multiplications, 0 fused multiply/add), | |
154 * 28 stack variables, 1 constants, and 32 memory accesses | |
155 */ | |
156 #include "n.h" | |
157 | |
158 static void n1_8(const R *ri, const R *ii, R *ro, R *io, stride is, stride os, INT v, INT ivs, INT ovs) /* Non-FMA variant of the 8-point kernel: each loop iteration reads ri[] and ii[] at indices WS(is, 0..7) and writes ro[] and io[] at indices WS(os, 0..7), using explicit multiplies by KP707106781. NOTE(review): ri/ii and ro/io are presumably the real/imaginary parts of the input and output - confirm. */ | |
159 { | |
160 DK(KP707106781, +0.707106781186547524400844362104849039284835938); /* names the constant 0.70710678..., numerically 1/sqrt(2); used in the four multiplications below */ | |
161 { | |
162 INT i; | |
163 for (i = v; i > 0; i = i - 1, ri = ri + ivs, ii = ii + ivs, ro = ro + ovs, io = io + ovs, MAKE_VOLATILE_STRIDE(32, is), MAKE_VOLATILE_STRIDE(32, os)) { /* v iterations; after each one the input pointers advance by ivs and the output pointers by ovs. MAKE_VOLATILE_STRIDE is a project macro not visible here. */ | |
164 E T3, Tn, Ti, TC, T6, TB, Tl, To, Td, TN, Tz, TH, Ta, TM, Tu; | |
165 E TG; | |
166 { | |
167 E T1, T2, Tj, Tk; | |
168 T1 = ri[0]; /* stage 1: load inputs in pairs that are 4 apart (0/4, 2/6, 7/3, 1/5) and form their sums and differences */ | |
169 T2 = ri[WS(is, 4)]; | |
170 T3 = T1 + T2; /* ri[0] + ri[4] */ | |
171 Tn = T1 - T2; /* ri[0] - ri[4] */ | |
172 { | |
173 E Tg, Th, T4, T5; | |
174 Tg = ii[0]; | |
175 Th = ii[WS(is, 4)]; | |
176 Ti = Tg + Th; /* ii[0] + ii[4] */ | |
177 TC = Tg - Th; /* ii[0] - ii[4] */ | |
178 T4 = ri[WS(is, 2)]; | |
179 T5 = ri[WS(is, 6)]; | |
180 T6 = T4 + T5; /* ri[2] + ri[6] */ | |
181 TB = T4 - T5; /* ri[2] - ri[6] */ | |
182 } | |
183 Tj = ii[WS(is, 2)]; | |
184 Tk = ii[WS(is, 6)]; | |
185 Tl = Tj + Tk; /* ii[2] + ii[6] */ | |
186 To = Tj - Tk; /* ii[2] - ii[6] */ | |
187 { | |
188 E Tb, Tc, Tv, Tw, Tx, Ty; | |
189 Tb = ri[WS(is, 7)]; | |
190 Tc = ri[WS(is, 3)]; | |
191 Tv = Tb - Tc; /* ri[7] - ri[3] */ | |
192 Tw = ii[WS(is, 7)]; | |
193 Tx = ii[WS(is, 3)]; | |
194 Ty = Tw - Tx; /* ii[7] - ii[3] */ | |
195 Td = Tb + Tc; /* ri[7] + ri[3] */ | |
196 TN = Tw + Tx; /* ii[7] + ii[3] */ | |
197 Tz = Tv - Ty; /* combines the real and imaginary 7/3 differences; later scaled by KP707106781 */ | |
198 TH = Tv + Ty; | |
199 } | |
200 { | |
201 E T8, T9, Tq, Tr, Ts, Tt; | |
202 T8 = ri[WS(is, 1)]; | |
203 T9 = ri[WS(is, 5)]; | |
204 Tq = T8 - T9; /* ri[1] - ri[5] */ | |
205 Tr = ii[WS(is, 1)]; | |
206 Ts = ii[WS(is, 5)]; | |
207 Tt = Tr - Ts; /* ii[1] - ii[5] */ | |
208 Ta = T8 + T9; /* ri[1] + ri[5] */ | |
209 TM = Tr + Ts; /* ii[1] + ii[5] */ | |
210 Tu = Tq + Tt; /* combines the real and imaginary 1/5 differences; later scaled by KP707106781 */ | |
211 TG = Tt - Tq; | |
212 } | |
213 } | |
214 { | |
215 E T7, Te, TP, TQ; | |
216 T7 = T3 + T6; /* sum of the even-index ri inputs (0, 2, 4, 6) */ | |
217 Te = Ta + Td; /* sum of the odd-index ri inputs (1, 3, 5, 7) */ | |
218 ro[WS(os, 4)] = T7 - Te; /* output 4: even-index sum minus odd-index sum */ | |
219 ro[0] = T7 + Te; /* output 0: sum of all eight ri inputs */ | |
220 TP = Ti + Tl; /* sum of the even-index ii inputs */ | |
221 TQ = TM + TN; /* sum of the odd-index ii inputs */ | |
222 io[WS(os, 4)] = TP - TQ; | |
223 io[0] = TP + TQ; | |
224 } | |
225 { | |
226 E Tf, Tm, TL, TO; | |
227 Tf = Td - Ta; /* outputs 2 and 6 use only additions and subtractions, no scaling */ | |
228 Tm = Ti - Tl; | |
229 io[WS(os, 2)] = Tf + Tm; | |
230 io[WS(os, 6)] = Tm - Tf; | |
231 TL = T3 - T6; | |
232 TO = TM - TN; | |
233 ro[WS(os, 6)] = TL - TO; | |
234 ro[WS(os, 2)] = TL + TO; | |
235 } | |
236 { | |
237 E Tp, TA, TJ, TK; | |
238 Tp = Tn + To; /* outputs 1 and 5: a base term plus or minus a term scaled by KP707106781 */ | |
239 TA = KP707106781 * (Tu + Tz); | |
240 ro[WS(os, 5)] = Tp - TA; | |
241 ro[WS(os, 1)] = Tp + TA; | |
242 TJ = TC - TB; | |
243 TK = KP707106781 * (TG + TH); | |
244 io[WS(os, 5)] = TJ - TK; | |
245 io[WS(os, 1)] = TJ + TK; | |
246 } | |
247 { | |
248 E TD, TE, TF, TI; | |
249 TD = TB + TC; /* outputs 3 and 7: same shape as outputs 1 and 5, built from the opposite-sign combinations */ | |
250 TE = KP707106781 * (Tz - Tu); | |
251 io[WS(os, 7)] = TD - TE; | |
252 io[WS(os, 3)] = TD + TE; | |
253 TF = Tn - To; | |
254 TI = KP707106781 * (TG - TH); | |
255 ro[WS(os, 7)] = TF - TI; | |
256 ro[WS(os, 3)] = TF + TI; | |
257 } | |
258 } | |
259 } | |
260 } | |
261 | |
262 static const kdft_desc desc = { 8, "n1_8", {52, 4, 0, 0}, &GENUS, 0, 0, 0, 0 }; /* Descriptor passed to kdft_register together with the non-FMA kernel. The values 52, 4, 0 equal the additions / multiplications / fused multiply-adds quoted in this variant's operation-count comment. NOTE(review): the kdft_desc field layout is not visible here; the leading 8 is presumably the transform size - confirm. */ | |
263 | |
264 void X(codelet_n1_8) (planner *p) { /* Registration entry point (build without HAVE_FMA): hands the n1_8 kernel and its descriptor to planner p. */ | |
265 X(kdft_register) (p, n1_8, &desc); /* X(...) is a project macro defined elsewhere. NOTE(review): presumably it decorates the symbol name - confirm. */ | |
266 } | |
267 | |
268 #endif /* HAVE_FMA */ |