Mercurial > hg > sv-dependency-builds
comparison src/fftw-3.3.8/rdft/simd/common/hc2cbdftv_16.c @ 167:bd3cc4d1df30
Add FFTW 3.3.8 source, and a Linux build
author | Chris Cannam <cannam@all-day-breakfast.com> |
---|---|
date | Tue, 19 Nov 2019 14:52:55 +0000 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
166:cbd6d7e562c7 | 167:bd3cc4d1df30 |
---|---|
1 /* | |
2 * Copyright (c) 2003, 2007-14 Matteo Frigo | |
3 * Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology | |
4 * | |
5 * This program is free software; you can redistribute it and/or modify | |
6 * it under the terms of the GNU General Public License as published by | |
7 * the Free Software Foundation; either version 2 of the License, or | |
8 * (at your option) any later version. | |
9 * | |
10 * This program is distributed in the hope that it will be useful, | |
11 * but WITHOUT ANY WARRANTY; without even the implied warranty of | |
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | |
13 * GNU General Public License for more details. | |
14 * | |
15 * You should have received a copy of the GNU General Public License | |
16 * along with this program; if not, write to the Free Software | |
17 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA | |
18 * | |
19 */ | |
20 | |
21 /* This file was automatically generated --- DO NOT EDIT */ | |
22 /* Generated on Thu May 24 08:08:12 EDT 2018 */ | |
23 | |
24 #include "rdft/codelet-rdft.h" | |
25 | |
26 #if defined(ARCH_PREFERS_FMA) || defined(ISA_EXTENSION_PREFERS_FMA) | |
27 | |
28 /* Generated by: ../../../genfft/gen_hc2cdft_c.native -fma -simd -compact -variables 4 -pipeline-latency 8 -trivial-stores -variables 32 -no-generate-bytw -n 16 -dif -sign 1 -name hc2cbdftv_16 -include rdft/simd/hc2cbv.h */ | |
29 | |
30 /* | |
31 * This function contains 103 FP additions, 80 FP multiplications, | |
32 * (or, 53 additions, 30 multiplications, 50 fused multiply/add), | |
33 * 79 stack variables, 3 constants, and 32 memory accesses | |
34 */ | |
35 #include "rdft/simd/hc2cbv.h" | |
36 | |
/*
 * hc2cbdftv_16 (FMA variant): genfft-generated SIMD kernel, built when
 * ARCH_PREFERS_FMA or ISA_EXTENSION_PREFERS_FMA is defined.  The generator
 * command line above records -n 16 -dif -sign 1.
 *
 * Parameters, as used by the code below:
 *   Rp, Rm  - arrays that are both read and overwritten.  Each iteration
 *             touches Rp[WS(rs, k)] and Rm[WS(rs, k)] for k = 0..7.  Rp is
 *             advanced by VL * ms per iteration, Rm is moved back by VL * ms.
 *   Ip, Im  - only stepped in the loop header (forward / backward by
 *             VL * ms); never dereferenced in this body.
 *   W       - twiddle table.  Offset by (mb - 1) * ((TWVL / VL) * 30) before
 *             the first iteration and advanced by TWVL * 30 afterwards;
 *             fifteen entries are read with LDW at W[TWVL * k],
 *             k = 0, 2, ..., 28.
 *   rs      - stride handed to WS() to index within Rp / Rm.
 *   mb, me  - the loop runs m = mb; m < me; m += VL.
 *   ms      - per-iteration stride; also passed (as ms / -ms) to LD and ST.
 *
 * NOTE(review): LD, ST, LDW, LDK, DVK, VZMUL, VZMULI, VFMACONJ, VFNMSCONJ,
 * VFMSCONJ, VFMAI, VFNMSI, VCONJ and VLEAVE are macros defined outside this
 * file (presumably reachable through rdft/simd/hc2cbv.h); their exact
 * semantics are not visible here - confirm there before changing anything.
 */
37 static void hc2cbdftv_16(R *Rp, R *Ip, R *Rm, R *Im, const R *W, stride rs, INT mb, INT me, INT ms) | |
38 { | |
/* Constants; numerically cos(pi/8), tan(pi/8) = sqrt(2) - 1, and sqrt(2)/2. */
39 DVK(KP923879532, +0.923879532511286756128183189396788286822416626); | |
40 DVK(KP414213562, +0.414213562373095048801688724209698078569671875); | |
41 DVK(KP707106781, +0.707106781186547524400844362104849039284835938); | |
42 { | |
43 INT m; | |
/* Main loop: one pass per VL values of m; all pointer stepping lives in the for-header. */
44 for (m = mb, W = W + ((mb - 1) * ((TWVL / VL) * 30)); m < me; m = m + VL, Rp = Rp + (VL * ms), Ip = Ip + (VL * ms), Rm = Rm - (VL * ms), Im = Im - (VL * ms), W = W + (TWVL * 30), MAKE_VOLATILE_STRIDE(64, rs)) { | |
/* Values computed in the first inner block and consumed by the second. */
45 V T8, Tv, TE, T1t, TP, T1w, T10, T1p, Tn, Tw, T13, T1q, TL, T1x, TS; | |
46 V T1u; | |
47 { | |
48 V T4, TA, Tu, TC, T7, TN, Tr, TB, T2, T3, Ts, Tt, T5, T6, Tp; | |
49 V Tq, TD, TO, TY, TZ, Tb, TF, Tl, TJ, Te, TG, Ti, TI, T9, Ta; | |
50 V Tj, Tk, Tc, Td, Tg, Th, Tf, Tm, T11, T12, TH, TK, TQ, TR; | |
/*
 * Input stage: every load from Rp[WS(rs, k)] is paired with a load from
 * Rm[WS(rs, 7 - k)], and each pair is combined through the *CONJ macros
 * into two values (VFMACONJ plus VFNMSCONJ or VFMSCONJ).
 * Even k (0, 6, 4, 2) first, then odd k (1, 3, 5, 7).
 */
51 T2 = LD(&(Rp[0]), ms, &(Rp[0])); | |
52 T3 = LD(&(Rm[WS(rs, 7)]), -ms, &(Rm[WS(rs, 1)])); | |
53 T4 = VFMACONJ(T3, T2); | |
54 TA = VFNMSCONJ(T3, T2); | |
55 Ts = LD(&(Rp[WS(rs, 6)]), ms, &(Rp[0])); | |
56 Tt = LD(&(Rm[WS(rs, 1)]), -ms, &(Rm[WS(rs, 1)])); | |
57 Tu = VFMACONJ(Tt, Ts); | |
58 TC = VFMSCONJ(Tt, Ts); | |
59 T5 = LD(&(Rp[WS(rs, 4)]), ms, &(Rp[0])); | |
60 T6 = LD(&(Rm[WS(rs, 3)]), -ms, &(Rm[WS(rs, 1)])); | |
61 T7 = VFMACONJ(T6, T5); | |
62 TN = VFNMSCONJ(T6, T5); | |
63 Tp = LD(&(Rp[WS(rs, 2)]), ms, &(Rp[0])); | |
64 Tq = LD(&(Rm[WS(rs, 5)]), -ms, &(Rm[WS(rs, 1)])); | |
65 Tr = VFMACONJ(Tq, Tp); | |
66 TB = VFNMSCONJ(Tq, Tp); | |
67 T8 = VSUB(T4, T7); | |
68 Tv = VSUB(Tr, Tu); | |
69 TD = VADD(TB, TC); | |
70 TE = VFMA(LDK(KP707106781), TD, TA); | |
71 T1t = VFNMS(LDK(KP707106781), TD, TA); | |
72 TO = VSUB(TB, TC); | |
73 TP = VFMA(LDK(KP707106781), TO, TN); | |
74 T1w = VFNMS(LDK(KP707106781), TO, TN); | |
75 TY = VADD(T4, T7); | |
76 TZ = VADD(Tr, Tu); | |
77 T10 = VADD(TY, TZ); | |
78 T1p = VSUB(TY, TZ); | |
79 T9 = LD(&(Rp[WS(rs, 1)]), ms, &(Rp[WS(rs, 1)])); | |
80 Ta = LD(&(Rm[WS(rs, 6)]), -ms, &(Rm[0])); | |
81 Tb = VFMACONJ(Ta, T9); | |
82 TF = VFNMSCONJ(Ta, T9); | |
83 Tj = LD(&(Rp[WS(rs, 3)]), ms, &(Rp[WS(rs, 1)])); | |
84 Tk = LD(&(Rm[WS(rs, 4)]), -ms, &(Rm[0])); | |
85 Tl = VFMACONJ(Tk, Tj); | |
86 TJ = VFNMSCONJ(Tk, Tj); | |
87 Tc = LD(&(Rp[WS(rs, 5)]), ms, &(Rp[WS(rs, 1)])); | |
88 Td = LD(&(Rm[WS(rs, 2)]), -ms, &(Rm[0])); | |
89 Te = VFMACONJ(Td, Tc); | |
90 TG = VFNMSCONJ(Td, Tc); | |
91 Tg = LD(&(Rp[WS(rs, 7)]), ms, &(Rp[WS(rs, 1)])); | |
92 Th = LD(&(Rm[0]), -ms, &(Rm[0])); | |
93 Ti = VFMACONJ(Th, Tg); | |
94 TI = VFMSCONJ(Th, Tg); | |
95 Tf = VSUB(Tb, Te); | |
96 Tm = VSUB(Ti, Tl); | |
97 Tn = VADD(Tf, Tm); | |
98 Tw = VSUB(Tf, Tm); | |
99 T11 = VADD(Tb, Te); | |
100 T12 = VADD(Ti, Tl); | |
101 T13 = VADD(T11, T12); | |
102 T1q = VSUB(T11, T12); | |
103 TH = VFNMS(LDK(KP414213562), TG, TF); | |
104 TK = VFMA(LDK(KP414213562), TJ, TI); | |
105 TL = VADD(TH, TK); | |
106 T1x = VSUB(TH, TK); | |
107 TQ = VFMA(LDK(KP414213562), TF, TG); | |
108 TR = VFNMS(LDK(KP414213562), TI, TJ); | |
109 TS = VADD(TQ, TR); | |
110 T1u = VSUB(TQ, TR); | |
111 } | |
112 { | |
113 V T1j, T1R, T1c, T1J, T1g, T1l, T1N, T1T, T1Q, T1a, T1b, T19, T1I, T1e, T1f; | |
114 V T1d, T1k, T1L, T1M, T1K, T1S, T1h, T1U, T1V, T1i, T1m, T1O, T1P, T1n, T14; | |
115 V T1r, Ty, T1D, TU, T16, T1z, T1F, TX, T1o, To, Tx, T1, T1C, TM, TT; | |
116 V Tz, T15, T1v, T1y, T1s, T1E, TV, T1G, T1H, TW, T17, T1A, T1B, T18; | |
/*
 * Output stage: intermediate terms are multiplied by twiddles loaded with
 * LDW (VZMUL / VZMULI), then written back in place.  For each index k the
 * code stores a sum a + b to Rp[WS(rs, k)] and VCONJ(a - b) to
 * Rm[WS(rs, k)].  T1j (index 0) is the only term used without a twiddle.
 * The first half below produces indices 7, 6, 0, 1; the second half
 * produces 3, 5, 4, 2.
 */
117 T1j = VADD(T10, T13); | |
118 T1Q = LDW(&(W[TWVL * 22])); | |
119 T1R = VZMUL(T1Q, VFNMSI(T1q, T1p)); | |
120 T1a = VFMA(LDK(KP707106781), Tn, T8); | |
121 T1b = VFMA(LDK(KP707106781), Tw, Tv); | |
122 T19 = LDW(&(W[TWVL * 26])); | |
123 T1c = VZMUL(T19, VFNMSI(T1b, T1a)); | |
124 T1I = LDW(&(W[TWVL * 2])); | |
125 T1J = VZMUL(T1I, VFMAI(T1b, T1a)); | |
126 T1e = VFMA(LDK(KP923879532), TL, TE); | |
127 T1f = VFMA(LDK(KP923879532), TS, TP); | |
128 T1d = LDW(&(W[TWVL * 28])); | |
129 T1g = VZMULI(T1d, VFNMSI(T1f, T1e)); | |
130 T1k = LDW(&(W[0])); | |
131 T1l = VZMULI(T1k, VFMAI(T1f, T1e)); | |
132 T1L = VFMA(LDK(KP923879532), T1u, T1t); | |
133 T1M = VFNMS(LDK(KP923879532), T1x, T1w); | |
134 T1K = LDW(&(W[TWVL * 4])); | |
135 T1N = VZMULI(T1K, VFNMSI(T1M, T1L)); | |
136 T1S = LDW(&(W[TWVL * 24])); | |
137 T1T = VZMULI(T1S, VFMAI(T1M, T1L)); | |
138 T1h = VCONJ(VSUB(T1c, T1g)); | |
139 ST(&(Rm[WS(rs, 7)]), T1h, -ms, &(Rm[WS(rs, 1)])); | |
140 T1U = VCONJ(VSUB(T1R, T1T)); | |
141 ST(&(Rm[WS(rs, 6)]), T1U, -ms, &(Rm[0])); | |
142 T1V = VADD(T1R, T1T); | |
143 ST(&(Rp[WS(rs, 6)]), T1V, ms, &(Rp[0])); | |
144 T1i = VADD(T1c, T1g); | |
145 ST(&(Rp[WS(rs, 7)]), T1i, ms, &(Rp[WS(rs, 1)])); | |
146 T1m = VCONJ(VSUB(T1j, T1l)); | |
147 ST(&(Rm[0]), T1m, -ms, &(Rm[0])); | |
148 T1O = VCONJ(VSUB(T1J, T1N)); | |
149 ST(&(Rm[WS(rs, 1)]), T1O, -ms, &(Rm[WS(rs, 1)])); | |
150 T1P = VADD(T1J, T1N); | |
151 ST(&(Rp[WS(rs, 1)]), T1P, ms, &(Rp[WS(rs, 1)])); | |
152 T1n = VADD(T1j, T1l); | |
153 ST(&(Rp[0]), T1n, ms, &(Rp[0])); | |
154 TX = LDW(&(W[TWVL * 14])); | |
155 T14 = VZMUL(TX, VSUB(T10, T13)); | |
156 T1o = LDW(&(W[TWVL * 6])); | |
157 T1r = VZMUL(T1o, VFMAI(T1q, T1p)); | |
158 To = VFNMS(LDK(KP707106781), Tn, T8); | |
159 Tx = VFNMS(LDK(KP707106781), Tw, Tv); | |
160 T1 = LDW(&(W[TWVL * 10])); | |
161 Ty = VZMUL(T1, VFNMSI(Tx, To)); | |
162 T1C = LDW(&(W[TWVL * 18])); | |
163 T1D = VZMUL(T1C, VFMAI(Tx, To)); | |
164 TM = VFNMS(LDK(KP923879532), TL, TE); | |
165 TT = VFNMS(LDK(KP923879532), TS, TP); | |
166 Tz = LDW(&(W[TWVL * 12])); | |
167 TU = VZMULI(Tz, VFNMSI(TT, TM)); | |
168 T15 = LDW(&(W[TWVL * 16])); | |
169 T16 = VZMULI(T15, VFMAI(TT, TM)); | |
170 T1v = VFNMS(LDK(KP923879532), T1u, T1t); | |
171 T1y = VFMA(LDK(KP923879532), T1x, T1w); | |
172 T1s = LDW(&(W[TWVL * 8])); | |
173 T1z = VZMULI(T1s, VFMAI(T1y, T1v)); | |
174 T1E = LDW(&(W[TWVL * 20])); | |
175 T1F = VZMULI(T1E, VFNMSI(T1y, T1v)); | |
176 TV = VCONJ(VSUB(Ty, TU)); | |
177 ST(&(Rm[WS(rs, 3)]), TV, -ms, &(Rm[WS(rs, 1)])); | |
178 T1G = VCONJ(VSUB(T1D, T1F)); | |
179 ST(&(Rm[WS(rs, 5)]), T1G, -ms, &(Rm[WS(rs, 1)])); | |
180 T1H = VADD(T1D, T1F); | |
181 ST(&(Rp[WS(rs, 5)]), T1H, ms, &(Rp[WS(rs, 1)])); | |
182 TW = VADD(Ty, TU); | |
183 ST(&(Rp[WS(rs, 3)]), TW, ms, &(Rp[WS(rs, 1)])); | |
184 T17 = VCONJ(VSUB(T14, T16)); | |
185 ST(&(Rm[WS(rs, 4)]), T17, -ms, &(Rm[0])); | |
186 T1A = VCONJ(VSUB(T1r, T1z)); | |
187 ST(&(Rm[WS(rs, 2)]), T1A, -ms, &(Rm[0])); | |
188 T1B = VADD(T1r, T1z); | |
189 ST(&(Rp[WS(rs, 2)]), T1B, ms, &(Rp[0])); | |
190 T18 = VADD(T14, T16); | |
191 ST(&(Rp[WS(rs, 4)]), T18, ms, &(Rp[0])); | |
192 } | |
193 } | |
194 } | |
/* Invoked once after the loop; VLEAVE is defined outside this file. */
195 VLEAVE(); | |
196 } | |
197 | |
/*
 * Twiddle instruction list for the FMA build of hc2cbdftv_16: fifteen
 * VTW(1, k) entries for k = 1..15, closed by a {TW_NEXT, VL, 0} entry.
 * NOTE(review): the count of fifteen matches the fifteen LDW loads in the
 * kernel; the meaning of VTW's arguments and of TW_NEXT is defined outside
 * this file - confirm there.
 */
198 static const tw_instr twinstr[] = { | |
199 VTW(1, 1), | |
200 VTW(1, 2), | |
201 VTW(1, 3), | |
202 VTW(1, 4), | |
203 VTW(1, 5), | |
204 VTW(1, 6), | |
205 VTW(1, 7), | |
206 VTW(1, 8), | |
207 VTW(1, 9), | |
208 VTW(1, 10), | |
209 VTW(1, 11), | |
210 VTW(1, 12), | |
211 VTW(1, 13), | |
212 VTW(1, 14), | |
213 VTW(1, 15), | |
214 {TW_NEXT, VL, 0} | |
215 }; | |
216 | |
/*
 * Descriptor for the FMA build: 16 (the -n 16 of the generator command),
 * the codelet name string, the twiddle list, &GENUS, and the counts
 * {53, 30, 50, 0}, which repeat the "53 additions, 30 multiplications,
 * 50 fused multiply/add" figures from the generated comment above.
 * NOTE(review): field names of hc2c_desc are declared elsewhere - confirm
 * what the fourth count stands for.
 */
217 static const hc2c_desc desc = { 16, XSIMD_STRING("hc2cbdftv_16"), twinstr, &GENUS, {53, 30, 50, 0} }; | |
218 | |
/*
 * Registration entry point (FMA build): hands the kernel function, its
 * descriptor and the HC2C_VIA_DFT tag to X(khc2c_register) for planner p.
 * XSIMD() and X() are name-decorating macros defined outside this file.
 */
219 void XSIMD(codelet_hc2cbdftv_16) (planner *p) { | |
220 X(khc2c_register) (p, hc2cbdftv_16, &desc, HC2C_VIA_DFT); | |
221 } | |
222 #else | |
223 | |
224 /* Generated by: ../../../genfft/gen_hc2cdft_c.native -simd -compact -variables 4 -pipeline-latency 8 -trivial-stores -variables 32 -no-generate-bytw -n 16 -dif -sign 1 -name hc2cbdftv_16 -include rdft/simd/hc2cbv.h */ | |
225 | |
226 /* | |
227 * This function contains 103 FP additions, 42 FP multiplications, | |
228 * (or, 99 additions, 38 multiplications, 4 fused multiply/add), | |
229 * 83 stack variables, 3 constants, and 32 memory accesses | |
230 */ | |
231 #include "rdft/simd/hc2cbv.h" | |
232 | |
/*
 * hc2cbdftv_16 (non-FMA variant): genfft-generated SIMD kernel, built when
 * neither ARCH_PREFERS_FMA nor ISA_EXTENSION_PREFERS_FMA is defined (the
 * #else branch).  Same generator options as the FMA variant minus -fma.
 *
 * Parameters, as used by the code below:
 *   Rp, Rm  - arrays that are both read and overwritten.  Each iteration
 *             touches Rp[WS(rs, k)] and Rm[WS(rs, k)] for k = 0..7.  Rp is
 *             advanced by VL * ms per iteration, Rm is moved back by VL * ms.
 *   Ip, Im  - only stepped in the loop header (forward / backward by
 *             VL * ms); never dereferenced in this body.
 *   W       - twiddle table.  Offset by (mb - 1) * ((TWVL / VL) * 30) before
 *             the first iteration and advanced by TWVL * 30 afterwards;
 *             fifteen entries are read with LDW at W[TWVL * k],
 *             k = 0, 2, ..., 28.
 *   rs      - stride handed to WS() to index within Rp / Rm.
 *   mb, me  - the loop runs m = mb; m < me; m += VL.
 *   ms      - per-iteration stride; also passed (as ms / -ms) to LD and ST.
 *
 * NOTE(review): LD, ST, LDW, LDK, DVK, VZMUL, VZMULI, VBYI, VCONJ, VFMA,
 * VFNMS and VLEAVE are macros defined outside this file (presumably
 * reachable through rdft/simd/hc2cbv.h); their exact semantics are not
 * visible here - confirm there before changing anything.
 */
233 static void hc2cbdftv_16(R *Rp, R *Ip, R *Rm, R *Im, const R *W, stride rs, INT mb, INT me, INT ms) | |
234 { | |
/* Constants; numerically sin(pi/8), cos(pi/8), and sqrt(2)/2. */
235 DVK(KP382683432, +0.382683432365089771728459984030398866761344562); | |
236 DVK(KP923879532, +0.923879532511286756128183189396788286822416626); | |
237 DVK(KP707106781, +0.707106781186547524400844362104849039284835938); | |
238 { | |
239 INT m; | |
/* Main loop: one pass per VL values of m; all pointer stepping lives in the for-header. */
240 for (m = mb, W = W + ((mb - 1) * ((TWVL / VL) * 30)); m < me; m = m + VL, Rp = Rp + (VL * ms), Ip = Ip + (VL * ms), Rm = Rm - (VL * ms), Im = Im - (VL * ms), W = W + (TWVL * 30), MAKE_VOLATILE_STRIDE(64, rs)) { | |
/* Values computed in the first inner block and consumed by the second. */
241 V Tf, T16, TZ, T1C, TI, T1a, TV, T1D, T1F, T1G, Ty, T19, TC, T17, TS; | |
242 V T10; | |
243 { | |
244 V T2, TD, T4, TF, Tc, Tb, Td, T6, T8, T9, T3, TE, Ta, T7, T5; | |
245 V Te, TX, TY, TG, TH, TT, TU, Tj, TM, Tw, TQ, Tn, TN, Ts, TP; | |
246 V Tg, Ti, Th, Tt, Tv, Tu, Tk, Tm, Tl, Tr, Tq, Tp, To, Tx, TA; | |
247 V TB, TO, TR; | |
/*
 * Input stage: every load from Rp[WS(rs, k)] is paired with a load from
 * Rm[WS(rs, 7 - k)]; the Rm operand goes through VCONJ before the pair is
 * added / subtracted.  Even k (0, 4, 6, 2) first, then odd k (1, 3, 5, 7).
 */
248 T2 = LD(&(Rp[0]), ms, &(Rp[0])); | |
249 TD = LD(&(Rp[WS(rs, 4)]), ms, &(Rp[0])); | |
250 T3 = LD(&(Rm[WS(rs, 7)]), -ms, &(Rm[WS(rs, 1)])); | |
251 T4 = VCONJ(T3); | |
252 TE = LD(&(Rm[WS(rs, 3)]), -ms, &(Rm[WS(rs, 1)])); | |
253 TF = VCONJ(TE); | |
254 Tc = LD(&(Rp[WS(rs, 6)]), ms, &(Rp[0])); | |
255 Ta = LD(&(Rm[WS(rs, 1)]), -ms, &(Rm[WS(rs, 1)])); | |
256 Tb = VCONJ(Ta); | |
257 Td = VSUB(Tb, Tc); | |
258 T6 = LD(&(Rp[WS(rs, 2)]), ms, &(Rp[0])); | |
259 T7 = LD(&(Rm[WS(rs, 5)]), -ms, &(Rm[WS(rs, 1)])); | |
260 T8 = VCONJ(T7); | |
261 T9 = VSUB(T6, T8); | |
262 T5 = VSUB(T2, T4); | |
263 Te = VMUL(LDK(KP707106781), VADD(T9, Td)); | |
264 Tf = VADD(T5, Te); | |
265 T16 = VSUB(T5, Te); | |
266 TX = VADD(T2, T4); | |
267 TY = VADD(TD, TF); | |
268 TZ = VSUB(TX, TY); | |
269 T1C = VADD(TX, TY); | |
270 TG = VSUB(TD, TF); | |
271 TH = VMUL(LDK(KP707106781), VSUB(T9, Td)); | |
272 TI = VADD(TG, TH); | |
273 T1a = VSUB(TH, TG); | |
274 TT = VADD(T6, T8); | |
275 TU = VADD(Tb, Tc); | |
276 TV = VSUB(TT, TU); | |
277 T1D = VADD(TT, TU); | |
278 Tg = LD(&(Rp[WS(rs, 1)]), ms, &(Rp[WS(rs, 1)])); | |
279 Th = LD(&(Rm[WS(rs, 6)]), -ms, &(Rm[0])); | |
280 Ti = VCONJ(Th); | |
281 Tj = VSUB(Tg, Ti); | |
282 TM = VADD(Tg, Ti); | |
283 Tt = LD(&(Rp[WS(rs, 3)]), ms, &(Rp[WS(rs, 1)])); | |
284 Tu = LD(&(Rm[WS(rs, 4)]), -ms, &(Rm[0])); | |
285 Tv = VCONJ(Tu); | |
286 Tw = VSUB(Tt, Tv); | |
287 TQ = VADD(Tt, Tv); | |
288 Tk = LD(&(Rp[WS(rs, 5)]), ms, &(Rp[WS(rs, 1)])); | |
289 Tl = LD(&(Rm[WS(rs, 2)]), -ms, &(Rm[0])); | |
290 Tm = VCONJ(Tl); | |
291 Tn = VSUB(Tk, Tm); | |
292 TN = VADD(Tk, Tm); | |
293 Tr = LD(&(Rp[WS(rs, 7)]), ms, &(Rp[WS(rs, 1)])); | |
294 Tp = LD(&(Rm[0]), -ms, &(Rm[0])); | |
295 Tq = VCONJ(Tp); | |
296 Ts = VSUB(Tq, Tr); | |
297 TP = VADD(Tq, Tr); | |
298 T1F = VADD(TM, TN); | |
299 T1G = VADD(TP, TQ); | |
300 To = VFNMS(LDK(KP382683432), Tn, VMUL(LDK(KP923879532), Tj)); | |
301 Tx = VFMA(LDK(KP923879532), Ts, VMUL(LDK(KP382683432), Tw)); | |
302 Ty = VADD(To, Tx); | |
303 T19 = VSUB(To, Tx); | |
304 TA = VFMA(LDK(KP382683432), Tj, VMUL(LDK(KP923879532), Tn)); | |
305 TB = VFNMS(LDK(KP382683432), Ts, VMUL(LDK(KP923879532), Tw)); | |
306 TC = VADD(TA, TB); | |
307 T17 = VSUB(TA, TB); | |
308 TO = VSUB(TM, TN); | |
309 TR = VSUB(TP, TQ); | |
310 TS = VMUL(LDK(KP707106781), VSUB(TO, TR)); | |
311 T10 = VMUL(LDK(KP707106781), VADD(TO, TR)); | |
312 } | |
313 { | |
314 V T21, T1W, T1u, T20, T1I, T1O, TK, T1S, T12, T1e, T1k, T1A, T1o, T1w, T1c; | |
315 V T1M, T1U, T1V, T1T, T1s, T1t, T1r, T1Z, T1E, T1H, T1B, T1N, Tz, TJ, T1; | |
316 V T1R, TW, T11, TL, T1d, T1i, T1j, T1h, T1z, T1m, T1n, T1l, T1v, T1w_unused_note_absent, T18, T1b; | |
317 V T15, T1L, T13, T1g, T1X, T23, T14, T1f, T1Y, T22, T1p, T1y, T1J, T1Q, T1q; | |
318 V T1x, T1K, T1P; | |
/*
 * Output stage: all fifteen twiddled products are formed first (LDW plus
 * VZMUL / VZMULI; T21 is the only term used without a twiddle), then the
 * sixteen stores follow.  For each index k the code stores a sum a + b to
 * Rp[WS(rs, k)] and VCONJ of the difference of the same two terms to
 * Rm[WS(rs, k)].
 */
319 T1U = VADD(T1C, T1D); | |
320 T1V = VADD(T1F, T1G); | |
321 T21 = VADD(T1U, T1V); | |
322 T1T = LDW(&(W[TWVL * 14])); | |
323 T1W = VZMUL(T1T, VSUB(T1U, T1V)); | |
324 T1s = VADD(Tf, Ty); | |
325 T1t = VBYI(VADD(TI, TC)); | |
326 T1r = LDW(&(W[TWVL * 28])); | |
327 T1u = VZMULI(T1r, VSUB(T1s, T1t)); | |
328 T1Z = LDW(&(W[0])); | |
329 T20 = VZMULI(T1Z, VADD(T1s, T1t)); | |
330 T1E = VSUB(T1C, T1D); | |
331 T1H = VBYI(VSUB(T1F, T1G)); | |
332 T1B = LDW(&(W[TWVL * 22])); | |
333 T1I = VZMUL(T1B, VSUB(T1E, T1H)); | |
334 T1N = LDW(&(W[TWVL * 6])); | |
335 T1O = VZMUL(T1N, VADD(T1E, T1H)); | |
336 Tz = VSUB(Tf, Ty); | |
337 TJ = VBYI(VSUB(TC, TI)); | |
338 T1 = LDW(&(W[TWVL * 12])); | |
339 TK = VZMULI(T1, VADD(Tz, TJ)); | |
340 T1R = LDW(&(W[TWVL * 16])); | |
341 T1S = VZMULI(T1R, VSUB(Tz, TJ)); | |
342 TW = VBYI(VSUB(TS, TV)); | |
343 T11 = VSUB(TZ, T10); | |
344 TL = LDW(&(W[TWVL * 10])); | |
345 T12 = VZMUL(TL, VADD(TW, T11)); | |
346 T1d = LDW(&(W[TWVL * 18])); | |
347 T1e = VZMUL(T1d, VSUB(T11, TW)); | |
348 T1i = VBYI(VADD(T1a, T19)); | |
349 T1j = VADD(T16, T17); | |
350 T1h = LDW(&(W[TWVL * 4])); | |
351 T1k = VZMULI(T1h, VADD(T1i, T1j)); | |
352 T1z = LDW(&(W[TWVL * 24])); | |
353 T1A = VZMULI(T1z, VSUB(T1j, T1i)); | |
354 T1m = VBYI(VADD(TV, TS)); | |
355 T1n = VADD(TZ, T10); | |
356 T1l = LDW(&(W[TWVL * 2])); | |
357 T1o = VZMUL(T1l, VADD(T1m, T1n)); | |
358 T1v = LDW(&(W[TWVL * 26])); | |
359 T1w = VZMUL(T1v, VSUB(T1n, T1m)); | |
360 T18 = VSUB(T16, T17); | |
361 T1b = VBYI(VSUB(T19, T1a)); | |
362 T15 = LDW(&(W[TWVL * 20])); | |
363 T1c = VZMULI(T15, VSUB(T18, T1b)); | |
364 T1L = LDW(&(W[TWVL * 8])); | |
365 T1M = VZMULI(T1L, VADD(T1b, T18)); | |
366 T13 = VADD(TK, T12); | |
367 ST(&(Rp[WS(rs, 3)]), T13, ms, &(Rp[WS(rs, 1)])); | |
368 T1g = VCONJ(VSUB(T1e, T1c)); | |
369 ST(&(Rm[WS(rs, 5)]), T1g, -ms, &(Rm[WS(rs, 1)])); | |
370 T1X = VADD(T1S, T1W); | |
371 ST(&(Rp[WS(rs, 4)]), T1X, ms, &(Rp[0])); | |
372 T23 = VCONJ(VSUB(T21, T20)); | |
373 ST(&(Rm[0]), T23, -ms, &(Rm[0])); | |
374 T14 = VCONJ(VSUB(T12, TK)); | |
375 ST(&(Rm[WS(rs, 3)]), T14, -ms, &(Rm[WS(rs, 1)])); | |
376 T1f = VADD(T1c, T1e); | |
377 ST(&(Rp[WS(rs, 5)]), T1f, ms, &(Rp[WS(rs, 1)])); | |
378 T1Y = VCONJ(VSUB(T1W, T1S)); | |
379 ST(&(Rm[WS(rs, 4)]), T1Y, -ms, &(Rm[0])); | |
380 T22 = VADD(T20, T21); | |
381 ST(&(Rp[0]), T22, ms, &(Rp[0])); | |
382 T1p = VADD(T1k, T1o); | |
383 ST(&(Rp[WS(rs, 1)]), T1p, ms, &(Rp[WS(rs, 1)])); | |
384 T1y = VCONJ(VSUB(T1w, T1u)); | |
385 ST(&(Rm[WS(rs, 7)]), T1y, -ms, &(Rm[WS(rs, 1)])); | |
386 T1J = VADD(T1A, T1I); | |
387 ST(&(Rp[WS(rs, 6)]), T1J, ms, &(Rp[0])); | |
388 T1Q = VCONJ(VSUB(T1O, T1M)); | |
389 ST(&(Rm[WS(rs, 2)]), T1Q, -ms, &(Rm[0])); | |
390 T1q = VCONJ(VSUB(T1o, T1k)); | |
391 ST(&(Rm[WS(rs, 1)]), T1q, -ms, &(Rm[WS(rs, 1)])); | |
392 T1x = VADD(T1u, T1w); | |
393 ST(&(Rp[WS(rs, 7)]), T1x, ms, &(Rp[WS(rs, 1)])); | |
394 T1K = VCONJ(VSUB(T1I, T1A)); | |
395 ST(&(Rm[WS(rs, 6)]), T1K, -ms, &(Rm[0])); | |
396 T1P = VADD(T1M, T1O); | |
397 ST(&(Rp[WS(rs, 2)]), T1P, ms, &(Rp[0])); | |
398 } | |
399 } | |
400 } | |
/* Invoked once after the loop; VLEAVE is defined outside this file. */
401 VLEAVE(); | |
402 } | |
403 | |
/*
 * Twiddle instruction list for the non-FMA build of hc2cbdftv_16: fifteen
 * VTW(1, k) entries for k = 1..15, closed by a {TW_NEXT, VL, 0} entry.
 * NOTE(review): the count of fifteen matches the fifteen LDW loads in the
 * kernel; the meaning of VTW's arguments and of TW_NEXT is defined outside
 * this file - confirm there.
 */
404 static const tw_instr twinstr[] = { | |
405 VTW(1, 1), | |
406 VTW(1, 2), | |
407 VTW(1, 3), | |
408 VTW(1, 4), | |
409 VTW(1, 5), | |
410 VTW(1, 6), | |
411 VTW(1, 7), | |
412 VTW(1, 8), | |
413 VTW(1, 9), | |
414 VTW(1, 10), | |
415 VTW(1, 11), | |
416 VTW(1, 12), | |
417 VTW(1, 13), | |
418 VTW(1, 14), | |
419 VTW(1, 15), | |
420 {TW_NEXT, VL, 0} | |
421 }; | |
422 | |
/*
 * Descriptor for the non-FMA build: 16 (the -n 16 of the generator
 * command), the codelet name string, the twiddle list, &GENUS, and the
 * counts {99, 38, 4, 0}, which repeat the "99 additions, 38
 * multiplications, 4 fused multiply/add" figures from the generated
 * comment above.
 * NOTE(review): field names of hc2c_desc are declared elsewhere - confirm
 * what the fourth count stands for.
 */
423 static const hc2c_desc desc = { 16, XSIMD_STRING("hc2cbdftv_16"), twinstr, &GENUS, {99, 38, 4, 0} }; | |
424 | |
/*
 * Registration entry point (non-FMA build): hands the kernel function, its
 * descriptor and the HC2C_VIA_DFT tag to X(khc2c_register) for planner p.
 * XSIMD() and X() are name-decorating macros defined outside this file.
 */
425 void XSIMD(codelet_hc2cbdftv_16) (planner *p) { | |
426 X(khc2c_register) (p, hc2cbdftv_16, &desc, HC2C_VIA_DFT); | |
427 } | |
428 #endif |