comparison src/fftw-3.3.8/rdft/simd/common/hc2cfdftv_16.c @ 167:bd3cc4d1df30

Add FFTW 3.3.8 source, and a Linux build
author Chris Cannam <cannam@all-day-breakfast.com>
date Tue, 19 Nov 2019 14:52:55 +0000
parents
children
comparison
equal deleted inserted replaced
166:cbd6d7e562c7 167:bd3cc4d1df30
1 /*
2 * Copyright (c) 2003, 2007-14 Matteo Frigo
3 * Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
4 *
5 * This program is free software; you can redistribute it and/or modify
6 * it under the terms of the GNU General Public License as published by
7 * the Free Software Foundation; either version 2 of the License, or
8 * (at your option) any later version.
9 *
10 * This program is distributed in the hope that it will be useful,
11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 * GNU General Public License for more details.
14 *
15 * You should have received a copy of the GNU General Public License
16 * along with this program; if not, write to the Free Software
17 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
18 *
19 */
20
21 /* This file was automatically generated --- DO NOT EDIT */
22 /* Generated on Thu May 24 08:08:11 EDT 2018 */
23
24 #include "rdft/codelet-rdft.h"
25
26 #if defined(ARCH_PREFERS_FMA) || defined(ISA_EXTENSION_PREFERS_FMA)
27
28 /* Generated by: ../../../genfft/gen_hc2cdft_c.native -fma -simd -compact -variables 4 -pipeline-latency 8 -trivial-stores -variables 32 -no-generate-bytw -n 16 -dit -name hc2cfdftv_16 -include rdft/simd/hc2cfv.h */
29
30 /*
31 * This function contains 103 FP additions, 96 FP multiplications,
32 * (or, 53 additions, 46 multiplications, 50 fused multiply/add),
33 * 92 stack variables, 4 constants, and 32 memory accesses
34 */
35 #include "rdft/simd/hc2cfv.h"
36
/*
 * hc2cfdftv_16 -- FMA variant (this copy is compiled when ARCH_PREFERS_FMA or
 * ISA_EXTENSION_PREFERS_FMA is defined).
 *
 * Machine-generated SIMD codelet (genfft, "-n 16 -dit"); do not hand-edit.
 *
 * Each loop iteration loads eight vectors from Rp[WS(rs, k)] and eight from
 * Rm[WS(rs, k)] (k = 0..7), combines each Rp/Rm pair with VFMACONJ and
 * VFNMSCONJ, multiplies the results by entries of W (VZMULJ / VZMULIJ), runs
 * the butterfly network, and stores eight vectors back to Rp[WS(rs, k)] and
 * eight to Rm[WS(rs, k)].  All loads of an iteration happen before its first
 * store.  Every stored value is scaled by KP500000000 (0.5); values stored to
 * Rm are additionally wrapped in VCONJ.
 *
 * Parameters:
 *   Rp, Rm  arrays read and written in place.  Rp advances by VL * ms per
 *           iteration; Rm moves back by VL * ms per iteration.
 *   Ip, Im  stepped in the loop header alongside Rp / Rm but never
 *           dereferenced in this body.
 *   W       twiddle table.  Offset by (mb - 1) * ((TWVL / VL) * 30) before the
 *           first iteration and advanced by TWVL * 30 per iteration; read with
 *           LDW at W[TWVL * j] for even j = 0..28 (15 loads).
 *   rs      stride handed to WS() to index Rp / Rm.
 *   mb, me  iteration bounds: m starts at mb and runs while m < me, step VL.
 *   ms      stride multiplier for the per-iteration pointer updates; also
 *           passed (as ms / -ms) to LD and ST.
 *
 * NOTE(review): the exact semantics of LD/ST/LDW/VZMULJ/VZMULIJ/VFMACONJ/
 * VFNMSCONJ/VFMAI/VFNMSI come from rdft/simd/hc2cfv.h and the SIMD headers,
 * which are not visible here.
 */
37 static void hc2cfdftv_16(R *Rp, R *Ip, R *Rm, R *Im, const R *W, stride rs, INT mb, INT me, INT ms)
38 {
/* Vector constants used by the butterflies below. */
39 DVK(KP923879532, +0.923879532511286756128183189396788286822416626);
40 DVK(KP707106781, +0.707106781186547524400844362104849039284835938);
41 DVK(KP500000000, +0.500000000000000000000000000000000000000000000);
42 DVK(KP414213562, +0.414213562373095048801688724209698078569671875);
43 {
44 INT m;
45 for (m = mb, W = W + ((mb - 1) * ((TWVL / VL) * 30)); m < me; m = m + VL, Rp = Rp + (VL * ms), Ip = Ip + (VL * ms), Rm = Rm - (VL * ms), Im = Im - (VL * ms), W = W + (TWVL * 30), MAKE_VOLATILE_STRIDE(64, rs)) {
46 V T8, TZ, TH, T12, T1q, T1I, T1x, T1J, Tr, T10, T1A, T1K, TS, T13, T1t;
47 V T1N, T3, Tw, TF, TW, T7, Tu, TB, TY, T1, T2, Tv, TD, TE, TC;
48 V TV, T5, T6, T4, Tt, Tz, TA, Ty, TX, Tx, TG, T1o, T1p, T1v, T1w;
49 V T1C, T1D, T1u, T1B, T1G, T1H, T1E, T1F;
/* Even-indexed inputs (k = 0, 2, 4, 6): load the Rp/Rm pair, combine it,
 * and multiply by the corresponding W entries. */
50 T1 = LD(&(Rp[0]), ms, &(Rp[0]));
51 T2 = LD(&(Rm[0]), -ms, &(Rm[0]));
52 T3 = VFMACONJ(T2, T1);
53 Tv = LDW(&(W[0]));
54 Tw = VZMULIJ(Tv, VFNMSCONJ(T2, T1));
55 TD = LD(&(Rp[WS(rs, 2)]), ms, &(Rp[0]));
56 TE = LD(&(Rm[WS(rs, 2)]), -ms, &(Rm[0]));
57 TC = LDW(&(W[TWVL * 8]));
58 TF = VZMULIJ(TC, VFNMSCONJ(TE, TD));
59 TV = LDW(&(W[TWVL * 6]));
60 TW = VZMULJ(TV, VFMACONJ(TE, TD));
61 T5 = LD(&(Rp[WS(rs, 4)]), ms, &(Rp[0]));
62 T6 = LD(&(Rm[WS(rs, 4)]), -ms, &(Rm[0]));
63 T4 = LDW(&(W[TWVL * 14]));
64 T7 = VZMULJ(T4, VFMACONJ(T6, T5));
65 Tt = LDW(&(W[TWVL * 16]));
66 Tu = VZMULIJ(Tt, VFNMSCONJ(T6, T5));
67 Tz = LD(&(Rp[WS(rs, 6)]), ms, &(Rp[0]));
68 TA = LD(&(Rm[WS(rs, 6)]), -ms, &(Rm[0]));
69 Ty = LDW(&(W[TWVL * 24]));
70 TB = VZMULIJ(Ty, VFNMSCONJ(TA, Tz));
71 TX = LDW(&(W[TWVL * 22]));
72 TY = VZMULJ(TX, VFMACONJ(TA, Tz));
/* First-level sums and differences of the even-indexed terms. */
73 T8 = VSUB(T3, T7);
74 TZ = VSUB(TW, TY);
75 Tx = VSUB(Tu, Tw);
76 TG = VSUB(TB, TF);
77 TH = VFNMS(LDK(KP414213562), TG, Tx);
78 T12 = VFMA(LDK(KP414213562), Tx, TG);
79 T1o = VADD(T3, T7);
80 T1p = VADD(TW, TY);
81 T1q = VADD(T1o, T1p);
82 T1I = VSUB(T1o, T1p);
83 T1v = VADD(Tw, Tu);
84 T1w = VADD(TF, TB);
85 T1x = VADD(T1v, T1w);
86 T1J = VSUB(T1w, T1v);
87 {
88 V Tc, TQ, Tp, TJ, Tg, TO, Tl, TL, Ta, Tb, T9, TP, Tn, To, Tm;
89 V TI, Te, Tf, Td, TN, Tj, Tk, Ti, TK, Th, Tq, T1y, T1z, TM, TR;
90 V T1r, T1s;
/* Odd-indexed inputs (k = 1, 3, 5, 7): same load / combine / multiply
 * pattern as above. */
91 Ta = LD(&(Rp[WS(rs, 1)]), ms, &(Rp[WS(rs, 1)]));
92 Tb = LD(&(Rm[WS(rs, 1)]), -ms, &(Rm[WS(rs, 1)]));
93 T9 = LDW(&(W[TWVL * 2]));
94 Tc = VZMULJ(T9, VFMACONJ(Tb, Ta));
95 TP = LDW(&(W[TWVL * 4]));
96 TQ = VZMULIJ(TP, VFNMSCONJ(Tb, Ta));
97 Tn = LD(&(Rp[WS(rs, 3)]), ms, &(Rp[WS(rs, 1)]));
98 To = LD(&(Rm[WS(rs, 3)]), -ms, &(Rm[WS(rs, 1)]));
99 Tm = LDW(&(W[TWVL * 10]));
100 Tp = VZMULJ(Tm, VFMACONJ(To, Tn));
101 TI = LDW(&(W[TWVL * 12]));
102 TJ = VZMULIJ(TI, VFNMSCONJ(To, Tn));
103 Te = LD(&(Rp[WS(rs, 5)]), ms, &(Rp[WS(rs, 1)]));
104 Tf = LD(&(Rm[WS(rs, 5)]), -ms, &(Rm[WS(rs, 1)]));
105 Td = LDW(&(W[TWVL * 18]));
106 Tg = VZMULJ(Td, VFMACONJ(Tf, Te));
107 TN = LDW(&(W[TWVL * 20]));
108 TO = VZMULIJ(TN, VFNMSCONJ(Tf, Te));
109 Tj = LD(&(Rp[WS(rs, 7)]), ms, &(Rp[WS(rs, 1)]));
110 Tk = LD(&(Rm[WS(rs, 7)]), -ms, &(Rm[WS(rs, 1)]));
111 Ti = LDW(&(W[TWVL * 26]));
112 Tl = VZMULJ(Ti, VFMACONJ(Tk, Tj));
113 TK = LDW(&(W[TWVL * 28]));
114 TL = VZMULIJ(TK, VFNMSCONJ(Tk, Tj));
/* First-level sums and differences of the odd-indexed terms; the results
 * are written to variables declared in the enclosing scope. */
115 Th = VSUB(Tc, Tg);
116 Tq = VSUB(Tl, Tp);
117 Tr = VADD(Th, Tq);
118 T10 = VSUB(Tq, Th);
119 T1y = VADD(TQ, TO);
120 T1z = VADD(TL, TJ);
121 T1A = VADD(T1y, T1z);
122 T1K = VSUB(T1y, T1z);
123 TM = VSUB(TJ, TL);
124 TR = VSUB(TO, TQ);
125 TS = VFMA(LDK(KP414213562), TR, TM);
126 T13 = VFNMS(LDK(KP414213562), TM, TR);
127 T1r = VADD(Tc, Tg);
128 T1s = VADD(Tl, Tp);
129 T1t = VADD(T1r, T1s);
130 T1N = VSUB(T1s, T1r);
131 }
/* Outputs Rp[WS(rs, 4)], Rm[WS(rs, 3)], Rp[0], Rm[WS(rs, 7)]. */
132 T1u = VSUB(T1q, T1t);
133 T1B = VSUB(T1x, T1A);
134 T1C = VMUL(LDK(KP500000000), VFMAI(T1B, T1u));
135 T1D = VCONJ(VMUL(LDK(KP500000000), VFNMSI(T1B, T1u)));
136 ST(&(Rp[WS(rs, 4)]), T1C, ms, &(Rp[0]));
137 ST(&(Rm[WS(rs, 3)]), T1D, -ms, &(Rm[WS(rs, 1)]));
138 T1E = VADD(T1q, T1t);
139 T1F = VADD(T1x, T1A);
140 T1G = VMUL(LDK(KP500000000), VSUB(T1E, T1F));
141 T1H = VCONJ(VMUL(LDK(KP500000000), VADD(T1F, T1E)));
142 ST(&(Rp[0]), T1G, ms, &(Rp[0]));
143 ST(&(Rm[WS(rs, 7)]), T1H, -ms, &(Rm[WS(rs, 1)]));
144 {
145 V T1M, T1S, T1P, T1T, T1L, T1O, T1Q, T1V, T1R, T1U, TU, T18, T15, T19, Ts;
146 V TT, T11, T14, T16, T1b, T17, T1a, T1e, T1k, T1h, T1l, T1c, T1d, T1f, T1g;
147 V T1i, T1n, T1j, T1m;
/* Outputs Rm[WS(rs, 1)], Rm[WS(rs, 5)], Rp[WS(rs, 2)], Rp[WS(rs, 6)]. */
148 T1L = VADD(T1J, T1K);
149 T1M = VFMA(LDK(KP707106781), T1L, T1I);
150 T1S = VFNMS(LDK(KP707106781), T1L, T1I);
151 T1O = VSUB(T1K, T1J);
152 T1P = VFMA(LDK(KP707106781), T1O, T1N);
153 T1T = VFNMS(LDK(KP707106781), T1O, T1N);
154 T1Q = VCONJ(VMUL(LDK(KP500000000), VFNMSI(T1P, T1M)));
155 ST(&(Rm[WS(rs, 1)]), T1Q, -ms, &(Rm[WS(rs, 1)]));
156 T1V = VCONJ(VMUL(LDK(KP500000000), VFMAI(T1T, T1S)));
157 ST(&(Rm[WS(rs, 5)]), T1V, -ms, &(Rm[WS(rs, 1)]));
158 T1R = VMUL(LDK(KP500000000), VFMAI(T1P, T1M));
159 ST(&(Rp[WS(rs, 2)]), T1R, ms, &(Rp[0]));
160 T1U = VMUL(LDK(KP500000000), VFNMSI(T1T, T1S));
161 ST(&(Rp[WS(rs, 6)]), T1U, ms, &(Rp[0]));
/* Outputs Rp[WS(rs, 1)], Rp[WS(rs, 7)], Rm[0], Rm[WS(rs, 6)]. */
162 Ts = VFMA(LDK(KP707106781), Tr, T8);
163 TT = VADD(TH, TS);
164 TU = VFMA(LDK(KP923879532), TT, Ts);
165 T18 = VFNMS(LDK(KP923879532), TT, Ts);
166 T11 = VFNMS(LDK(KP707106781), T10, TZ);
167 T14 = VADD(T12, T13);
168 T15 = VFMA(LDK(KP923879532), T14, T11);
169 T19 = VFNMS(LDK(KP923879532), T14, T11);
170 T16 = VMUL(LDK(KP500000000), VFNMSI(T15, TU));
171 ST(&(Rp[WS(rs, 1)]), T16, ms, &(Rp[WS(rs, 1)]));
172 T1b = VMUL(LDK(KP500000000), VFMAI(T19, T18));
173 ST(&(Rp[WS(rs, 7)]), T1b, ms, &(Rp[WS(rs, 1)]));
174 T17 = VCONJ(VMUL(LDK(KP500000000), VFMAI(T15, TU)));
175 ST(&(Rm[0]), T17, -ms, &(Rm[0]));
176 T1a = VCONJ(VMUL(LDK(KP500000000), VFNMSI(T19, T18)));
177 ST(&(Rm[WS(rs, 6)]), T1a, -ms, &(Rm[0]));
/* Outputs Rm[WS(rs, 2)], Rm[WS(rs, 4)], Rp[WS(rs, 3)], Rp[WS(rs, 5)]. */
178 T1c = VFNMS(LDK(KP707106781), Tr, T8);
179 T1d = VSUB(T12, T13);
180 T1e = VFMA(LDK(KP923879532), T1d, T1c);
181 T1k = VFNMS(LDK(KP923879532), T1d, T1c);
182 T1f = VFMA(LDK(KP707106781), T10, TZ);
183 T1g = VSUB(TS, TH);
184 T1h = VFMA(LDK(KP923879532), T1g, T1f);
185 T1l = VFNMS(LDK(KP923879532), T1g, T1f);
186 T1i = VCONJ(VMUL(LDK(KP500000000), VFNMSI(T1h, T1e)));
187 ST(&(Rm[WS(rs, 2)]), T1i, -ms, &(Rm[0]));
188 T1n = VCONJ(VMUL(LDK(KP500000000), VFMAI(T1l, T1k)));
189 ST(&(Rm[WS(rs, 4)]), T1n, -ms, &(Rm[0]));
190 T1j = VMUL(LDK(KP500000000), VFMAI(T1h, T1e));
191 ST(&(Rp[WS(rs, 3)]), T1j, ms, &(Rp[WS(rs, 1)]));
192 T1m = VMUL(LDK(KP500000000), VFNMSI(T1l, T1k));
193 ST(&(Rp[WS(rs, 5)]), T1m, ms, &(Rp[WS(rs, 1)]));
194 }
195 }
196 }
/* Invoked once after the loop; presumably SIMD-state cleanup -- TODO confirm
 * against the VLEAVE definition in the SIMD headers. */
197 VLEAVE();
198 }
199
/*
 * Twiddle instruction table for the FMA variant: fifteen VTW(1, k) entries
 * for k = 1..15, followed by a final {TW_NEXT, VL, 0} entry.  Referenced by
 * `desc` below.
 * NOTE(review): presumably describes the layout of the W array consumed by
 * hc2cfdftv_16 -- confirm against the VTW / tw_instr definitions.
 */
200 static const tw_instr twinstr[] = {
201 VTW(1, 1),
202 VTW(1, 2),
203 VTW(1, 3),
204 VTW(1, 4),
205 VTW(1, 5),
206 VTW(1, 6),
207 VTW(1, 7),
208 VTW(1, 8),
209 VTW(1, 9),
210 VTW(1, 10),
211 VTW(1, 11),
212 VTW(1, 12),
213 VTW(1, 13),
214 VTW(1, 14),
215 VTW(1, 15),
216 {TW_NEXT, VL, 0}
217 };
218
/*
 * Codelet descriptor for the FMA variant: size 16, the codelet's name string,
 * its twiddle table, &GENUS, and the counts {53, 46, 50, 0}, which match the
 * "53 additions, 46 multiplications, 50 fused multiply/add" figures in the
 * generator comment for this variant.
 */
219 static const hc2c_desc desc = { 16, XSIMD_STRING("hc2cfdftv_16"), twinstr, &GENUS, {53, 46, 50, 0} };
220
/*
 * Registration entry point (FMA variant): hands hc2cfdftv_16 and its
 * descriptor to the planner `p` via khc2c_register, tagged HC2C_VIA_DFT.
 */
221 void XSIMD(codelet_hc2cfdftv_16) (planner *p) {
222 X(khc2c_register) (p, hc2cfdftv_16, &desc, HC2C_VIA_DFT);
223 }
224 #else
225
226 /* Generated by: ../../../genfft/gen_hc2cdft_c.native -simd -compact -variables 4 -pipeline-latency 8 -trivial-stores -variables 32 -no-generate-bytw -n 16 -dit -name hc2cfdftv_16 -include rdft/simd/hc2cfv.h */
227
228 /*
229 * This function contains 103 FP additions, 56 FP multiplications,
230 * (or, 99 additions, 52 multiplications, 4 fused multiply/add),
231 * 101 stack variables, 5 constants, and 32 memory accesses
232 */
233 #include "rdft/simd/hc2cfv.h"
234
/*
 * hc2cfdftv_16 -- non-FMA variant (this copy is compiled when neither
 * ARCH_PREFERS_FMA nor ISA_EXTENSION_PREFERS_FMA is defined).
 *
 * Machine-generated SIMD codelet (genfft, "-n 16 -dit"); do not hand-edit.
 *
 * Each loop iteration loads eight vectors from Rp[WS(rs, k)] and eight from
 * Rm[WS(rs, k)] (k = 0..7).  For each pair it takes VCONJ of the Rm value,
 * forms the sum (VADD) and difference (VSUB) with the Rp value, multiplies
 * those by entries of W (VZMULJ / VZMULIJ), runs the butterfly network, and
 * stores eight vectors back to Rp[WS(rs, k)] and eight to Rm[WS(rs, k)].
 * All loads of an iteration happen before its first store.  Values stored to
 * Rm are wrapped in VCONJ.
 *
 * Parameters:
 *   Rp, Rm  arrays read and written in place.  Rp advances by VL * ms per
 *           iteration; Rm moves back by VL * ms per iteration.
 *   Ip, Im  stepped in the loop header alongside Rp / Rm but never
 *           dereferenced in this body.
 *   W       twiddle table.  Offset by (mb - 1) * ((TWVL / VL) * 30) before the
 *           first iteration and advanced by TWVL * 30 per iteration; read with
 *           LDW at W[TWVL * j] for even j = 0..28 (15 loads).
 *   rs      stride handed to WS() to index Rp / Rm.
 *   mb, me  iteration bounds: m starts at mb and runs while m < me, step VL.
 *   ms      stride multiplier for the per-iteration pointer updates; also
 *           passed (as ms / -ms) to LD and ST.
 *
 * NOTE(review): the exact semantics of LD/ST/LDW/VZMULJ/VZMULIJ/VBYI/VCONJ
 * come from rdft/simd/hc2cfv.h and the SIMD headers, which are not visible
 * here.
 */
235 static void hc2cfdftv_16(R *Rp, R *Ip, R *Rm, R *Im, const R *W, stride rs, INT mb, INT me, INT ms)
236 {
/* Vector constants used by the butterflies below. */
237 DVK(KP707106781, +0.707106781186547524400844362104849039284835938);
238 DVK(KP353553390, +0.353553390593273762200422181052424519642417969);
239 DVK(KP500000000, +0.500000000000000000000000000000000000000000000);
240 DVK(KP382683432, +0.382683432365089771728459984030398866761344562);
241 DVK(KP923879532, +0.923879532511286756128183189396788286822416626);
242 {
243 INT m;
244 for (m = mb, W = W + ((mb - 1) * ((TWVL / VL) * 30)); m < me; m = m + VL, Rp = Rp + (VL * ms), Ip = Ip + (VL * ms), Rm = Rm - (VL * ms), Im = Im - (VL * ms), W = W + (TWVL * 30), MAKE_VOLATILE_STRIDE(64, rs)) {
245 V T1D, T1E, T1R, TP, T1b, Ta, T1w, T18, T1x, T1z, T1A, T1G, T1H, T1S, Tx;
246 V T13, T10, T1a, T1, T3, TA, TM, TL, TN, T6, T8, TC, TH, TG, TI;
247 V T2, Tz, TK, TJ, T7, TB, TF, TE, TD, TO, T4, T9, T5, T15, T17;
248 V T14, T16;
/* Even-indexed inputs, loaded in the order k = 0, 6, 4, 2: the difference
 * (VCONJ(Rm) - Rp) of each pair is multiplied by a W entry via VZMULIJ. */
249 T1 = LD(&(Rp[0]), ms, &(Rp[0]));
250 T2 = LD(&(Rm[0]), -ms, &(Rm[0]));
251 T3 = VCONJ(T2);
252 Tz = LDW(&(W[0]));
253 TA = VZMULIJ(Tz, VSUB(T3, T1));
254 TM = LD(&(Rp[WS(rs, 6)]), ms, &(Rp[0]));
255 TK = LD(&(Rm[WS(rs, 6)]), -ms, &(Rm[0]));
256 TL = VCONJ(TK);
257 TJ = LDW(&(W[TWVL * 24]));
258 TN = VZMULIJ(TJ, VSUB(TL, TM));
259 T6 = LD(&(Rp[WS(rs, 4)]), ms, &(Rp[0]));
260 T7 = LD(&(Rm[WS(rs, 4)]), -ms, &(Rm[0]));
261 T8 = VCONJ(T7);
262 TB = LDW(&(W[TWVL * 16]));
263 TC = VZMULIJ(TB, VSUB(T8, T6));
264 TH = LD(&(Rp[WS(rs, 2)]), ms, &(Rp[0]));
265 TF = LD(&(Rm[WS(rs, 2)]), -ms, &(Rm[0]));
266 TG = VCONJ(TF);
267 TE = LDW(&(W[TWVL * 8]));
268 TI = VZMULIJ(TE, VSUB(TG, TH));
269 T1D = VADD(TA, TC);
270 T1E = VADD(TI, TN);
271 T1R = VSUB(T1D, T1E);
272 TD = VSUB(TA, TC);
273 TO = VSUB(TI, TN);
274 TP = VFNMS(LDK(KP382683432), TO, VMUL(LDK(KP923879532), TD));
275 T1b = VFMA(LDK(KP382683432), TD, VMUL(LDK(KP923879532), TO));
/* Sums (Rp + VCONJ(Rm)) of the even-indexed pairs; those for k = 4, 2, 6
 * are multiplied by W entries via VZMULJ, the k = 0 sum is used as is. */
276 T4 = VADD(T1, T3);
277 T5 = LDW(&(W[TWVL * 14]));
278 T9 = VZMULJ(T5, VADD(T6, T8));
279 Ta = VMUL(LDK(KP500000000), VSUB(T4, T9));
280 T1w = VADD(T4, T9);
281 T14 = LDW(&(W[TWVL * 6]));
282 T15 = VZMULJ(T14, VADD(TH, TG));
283 T16 = LDW(&(W[TWVL * 22]));
284 T17 = VZMULJ(T16, VADD(TM, TL));
285 T18 = VSUB(T15, T17);
286 T1x = VADD(T15, T17);
287 {
288 V Tf, TR, Tv, TY, Tk, TT, Tq, TW, Tc, Te, Td, Tb, TQ, Ts, Tu;
289 V Tt, Tr, TX, Th, Tj, Ti, Tg, TS, Tn, Tp, To, Tm, TV, Tl, Tw;
290 V TU, TZ;
/* Odd-indexed inputs (k = 1, 3, 5, 7): for each pair, both the sum and
 * the difference are multiplied by their own W entries. */
291 Tc = LD(&(Rp[WS(rs, 1)]), ms, &(Rp[WS(rs, 1)]));
292 Td = LD(&(Rm[WS(rs, 1)]), -ms, &(Rm[WS(rs, 1)]));
293 Te = VCONJ(Td);
294 Tb = LDW(&(W[TWVL * 2]));
295 Tf = VZMULJ(Tb, VADD(Tc, Te));
296 TQ = LDW(&(W[TWVL * 4]));
297 TR = VZMULIJ(TQ, VSUB(Te, Tc));
298 Ts = LD(&(Rp[WS(rs, 3)]), ms, &(Rp[WS(rs, 1)]));
299 Tt = LD(&(Rm[WS(rs, 3)]), -ms, &(Rm[WS(rs, 1)]));
300 Tu = VCONJ(Tt);
301 Tr = LDW(&(W[TWVL * 10]));
302 Tv = VZMULJ(Tr, VADD(Ts, Tu));
303 TX = LDW(&(W[TWVL * 12]));
304 TY = VZMULIJ(TX, VSUB(Tu, Ts));
305 Th = LD(&(Rp[WS(rs, 5)]), ms, &(Rp[WS(rs, 1)]));
306 Ti = LD(&(Rm[WS(rs, 5)]), -ms, &(Rm[WS(rs, 1)]));
307 Tj = VCONJ(Ti);
308 Tg = LDW(&(W[TWVL * 18]));
309 Tk = VZMULJ(Tg, VADD(Th, Tj));
310 TS = LDW(&(W[TWVL * 20]));
311 TT = VZMULIJ(TS, VSUB(Tj, Th));
312 Tn = LD(&(Rp[WS(rs, 7)]), ms, &(Rp[WS(rs, 1)]));
313 To = LD(&(Rm[WS(rs, 7)]), -ms, &(Rm[WS(rs, 1)]));
314 Tp = VCONJ(To);
315 Tm = LDW(&(W[TWVL * 26]));
316 Tq = VZMULJ(Tm, VADD(Tn, Tp));
317 TV = LDW(&(W[TWVL * 28]));
318 TW = VZMULIJ(TV, VSUB(Tp, Tn));
/* First-level sums and differences of the odd-indexed terms; the results
 * are written to variables declared in the enclosing scope. */
319 T1z = VADD(Tf, Tk);
320 T1A = VADD(Tq, Tv);
321 T1G = VADD(TR, TT);
322 T1H = VADD(TW, TY);
323 T1S = VSUB(T1H, T1G);
324 Tl = VSUB(Tf, Tk);
325 Tw = VSUB(Tq, Tv);
326 Tx = VMUL(LDK(KP353553390), VADD(Tl, Tw));
327 T13 = VMUL(LDK(KP707106781), VSUB(Tw, Tl));
328 TU = VSUB(TR, TT);
329 TZ = VSUB(TW, TY);
330 T10 = VFMA(LDK(KP382683432), TU, VMUL(LDK(KP923879532), TZ));
331 T1a = VFNMS(LDK(KP923879532), TU, VMUL(LDK(KP382683432), TZ));
332 }
333 {
334 V T1U, T20, T1X, T21, T1Q, T1T, T1V, T1W, T1Y, T23, T1Z, T22, T1C, T1M, T1J;
335 V T1N, T1y, T1B, T1F, T1I, T1K, T1P, T1L, T1O, T12, T1g, T1d, T1h, Ty, T11;
336 V T19, T1c, T1e, T1j, T1f, T1i, T1m, T1s, T1p, T1t, T1k, T1l, T1n, T1o, T1q;
337 V T1v, T1r, T1u;
/* Outputs Rm[WS(rs, 1)], Rp[WS(rs, 6)], Rp[WS(rs, 2)], Rm[WS(rs, 5)]. */
338 T1Q = VMUL(LDK(KP500000000), VSUB(T1w, T1x));
339 T1T = VMUL(LDK(KP353553390), VADD(T1R, T1S));
340 T1U = VADD(T1Q, T1T);
341 T20 = VSUB(T1Q, T1T);
342 T1V = VSUB(T1A, T1z);
343 T1W = VMUL(LDK(KP707106781), VSUB(T1S, T1R));
344 T1X = VMUL(LDK(KP500000000), VBYI(VADD(T1V, T1W)));
345 T21 = VMUL(LDK(KP500000000), VBYI(VSUB(T1W, T1V)));
346 T1Y = VCONJ(VSUB(T1U, T1X));
347 ST(&(Rm[WS(rs, 1)]), T1Y, -ms, &(Rm[WS(rs, 1)]));
348 T23 = VADD(T20, T21);
349 ST(&(Rp[WS(rs, 6)]), T23, ms, &(Rp[0]));
350 T1Z = VADD(T1U, T1X);
351 ST(&(Rp[WS(rs, 2)]), T1Z, ms, &(Rp[0]));
352 T22 = VCONJ(VSUB(T20, T21));
353 ST(&(Rm[WS(rs, 5)]), T22, -ms, &(Rm[WS(rs, 1)]));
/* Outputs Rm[WS(rs, 7)], Rp[WS(rs, 4)], Rp[0], Rm[WS(rs, 3)]. */
354 T1y = VADD(T1w, T1x);
355 T1B = VADD(T1z, T1A);
356 T1C = VADD(T1y, T1B);
357 T1M = VSUB(T1y, T1B);
358 T1F = VADD(T1D, T1E);
359 T1I = VADD(T1G, T1H);
360 T1J = VADD(T1F, T1I);
361 T1N = VBYI(VSUB(T1I, T1F));
362 T1K = VCONJ(VMUL(LDK(KP500000000), VSUB(T1C, T1J)));
363 ST(&(Rm[WS(rs, 7)]), T1K, -ms, &(Rm[WS(rs, 1)]));
364 T1P = VMUL(LDK(KP500000000), VADD(T1M, T1N));
365 ST(&(Rp[WS(rs, 4)]), T1P, ms, &(Rp[0]));
366 T1L = VMUL(LDK(KP500000000), VADD(T1C, T1J));
367 ST(&(Rp[0]), T1L, ms, &(Rp[0]));
368 T1O = VCONJ(VMUL(LDK(KP500000000), VSUB(T1M, T1N)));
369 ST(&(Rm[WS(rs, 3)]), T1O, -ms, &(Rm[WS(rs, 1)]));
/* Outputs Rm[0], Rp[WS(rs, 7)], Rp[WS(rs, 1)], Rm[WS(rs, 6)]. */
370 Ty = VADD(Ta, Tx);
371 T11 = VMUL(LDK(KP500000000), VADD(TP, T10));
372 T12 = VADD(Ty, T11);
373 T1g = VSUB(Ty, T11);
374 T19 = VSUB(T13, T18);
375 T1c = VSUB(T1a, T1b);
376 T1d = VMUL(LDK(KP500000000), VBYI(VADD(T19, T1c)));
377 T1h = VMUL(LDK(KP500000000), VBYI(VSUB(T1c, T19)));
378 T1e = VCONJ(VSUB(T12, T1d));
379 ST(&(Rm[0]), T1e, -ms, &(Rm[0]));
380 T1j = VADD(T1g, T1h);
381 ST(&(Rp[WS(rs, 7)]), T1j, ms, &(Rp[WS(rs, 1)]));
382 T1f = VADD(T12, T1d);
383 ST(&(Rp[WS(rs, 1)]), T1f, ms, &(Rp[WS(rs, 1)]));
384 T1i = VCONJ(VSUB(T1g, T1h));
385 ST(&(Rm[WS(rs, 6)]), T1i, -ms, &(Rm[0]));
/* Outputs Rp[WS(rs, 5)], Rm[WS(rs, 2)], Rm[WS(rs, 4)], Rp[WS(rs, 3)]. */
386 T1k = VSUB(T10, TP);
387 T1l = VADD(T18, T13);
388 T1m = VMUL(LDK(KP500000000), VBYI(VSUB(T1k, T1l)));
389 T1s = VMUL(LDK(KP500000000), VBYI(VADD(T1l, T1k)));
390 T1n = VSUB(Ta, Tx);
391 T1o = VMUL(LDK(KP500000000), VADD(T1b, T1a));
392 T1p = VSUB(T1n, T1o);
393 T1t = VADD(T1n, T1o);
394 T1q = VADD(T1m, T1p);
395 ST(&(Rp[WS(rs, 5)]), T1q, ms, &(Rp[WS(rs, 1)]));
396 T1v = VCONJ(VSUB(T1t, T1s));
397 ST(&(Rm[WS(rs, 2)]), T1v, -ms, &(Rm[0]));
398 T1r = VCONJ(VSUB(T1p, T1m));
399 ST(&(Rm[WS(rs, 4)]), T1r, -ms, &(Rm[0]));
400 T1u = VADD(T1s, T1t);
401 ST(&(Rp[WS(rs, 3)]), T1u, ms, &(Rp[WS(rs, 1)]));
402 }
403 }
404 }
/* Invoked once after the loop; presumably SIMD-state cleanup -- TODO confirm
 * against the VLEAVE definition in the SIMD headers. */
405 VLEAVE();
406 }
407
/*
 * Twiddle instruction table for the non-FMA variant: fifteen VTW(1, k)
 * entries for k = 1..15, followed by a final {TW_NEXT, VL, 0} entry.
 * Referenced by `desc` below.
 * NOTE(review): presumably describes the layout of the W array consumed by
 * hc2cfdftv_16 -- confirm against the VTW / tw_instr definitions.
 */
408 static const tw_instr twinstr[] = {
409 VTW(1, 1),
410 VTW(1, 2),
411 VTW(1, 3),
412 VTW(1, 4),
413 VTW(1, 5),
414 VTW(1, 6),
415 VTW(1, 7),
416 VTW(1, 8),
417 VTW(1, 9),
418 VTW(1, 10),
419 VTW(1, 11),
420 VTW(1, 12),
421 VTW(1, 13),
422 VTW(1, 14),
423 VTW(1, 15),
424 {TW_NEXT, VL, 0}
425 };
426
/*
 * Codelet descriptor for the non-FMA variant: size 16, the codelet's name
 * string, its twiddle table, &GENUS, and the counts {99, 52, 4, 0}, which
 * match the "99 additions, 52 multiplications, 4 fused multiply/add" figures
 * in the generator comment for this variant.
 */
427 static const hc2c_desc desc = { 16, XSIMD_STRING("hc2cfdftv_16"), twinstr, &GENUS, {99, 52, 4, 0} };
428
/*
 * Registration entry point (non-FMA variant): hands hc2cfdftv_16 and its
 * descriptor to the planner `p` via khc2c_register, tagged HC2C_VIA_DFT.
 */
429 void XSIMD(codelet_hc2cfdftv_16) (planner *p) {
430 X(khc2c_register) (p, hc2cfdftv_16, &desc, HC2C_VIA_DFT);
431 }
432 #endif