comparison src/fftw-3.3.8/rdft/simd/common/hc2cbdftv_16.c @ 167:bd3cc4d1df30

Add FFTW 3.3.8 source, and a Linux build
author Chris Cannam <cannam@all-day-breakfast.com>
date Tue, 19 Nov 2019 14:52:55 +0000
parents
children
comparison
equal deleted inserted replaced
166:cbd6d7e562c7 167:bd3cc4d1df30
1 /*
2 * Copyright (c) 2003, 2007-14 Matteo Frigo
3 * Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
4 *
5 * This program is free software; you can redistribute it and/or modify
6 * it under the terms of the GNU General Public License as published by
7 * the Free Software Foundation; either version 2 of the License, or
8 * (at your option) any later version.
9 *
10 * This program is distributed in the hope that it will be useful,
11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 * GNU General Public License for more details.
14 *
15 * You should have received a copy of the GNU General Public License
16 * along with this program; if not, write to the Free Software
17 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
18 *
19 */
20
21 /* This file was automatically generated --- DO NOT EDIT */
22 /* Generated on Thu May 24 08:08:12 EDT 2018 */
23
24 #include "rdft/codelet-rdft.h"
25
26 #if defined(ARCH_PREFERS_FMA) || defined(ISA_EXTENSION_PREFERS_FMA)
27
28 /* Generated by: ../../../genfft/gen_hc2cdft_c.native -fma -simd -compact -variables 4 -pipeline-latency 8 -trivial-stores -variables 32 -no-generate-bytw -n 16 -dif -sign 1 -name hc2cbdftv_16 -include rdft/simd/hc2cbv.h */
29
30 /*
31 * This function contains 103 FP additions, 80 FP multiplications,
32 * (or, 53 additions, 30 multiplications, 50 fused multiply/add),
33 * 79 stack variables, 3 constants, and 32 memory accesses
34 */
35 #include "rdft/simd/hc2cbv.h"
36
/*
 * hc2cbdftv_16, FMA variant: generated SIMD kernel of size 16.  The generator
 * command line recorded for this variant requests -n 16, -dif and -sign 1.
 *
 * Rp, Rm  - read and overwritten in place: every pass loads Rp[WS(rs, 0..7)]
 *           and Rm[WS(rs, 0..7)] and stores back to the same sixteen slots.
 * Ip, Im  - only stepped in the loop header; never dereferenced in this body.
 * W       - twiddle table; 15 entries are fetched per pass with LDW at
 *           offsets TWVL * {0, 2, ..., 28}.
 * rs      - stride handed to WS() to index inside Rp / Rm.
 * mb, me  - the loop runs m = mb, mb + VL, ... while m < me.
 * ms      - per-m stride: Rp / Ip move forward by VL * ms each pass,
 *           Rm / Im move backward by the same amount.
 */
37 static void hc2cbdftv_16(R *Rp, R *Ip, R *Rm, R *Im, const R *W, stride rs, INT mb, INT me, INT ms)
38 {
/* Named constants read later through LDK: numerically cos(pi/8),
 * sqrt(2) - 1 (= tan(pi/8)) and sqrt(2)/2. */
39 DVK(KP923879532, +0.923879532511286756128183189396788286822416626);
40 DVK(KP414213562, +0.414213562373095048801688724209698078569671875);
41 DVK(KP707106781, +0.707106781186547524400844362104849039284835938);
42 {
43 INT m;
/* W is first offset by (mb - 1) * ((TWVL / VL) * 30); after that every pass
 * consumes TWVL * 30 twiddle values and handles VL values of m. */
44 for (m = mb, W = W + ((mb - 1) * ((TWVL / VL) * 30)); m < me; m = m + VL, Rp = Rp + (VL * ms), Ip = Ip + (VL * ms), Rm = Rm - (VL * ms), Im = Im - (VL * ms), W = W + (TWVL * 30), MAKE_VOLATILE_STRIDE(64, rs)) {
/* Intermediates that are produced by the load stage and consumed by the
 * twiddle/store stage. */
45 V T8, Tv, TE, T1t, TP, T1w, T10, T1p, Tn, Tw, T13, T1q, TL, T1x, TS;
46 V T1u;
/* Stage 1: load the sixteen inputs and form the first butterflies. */
47 {
48 V T4, TA, Tu, TC, T7, TN, Tr, TB, T2, T3, Ts, Tt, T5, T6, Tp;
49 V Tq, TD, TO, TY, TZ, Tb, TF, Tl, TJ, Te, TG, Ti, TI, T9, Ta;
50 V Tj, Tk, Tc, Td, Tg, Th, Tf, Tm, T11, T12, TH, TK, TQ, TR;
/* Even Rp slots (0, 6, 4, 2), each paired with Rm[7 - k].
 * NOTE(review): VFMACONJ / VFNMSCONJ / VFMSCONJ are defined elsewhere;
 * presumably they add/subtract with the conjugate of the Rm value -- confirm. */
51 T2 = LD(&(Rp[0]), ms, &(Rp[0]));
52 T3 = LD(&(Rm[WS(rs, 7)]), -ms, &(Rm[WS(rs, 1)]));
53 T4 = VFMACONJ(T3, T2);
54 TA = VFNMSCONJ(T3, T2);
55 Ts = LD(&(Rp[WS(rs, 6)]), ms, &(Rp[0]));
56 Tt = LD(&(Rm[WS(rs, 1)]), -ms, &(Rm[WS(rs, 1)]));
57 Tu = VFMACONJ(Tt, Ts);
58 TC = VFMSCONJ(Tt, Ts);
59 T5 = LD(&(Rp[WS(rs, 4)]), ms, &(Rp[0]));
60 T6 = LD(&(Rm[WS(rs, 3)]), -ms, &(Rm[WS(rs, 1)]));
61 T7 = VFMACONJ(T6, T5);
62 TN = VFNMSCONJ(T6, T5);
63 Tp = LD(&(Rp[WS(rs, 2)]), ms, &(Rp[0]));
64 Tq = LD(&(Rm[WS(rs, 5)]), -ms, &(Rm[WS(rs, 1)]));
65 Tr = VFMACONJ(Tq, Tp);
66 TB = VFNMSCONJ(Tq, Tp);
/* Combine the even-slot results; the KP707106781 products are fused
 * into VFMA / VFNMS. */
67 T8 = VSUB(T4, T7);
68 Tv = VSUB(Tr, Tu);
69 TD = VADD(TB, TC);
70 TE = VFMA(LDK(KP707106781), TD, TA);
71 T1t = VFNMS(LDK(KP707106781), TD, TA);
72 TO = VSUB(TB, TC);
73 TP = VFMA(LDK(KP707106781), TO, TN);
74 T1w = VFNMS(LDK(KP707106781), TO, TN);
75 TY = VADD(T4, T7);
76 TZ = VADD(Tr, Tu);
77 T10 = VADD(TY, TZ);
78 T1p = VSUB(TY, TZ);
/* Odd Rp slots (1, 3, 5, 7), again paired with Rm[7 - k]. */
79 T9 = LD(&(Rp[WS(rs, 1)]), ms, &(Rp[WS(rs, 1)]));
80 Ta = LD(&(Rm[WS(rs, 6)]), -ms, &(Rm[0]));
81 Tb = VFMACONJ(Ta, T9);
82 TF = VFNMSCONJ(Ta, T9);
83 Tj = LD(&(Rp[WS(rs, 3)]), ms, &(Rp[WS(rs, 1)]));
84 Tk = LD(&(Rm[WS(rs, 4)]), -ms, &(Rm[0]));
85 Tl = VFMACONJ(Tk, Tj);
86 TJ = VFNMSCONJ(Tk, Tj);
87 Tc = LD(&(Rp[WS(rs, 5)]), ms, &(Rp[WS(rs, 1)]));
88 Td = LD(&(Rm[WS(rs, 2)]), -ms, &(Rm[0]));
89 Te = VFMACONJ(Td, Tc);
90 TG = VFNMSCONJ(Td, Tc);
91 Tg = LD(&(Rp[WS(rs, 7)]), ms, &(Rp[WS(rs, 1)]));
92 Th = LD(&(Rm[0]), -ms, &(Rm[0]));
93 Ti = VFMACONJ(Th, Tg);
94 TI = VFMSCONJ(Th, Tg);
/* Combine the odd-slot results; KP414213562 is applied through
 * VFMA / VFNMS here. */
95 Tf = VSUB(Tb, Te);
96 Tm = VSUB(Ti, Tl);
97 Tn = VADD(Tf, Tm);
98 Tw = VSUB(Tf, Tm);
99 T11 = VADD(Tb, Te);
100 T12 = VADD(Ti, Tl);
101 T13 = VADD(T11, T12);
102 T1q = VSUB(T11, T12);
103 TH = VFNMS(LDK(KP414213562), TG, TF);
104 TK = VFMA(LDK(KP414213562), TJ, TI);
105 TL = VADD(TH, TK);
106 T1x = VSUB(TH, TK);
107 TQ = VFMA(LDK(KP414213562), TF, TG);
108 TR = VFNMS(LDK(KP414213562), TI, TJ);
109 TS = VADD(TQ, TR);
110 T1u = VSUB(TQ, TR);
111 }
/* Stage 2: finish the butterflies, multiply by the twiddles loaded with LDW
 * (VZMUL / VZMULI), and store.  For each output pair (a, b) at slot k,
 * Rp[k] receives VADD(a, b) and Rm[k] receives VCONJ(VSUB(a, b)). */
112 {
113 V T1j, T1R, T1c, T1J, T1g, T1l, T1N, T1T, T1Q, T1a, T1b, T19, T1I, T1e, T1f;
114 V T1d, T1k, T1L, T1M, T1K, T1S, T1h, T1U, T1V, T1i, T1m, T1O, T1P, T1n, T14;
115 V T1r, Ty, T1D, TU, T16, T1z, T1F, TX, T1o, To, Tx, T1, T1C, TM, TT;
116 V Tz, T15, T1v, T1y, T1s, T1E, TV, T1G, T1H, TW, T17, T1A, T1B, T18;
/* T1j is the one output term that is not multiplied by a twiddle. */
117 T1j = VADD(T10, T13);
118 T1Q = LDW(&(W[TWVL * 22]));
119 T1R = VZMUL(T1Q, VFNMSI(T1q, T1p));
120 T1a = VFMA(LDK(KP707106781), Tn, T8);
121 T1b = VFMA(LDK(KP707106781), Tw, Tv);
122 T19 = LDW(&(W[TWVL * 26]));
123 T1c = VZMUL(T19, VFNMSI(T1b, T1a));
124 T1I = LDW(&(W[TWVL * 2]));
125 T1J = VZMUL(T1I, VFMAI(T1b, T1a));
126 T1e = VFMA(LDK(KP923879532), TL, TE);
127 T1f = VFMA(LDK(KP923879532), TS, TP);
128 T1d = LDW(&(W[TWVL * 28]));
129 T1g = VZMULI(T1d, VFNMSI(T1f, T1e));
130 T1k = LDW(&(W[0]));
131 T1l = VZMULI(T1k, VFMAI(T1f, T1e));
132 T1L = VFMA(LDK(KP923879532), T1u, T1t);
133 T1M = VFNMS(LDK(KP923879532), T1x, T1w);
134 T1K = LDW(&(W[TWVL * 4]));
135 T1N = VZMULI(T1K, VFNMSI(T1M, T1L));
136 T1S = LDW(&(W[TWVL * 24]));
137 T1T = VZMULI(T1S, VFMAI(T1M, T1L));
/* First batch of stores: slots 7, 6, 0 and 1 of Rp and Rm. */
138 T1h = VCONJ(VSUB(T1c, T1g));
139 ST(&(Rm[WS(rs, 7)]), T1h, -ms, &(Rm[WS(rs, 1)]));
140 T1U = VCONJ(VSUB(T1R, T1T));
141 ST(&(Rm[WS(rs, 6)]), T1U, -ms, &(Rm[0]));
142 T1V = VADD(T1R, T1T);
143 ST(&(Rp[WS(rs, 6)]), T1V, ms, &(Rp[0]));
144 T1i = VADD(T1c, T1g);
145 ST(&(Rp[WS(rs, 7)]), T1i, ms, &(Rp[WS(rs, 1)]));
146 T1m = VCONJ(VSUB(T1j, T1l));
147 ST(&(Rm[0]), T1m, -ms, &(Rm[0]));
148 T1O = VCONJ(VSUB(T1J, T1N));
149 ST(&(Rm[WS(rs, 1)]), T1O, -ms, &(Rm[WS(rs, 1)]));
150 T1P = VADD(T1J, T1N);
151 ST(&(Rp[WS(rs, 1)]), T1P, ms, &(Rp[WS(rs, 1)]));
152 T1n = VADD(T1j, T1l);
153 ST(&(Rp[0]), T1n, ms, &(Rp[0]));
/* Remaining outputs use the VFNMS counterparts of the products above. */
154 TX = LDW(&(W[TWVL * 14]));
155 T14 = VZMUL(TX, VSUB(T10, T13));
156 T1o = LDW(&(W[TWVL * 6]));
157 T1r = VZMUL(T1o, VFMAI(T1q, T1p));
158 To = VFNMS(LDK(KP707106781), Tn, T8);
159 Tx = VFNMS(LDK(KP707106781), Tw, Tv);
160 T1 = LDW(&(W[TWVL * 10]));
161 Ty = VZMUL(T1, VFNMSI(Tx, To));
162 T1C = LDW(&(W[TWVL * 18]));
163 T1D = VZMUL(T1C, VFMAI(Tx, To));
164 TM = VFNMS(LDK(KP923879532), TL, TE);
165 TT = VFNMS(LDK(KP923879532), TS, TP);
166 Tz = LDW(&(W[TWVL * 12]));
167 TU = VZMULI(Tz, VFNMSI(TT, TM));
168 T15 = LDW(&(W[TWVL * 16]));
169 T16 = VZMULI(T15, VFMAI(TT, TM));
170 T1v = VFNMS(LDK(KP923879532), T1u, T1t);
171 T1y = VFMA(LDK(KP923879532), T1x, T1w);
172 T1s = LDW(&(W[TWVL * 8]));
173 T1z = VZMULI(T1s, VFMAI(T1y, T1v));
174 T1E = LDW(&(W[TWVL * 20]));
175 T1F = VZMULI(T1E, VFNMSI(T1y, T1v));
/* Second batch of stores: slots 3, 5, 4 and 2 of Rp and Rm. */
176 TV = VCONJ(VSUB(Ty, TU));
177 ST(&(Rm[WS(rs, 3)]), TV, -ms, &(Rm[WS(rs, 1)]));
178 T1G = VCONJ(VSUB(T1D, T1F));
179 ST(&(Rm[WS(rs, 5)]), T1G, -ms, &(Rm[WS(rs, 1)]));
180 T1H = VADD(T1D, T1F);
181 ST(&(Rp[WS(rs, 5)]), T1H, ms, &(Rp[WS(rs, 1)]));
182 TW = VADD(Ty, TU);
183 ST(&(Rp[WS(rs, 3)]), TW, ms, &(Rp[WS(rs, 1)]));
184 T17 = VCONJ(VSUB(T14, T16));
185 ST(&(Rm[WS(rs, 4)]), T17, -ms, &(Rm[0]));
186 T1A = VCONJ(VSUB(T1r, T1z));
187 ST(&(Rm[WS(rs, 2)]), T1A, -ms, &(Rm[0]));
188 T1B = VADD(T1r, T1z);
189 ST(&(Rp[WS(rs, 2)]), T1B, ms, &(Rp[0]));
190 T18 = VADD(T14, T16);
191 ST(&(Rp[WS(rs, 4)]), T18, ms, &(Rp[0]));
192 }
193 }
194 }
/* NOTE(review): VLEAVE is defined elsewhere; presumably SIMD-state cleanup
 * before returning -- confirm against the SIMD support header. */
195 VLEAVE();
196 }
197
/*
 * Twiddle instruction list for the FMA variant: fifteen VTW(1, k) entries for
 * k = 1..15, closed by a {TW_NEXT, VL, 0} record.  The kernel above issues
 * fifteen LDW loads from W per loop pass, one per entry listed here.
 * NOTE(review): VTW and TW_NEXT are defined elsewhere; presumably this table
 * describes the layout of the W array handed to the kernel -- confirm.
 */
198 static const tw_instr twinstr[] = {
199 VTW(1, 1),
200 VTW(1, 2),
201 VTW(1, 3),
202 VTW(1, 4),
203 VTW(1, 5),
204 VTW(1, 6),
205 VTW(1, 7),
206 VTW(1, 8),
207 VTW(1, 9),
208 VTW(1, 10),
209 VTW(1, 11),
210 VTW(1, 12),
211 VTW(1, 13),
212 VTW(1, 14),
213 VTW(1, 15),
214 {TW_NEXT, VL, 0}
215 };
216
/*
 * Descriptor for the FMA variant: the value 16, the codelet name string, the
 * twinstr table, &GENUS, and the tuple {53, 30, 50, 0}.  The first three tuple
 * entries equal the addition / multiplication / fused multiply-add counts
 * stated in the generator's summary comment for this variant.
 * NOTE(review): field meanings come from hc2c_desc, declared elsewhere.
 */
217 static const hc2c_desc desc = { 16, XSIMD_STRING("hc2cbdftv_16"), twinstr, &GENUS, {53, 30, 50, 0} };
218
/*
 * Registration entry point (FMA variant): hands the hc2cbdftv_16 kernel and
 * its descriptor to planner p through X(khc2c_register), together with the
 * HC2C_VIA_DFT argument.
 */
219 void XSIMD(codelet_hc2cbdftv_16) (planner *p) {
220 X(khc2c_register) (p, hc2cbdftv_16, &desc, HC2C_VIA_DFT);
221 }
222 #else
223
224 /* Generated by: ../../../genfft/gen_hc2cdft_c.native -simd -compact -variables 4 -pipeline-latency 8 -trivial-stores -variables 32 -no-generate-bytw -n 16 -dif -sign 1 -name hc2cbdftv_16 -include rdft/simd/hc2cbv.h */
225
226 /*
227 * This function contains 103 FP additions, 42 FP multiplications,
228 * (or, 99 additions, 38 multiplications, 4 fused multiply/add),
229 * 83 stack variables, 3 constants, and 32 memory accesses
230 */
231 #include "rdft/simd/hc2cbv.h"
232
/*
 * hc2cbdftv_16, non-FMA variant: generated SIMD kernel of size 16, compiled
 * when neither ARCH_PREFERS_FMA nor ISA_EXTENSION_PREFERS_FMA is defined.  The
 * generator command line recorded for this variant requests -n 16, -dif and
 * -sign 1 (without -fma).
 *
 * Rp, Rm  - read and overwritten in place: every pass loads Rp[WS(rs, 0..7)]
 *           and Rm[WS(rs, 0..7)] and stores back to the same sixteen slots.
 * Ip, Im  - only stepped in the loop header; never dereferenced in this body.
 * W       - twiddle table; 15 entries are fetched per pass with LDW at
 *           offsets TWVL * {0, 2, ..., 28}.
 * rs      - stride handed to WS() to index inside Rp / Rm.
 * mb, me  - the loop runs m = mb, mb + VL, ... while m < me.
 * ms      - per-m stride: Rp / Ip move forward by VL * ms each pass,
 *           Rm / Im move backward by the same amount.
 */
233 static void hc2cbdftv_16(R *Rp, R *Ip, R *Rm, R *Im, const R *W, stride rs, INT mb, INT me, INT ms)
234 {
/* Named constants read later through LDK: numerically sin(pi/8), cos(pi/8)
 * and sqrt(2)/2. */
235 DVK(KP382683432, +0.382683432365089771728459984030398866761344562);
236 DVK(KP923879532, +0.923879532511286756128183189396788286822416626);
237 DVK(KP707106781, +0.707106781186547524400844362104849039284835938);
238 {
239 INT m;
/* W is first offset by (mb - 1) * ((TWVL / VL) * 30); after that every pass
 * consumes TWVL * 30 twiddle values and handles VL values of m. */
240 for (m = mb, W = W + ((mb - 1) * ((TWVL / VL) * 30)); m < me; m = m + VL, Rp = Rp + (VL * ms), Ip = Ip + (VL * ms), Rm = Rm - (VL * ms), Im = Im - (VL * ms), W = W + (TWVL * 30), MAKE_VOLATILE_STRIDE(64, rs)) {
/* Intermediates that are produced by the load stage and consumed by the
 * twiddle/store stage. */
241 V Tf, T16, TZ, T1C, TI, T1a, TV, T1D, T1F, T1G, Ty, T19, TC, T17, TS;
242 V T10;
/* Stage 1: load the sixteen inputs and form the first butterflies. */
243 {
244 V T2, TD, T4, TF, Tc, Tb, Td, T6, T8, T9, T3, TE, Ta, T7, T5;
245 V Te, TX, TY, TG, TH, TT, TU, Tj, TM, Tw, TQ, Tn, TN, Ts, TP;
246 V Tg, Ti, Th, Tt, Tv, Tu, Tk, Tm, Tl, Tr, Tq, Tp, To, Tx, TA;
247 V TB, TO, TR;
/* Even Rp slots (0, 4, 6, 2); every Rm[7 - k] value goes through VCONJ
 * before it is added to or subtracted from its Rp partner. */
248 T2 = LD(&(Rp[0]), ms, &(Rp[0]));
249 TD = LD(&(Rp[WS(rs, 4)]), ms, &(Rp[0]));
250 T3 = LD(&(Rm[WS(rs, 7)]), -ms, &(Rm[WS(rs, 1)]));
251 T4 = VCONJ(T3);
252 TE = LD(&(Rm[WS(rs, 3)]), -ms, &(Rm[WS(rs, 1)]));
253 TF = VCONJ(TE);
254 Tc = LD(&(Rp[WS(rs, 6)]), ms, &(Rp[0]));
255 Ta = LD(&(Rm[WS(rs, 1)]), -ms, &(Rm[WS(rs, 1)]));
256 Tb = VCONJ(Ta);
257 Td = VSUB(Tb, Tc);
258 T6 = LD(&(Rp[WS(rs, 2)]), ms, &(Rp[0]));
259 T7 = LD(&(Rm[WS(rs, 5)]), -ms, &(Rm[WS(rs, 1)]));
260 T8 = VCONJ(T7);
261 T9 = VSUB(T6, T8);
/* Combine the even-slot results; KP707106781 is applied with explicit VMUL. */
262 T5 = VSUB(T2, T4);
263 Te = VMUL(LDK(KP707106781), VADD(T9, Td));
264 Tf = VADD(T5, Te);
265 T16 = VSUB(T5, Te);
266 TX = VADD(T2, T4);
267 TY = VADD(TD, TF);
268 TZ = VSUB(TX, TY);
269 T1C = VADD(TX, TY);
270 TG = VSUB(TD, TF);
271 TH = VMUL(LDK(KP707106781), VSUB(T9, Td));
272 TI = VADD(TG, TH);
273 T1a = VSUB(TH, TG);
274 TT = VADD(T6, T8);
275 TU = VADD(Tb, Tc);
276 TV = VSUB(TT, TU);
277 T1D = VADD(TT, TU);
/* Odd Rp slots (1, 3, 5, 7), paired with VCONJ of Rm[7 - k]. */
278 Tg = LD(&(Rp[WS(rs, 1)]), ms, &(Rp[WS(rs, 1)]));
279 Th = LD(&(Rm[WS(rs, 6)]), -ms, &(Rm[0]));
280 Ti = VCONJ(Th);
281 Tj = VSUB(Tg, Ti);
282 TM = VADD(Tg, Ti);
283 Tt = LD(&(Rp[WS(rs, 3)]), ms, &(Rp[WS(rs, 1)]));
284 Tu = LD(&(Rm[WS(rs, 4)]), -ms, &(Rm[0]));
285 Tv = VCONJ(Tu);
286 Tw = VSUB(Tt, Tv);
287 TQ = VADD(Tt, Tv);
288 Tk = LD(&(Rp[WS(rs, 5)]), ms, &(Rp[WS(rs, 1)]));
289 Tl = LD(&(Rm[WS(rs, 2)]), -ms, &(Rm[0]));
290 Tm = VCONJ(Tl);
291 Tn = VSUB(Tk, Tm);
292 TN = VADD(Tk, Tm);
293 Tr = LD(&(Rp[WS(rs, 7)]), ms, &(Rp[WS(rs, 1)]));
294 Tp = LD(&(Rm[0]), -ms, &(Rm[0]));
295 Tq = VCONJ(Tp);
296 Ts = VSUB(Tq, Tr);
297 TP = VADD(Tq, Tr);
298 T1F = VADD(TM, TN);
299 T1G = VADD(TP, TQ);
/* The only fused operations in this variant: four VFMA / VFNMS calls that
 * combine KP923879532 and KP382683432 products of the odd-slot differences. */
300 To = VFNMS(LDK(KP382683432), Tn, VMUL(LDK(KP923879532), Tj));
301 Tx = VFMA(LDK(KP923879532), Ts, VMUL(LDK(KP382683432), Tw));
302 Ty = VADD(To, Tx);
303 T19 = VSUB(To, Tx);
304 TA = VFMA(LDK(KP382683432), Tj, VMUL(LDK(KP923879532), Tn));
305 TB = VFNMS(LDK(KP382683432), Ts, VMUL(LDK(KP923879532), Tw));
306 TC = VADD(TA, TB);
307 T17 = VSUB(TA, TB);
308 TO = VSUB(TM, TN);
309 TR = VSUB(TP, TQ);
310 TS = VMUL(LDK(KP707106781), VSUB(TO, TR));
311 T10 = VMUL(LDK(KP707106781), VADD(TO, TR));
312 }
/* Stage 2: finish the butterflies, multiply by the twiddles loaded with LDW
 * (VZMUL / VZMULI), then store.  For each output pair (a, b) at slot k,
 * Rp[k] receives VADD(a, b) and Rm[k] receives VCONJ(VSUB(b, a)).
 * NOTE(review): VBYI is defined elsewhere; presumably multiplication by the
 * imaginary unit -- confirm. */
313 {
314 V T21, T1W, T1u, T20, T1I, T1O, TK, T1S, T12, T1e, T1k, T1A, T1o, T1w, T1c;
315 V T1M, T1U, T1V, T1T, T1s, T1t, T1r, T1Z, T1E, T1H, T1B, T1N, Tz, TJ, T1;
316 V T1R, TW, T11, TL, T1d, T1i, T1j, T1h, T1z, T1m, T1n, T1l, T1v, T18, T1b;
317 V T15, T1L, T13, T1g, T1X, T23, T14, T1f, T1Y, T22, T1p, T1y, T1J, T1Q, T1q;
318 V T1x, T1K, T1P;
319 T1U = VADD(T1C, T1D);
320 T1V = VADD(T1F, T1G);
/* T21 is the one output term that is not multiplied by a twiddle. */
321 T21 = VADD(T1U, T1V);
322 T1T = LDW(&(W[TWVL * 14]));
323 T1W = VZMUL(T1T, VSUB(T1U, T1V));
324 T1s = VADD(Tf, Ty);
325 T1t = VBYI(VADD(TI, TC));
326 T1r = LDW(&(W[TWVL * 28]));
327 T1u = VZMULI(T1r, VSUB(T1s, T1t));
328 T1Z = LDW(&(W[0]));
329 T20 = VZMULI(T1Z, VADD(T1s, T1t));
330 T1E = VSUB(T1C, T1D);
331 T1H = VBYI(VSUB(T1F, T1G));
332 T1B = LDW(&(W[TWVL * 22]));
333 T1I = VZMUL(T1B, VSUB(T1E, T1H));
334 T1N = LDW(&(W[TWVL * 6]));
335 T1O = VZMUL(T1N, VADD(T1E, T1H));
336 Tz = VSUB(Tf, Ty);
337 TJ = VBYI(VSUB(TC, TI));
338 T1 = LDW(&(W[TWVL * 12]));
339 TK = VZMULI(T1, VADD(Tz, TJ));
340 T1R = LDW(&(W[TWVL * 16]));
341 T1S = VZMULI(T1R, VSUB(Tz, TJ));
342 TW = VBYI(VSUB(TS, TV));
343 T11 = VSUB(TZ, T10);
344 TL = LDW(&(W[TWVL * 10]));
345 T12 = VZMUL(TL, VADD(TW, T11));
346 T1d = LDW(&(W[TWVL * 18]));
347 T1e = VZMUL(T1d, VSUB(T11, TW));
348 T1i = VBYI(VADD(T1a, T19));
349 T1j = VADD(T16, T17);
350 T1h = LDW(&(W[TWVL * 4]));
351 T1k = VZMULI(T1h, VADD(T1i, T1j));
352 T1z = LDW(&(W[TWVL * 24]));
353 T1A = VZMULI(T1z, VSUB(T1j, T1i));
354 T1m = VBYI(VADD(TV, TS));
355 T1n = VADD(TZ, T10);
356 T1l = LDW(&(W[TWVL * 2]));
357 T1o = VZMUL(T1l, VADD(T1m, T1n));
358 T1v = LDW(&(W[TWVL * 26]));
359 T1w = VZMUL(T1v, VSUB(T1n, T1m));
360 T18 = VSUB(T16, T17);
361 T1b = VBYI(VSUB(T19, T1a));
362 T15 = LDW(&(W[TWVL * 20]));
363 T1c = VZMULI(T15, VSUB(T18, T1b));
364 T1L = LDW(&(W[TWVL * 8]));
365 T1M = VZMULI(T1L, VADD(T1b, T18));
/* All sixteen stores are issued here, after every twiddle product is ready. */
366 T13 = VADD(TK, T12);
367 ST(&(Rp[WS(rs, 3)]), T13, ms, &(Rp[WS(rs, 1)]));
368 T1g = VCONJ(VSUB(T1e, T1c));
369 ST(&(Rm[WS(rs, 5)]), T1g, -ms, &(Rm[WS(rs, 1)]));
370 T1X = VADD(T1S, T1W);
371 ST(&(Rp[WS(rs, 4)]), T1X, ms, &(Rp[0]));
372 T23 = VCONJ(VSUB(T21, T20));
373 ST(&(Rm[0]), T23, -ms, &(Rm[0]));
374 T14 = VCONJ(VSUB(T12, TK));
375 ST(&(Rm[WS(rs, 3)]), T14, -ms, &(Rm[WS(rs, 1)]));
376 T1f = VADD(T1c, T1e);
377 ST(&(Rp[WS(rs, 5)]), T1f, ms, &(Rp[WS(rs, 1)]));
378 T1Y = VCONJ(VSUB(T1W, T1S));
379 ST(&(Rm[WS(rs, 4)]), T1Y, -ms, &(Rm[0]));
380 T22 = VADD(T20, T21);
381 ST(&(Rp[0]), T22, ms, &(Rp[0]));
382 T1p = VADD(T1k, T1o);
383 ST(&(Rp[WS(rs, 1)]), T1p, ms, &(Rp[WS(rs, 1)]));
384 T1y = VCONJ(VSUB(T1w, T1u));
385 ST(&(Rm[WS(rs, 7)]), T1y, -ms, &(Rm[WS(rs, 1)]));
386 T1J = VADD(T1A, T1I);
387 ST(&(Rp[WS(rs, 6)]), T1J, ms, &(Rp[0]));
388 T1Q = VCONJ(VSUB(T1O, T1M));
389 ST(&(Rm[WS(rs, 2)]), T1Q, -ms, &(Rm[0]));
390 T1q = VCONJ(VSUB(T1o, T1k));
391 ST(&(Rm[WS(rs, 1)]), T1q, -ms, &(Rm[WS(rs, 1)]));
392 T1x = VADD(T1u, T1w);
393 ST(&(Rp[WS(rs, 7)]), T1x, ms, &(Rp[WS(rs, 1)]));
394 T1K = VCONJ(VSUB(T1I, T1A));
395 ST(&(Rm[WS(rs, 6)]), T1K, -ms, &(Rm[0]));
396 T1P = VADD(T1M, T1O);
397 ST(&(Rp[WS(rs, 2)]), T1P, ms, &(Rp[0]));
398 }
399 }
400 }
/* NOTE(review): VLEAVE is defined elsewhere; presumably SIMD-state cleanup
 * before returning -- confirm against the SIMD support header. */
401 VLEAVE();
402 }
403
/*
 * Twiddle instruction list for the non-FMA variant: fifteen VTW(1, k) entries
 * for k = 1..15, closed by a {TW_NEXT, VL, 0} record.  The kernel above issues
 * fifteen LDW loads from W per loop pass, one per entry listed here.
 * NOTE(review): VTW and TW_NEXT are defined elsewhere; presumably this table
 * describes the layout of the W array handed to the kernel -- confirm.
 */
404 static const tw_instr twinstr[] = {
405 VTW(1, 1),
406 VTW(1, 2),
407 VTW(1, 3),
408 VTW(1, 4),
409 VTW(1, 5),
410 VTW(1, 6),
411 VTW(1, 7),
412 VTW(1, 8),
413 VTW(1, 9),
414 VTW(1, 10),
415 VTW(1, 11),
416 VTW(1, 12),
417 VTW(1, 13),
418 VTW(1, 14),
419 VTW(1, 15),
420 {TW_NEXT, VL, 0}
421 };
422
/*
 * Descriptor for the non-FMA variant: the value 16, the codelet name string,
 * the twinstr table, &GENUS, and the tuple {99, 38, 4, 0}.  The first three
 * tuple entries equal the addition / multiplication / fused multiply-add
 * counts stated in the generator's summary comment for this variant.
 * NOTE(review): field meanings come from hc2c_desc, declared elsewhere.
 */
423 static const hc2c_desc desc = { 16, XSIMD_STRING("hc2cbdftv_16"), twinstr, &GENUS, {99, 38, 4, 0} };
424
/*
 * Registration entry point (non-FMA variant): hands the hc2cbdftv_16 kernel
 * and its descriptor to planner p through X(khc2c_register), together with
 * the HC2C_VIA_DFT argument.
 */
425 void XSIMD(codelet_hc2cbdftv_16) (planner *p) {
426 X(khc2c_register) (p, hc2cbdftv_16, &desc, HC2C_VIA_DFT);
427 }
428 #endif