comparison src/fftw-3.3.8/rdft/simd/common/hc2cfdftv_20.c @ 167:bd3cc4d1df30

Add FFTW 3.3.8 source, and a Linux build
author Chris Cannam <cannam@all-day-breakfast.com>
date Tue, 19 Nov 2019 14:52:55 +0000
parents
children
comparison
equal deleted inserted replaced
166:cbd6d7e562c7 167:bd3cc4d1df30
1 /*
2 * Copyright (c) 2003, 2007-14 Matteo Frigo
3 * Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
4 *
5 * This program is free software; you can redistribute it and/or modify
6 * it under the terms of the GNU General Public License as published by
7 * the Free Software Foundation; either version 2 of the License, or
8 * (at your option) any later version.
9 *
10 * This program is distributed in the hope that it will be useful,
11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 * GNU General Public License for more details.
14 *
15 * You should have received a copy of the GNU General Public License
16 * along with this program; if not, write to the Free Software
17 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
18 *
19 */
20
21 /* This file was automatically generated --- DO NOT EDIT */
22 /* Generated on Thu May 24 08:08:11 EDT 2018 */
23
24 #include "rdft/codelet-rdft.h"
25
26 #if defined(ARCH_PREFERS_FMA) || defined(ISA_EXTENSION_PREFERS_FMA)
27
28 /* Generated by: ../../../genfft/gen_hc2cdft_c.native -fma -simd -compact -variables 4 -pipeline-latency 8 -trivial-stores -variables 32 -no-generate-bytw -n 20 -dit -name hc2cfdftv_20 -include rdft/simd/hc2cfv.h */
29
30 /*
31 * This function contains 143 FP additions, 128 FP multiplications,
32 * (or, 77 additions, 62 multiplications, 66 fused multiply/add),
33 * 129 stack variables, 5 constants, and 40 memory accesses
34 */
35 #include "rdft/simd/hc2cfv.h"
36
/*
 * hc2cfdftv_20 (FMA variant, compiled when ARCH_PREFERS_FMA or
 * ISA_EXTENSION_PREFERS_FMA is defined): generated SIMD codelet of size 20.
 *
 * For each loop iteration the body
 *   - loads the vectors Rp[WS(rs, k)] and Rm[WS(rs, k)] for k = 0..9,
 *   - combines each Rp/Rm pair with VFMACONJ and VFNMSCONJ and multiplies the
 *     results by twiddle vectors read with LDW from W[0], W[TWVL * 2], ...,
 *     W[TWVL * 36] (19 loads; only T3 = VFMACONJ(T2, T1) is used without a
 *     twiddle),
 *   - runs the generated add/sub/FMA network,
 *   - stores ten results to Rp[WS(rs, 0..9)] and ten results, each wrapped in
 *     VCONJ, to Rm[WS(rs, 0..9)].
 * Every stored value is multiplied by KP500000000 (0.5) right before the store.
 * All loads of an iteration are issued before its first store.
 *
 * Parameters (as used in this body):
 *   Rp, Rm - read from and written to; Rp advances by VL * ms and Rm moves
 *            back by VL * ms after every iteration.
 *   Ip, Im - only stepped alongside Rp / Rm; never dereferenced here.
 *   W      - twiddle table; offset by (mb - 1) * ((TWVL / VL) * 38) on entry
 *            and advanced by TWVL * 38 per iteration.
 *   rs     - stride handed to WS() when indexing Rp / Rm.
 *   mb, me - the loop runs for m = mb, mb + VL, ... while m < me.
 *   ms     - per-iteration pointer step (scaled by VL); also passed to LD / ST.
 *
 * NOTE(review): the exact semantics of LD, ST, LDW, VZMULJ, VZMULIJ, VFMAI,
 * VFNMSI, MAKE_VOLATILE_STRIDE and VLEAVE come from rdft/simd/hc2cfv.h and
 * the SIMD headers, which are not visible here -- confirm there.
 */
37 static void hc2cfdftv_20(R *Rp, R *Ip, R *Rm, R *Im, const R *W, stride rs, INT mb, INT me, INT ms)
38 {
/* Vector constants. Numerically: 0.559016994... = sqrt(5)/4,
 * 0.618033988... = (sqrt(5) - 1)/2, 0.951056516... = sin(2*pi/5). */
39 DVK(KP559016994, +0.559016994374947424102293417182819058860154590);
40 DVK(KP250000000, +0.250000000000000000000000000000000000000000000);
41 DVK(KP618033988, +0.618033988749894848204586834365638117720309180);
42 DVK(KP951056516, +0.951056516295153572116439333379382143405698634);
43 DVK(KP500000000, +0.500000000000000000000000000000000000000000000);
44 {
45 INT m;
/* Main loop: m steps by VL; Rp / Ip walk forward and Rm / Im walk backward
 * by VL * ms, and W advances by TWVL * 38 on every iteration. */
46 for (m = mb, W = W + ((mb - 1) * ((TWVL / VL) * 38)); m < me; m = m + VL, Rp = Rp + (VL * ms), Ip = Ip + (VL * ms), Rm = Rm - (VL * ms), Im = Im - (VL * ms), W = W + (TWVL * 38), MAKE_VOLATILE_STRIDE(80, rs)) {
47 V T1O, T2j, T2c, T2b, T2i, T1X, Tx, TM, TN, T1x, T1y, T1z, T1u, T1v, T1w;
48 V T12, T1d, T1e, T24, T2g, Ti, T1t, T1V, T29, T26, T27, T1W, T25, T1H, T1L;
49 V T1B, T1K, T1E, T1F, T1G, T1D, T1A, T1C, T1N, T1I, T1J, T1M;
50 {
51 V T3, T1Y, TC, T7, Tn, T1P, Tc, Tg, Tw, T1Z, TS, T1S, TL, T21, T17;
52 V T1Q, T11, T22, T1c, T1T, T1, T2, Tz, T5, T6, TB, Ty, TA, T4, Ta;
53 V Tb, Tk, Te, Tf, Tm, Tj, Tl, T9, Td, T20, T23, T8, Th, T1R, T1U;
/* Load phase: Rp / Rm pairs for indices 0, 5, 2 and 7, each combined with
 * VFMACONJ / VFNMSCONJ and multiplied by its twiddle vector. */
54 T1 = LD(&(Rp[0]), ms, &(Rp[0]));
55 T2 = LD(&(Rm[0]), -ms, &(Rm[0]));
56 Ty = LDW(&(W[0]));
57 Tz = VZMULIJ(Ty, VFNMSCONJ(T2, T1));
58 T5 = LD(&(Rp[WS(rs, 5)]), ms, &(Rp[WS(rs, 1)]));
59 T6 = LD(&(Rm[WS(rs, 5)]), -ms, &(Rm[WS(rs, 1)]));
60 TA = LDW(&(W[TWVL * 20]));
61 TB = VZMULIJ(TA, VFNMSCONJ(T6, T5));
62 T3 = VFMACONJ(T2, T1);
63 T1Y = VSUB(TB, Tz);
64 TC = VADD(Tz, TB);
65 T4 = LDW(&(W[TWVL * 18]));
66 T7 = VZMULJ(T4, VFMACONJ(T6, T5));
67 Ta = LD(&(Rp[WS(rs, 2)]), ms, &(Rp[0]));
68 Tb = LD(&(Rm[WS(rs, 2)]), -ms, &(Rm[0]));
69 Tj = LDW(&(W[TWVL * 6]));
70 Tk = VZMULJ(Tj, VFMACONJ(Tb, Ta));
71 Te = LD(&(Rp[WS(rs, 7)]), ms, &(Rp[WS(rs, 1)]));
72 Tf = LD(&(Rm[WS(rs, 7)]), -ms, &(Rm[WS(rs, 1)]));
73 Tl = LDW(&(W[TWVL * 26]));
74 Tm = VZMULJ(Tl, VFMACONJ(Tf, Te));
75 Tn = VADD(Tk, Tm);
76 T1P = VSUB(Tk, Tm);
77 T9 = LDW(&(W[TWVL * 8]));
78 Tc = VZMULIJ(T9, VFNMSCONJ(Tb, Ta));
79 Td = LDW(&(W[TWVL * 28]));
80 Tg = VZMULIJ(Td, VFNMSCONJ(Tf, Te));
81 {
82 V Tr, TP, Tv, TR, Tp, Tq, To, TO, Tt, Tu, Ts, TQ, TG, T14, TK;
83 V T16, TE, TF, TD, T13, TI, TJ, TH, T15, TW, T19, T10, T1b, TU, TV;
84 V TT, T18, TY, TZ, TX, T1a;
/* Remaining loads: indices 4, 9, 8, 3, 6 and 1, handled the same way. */
85 Tp = LD(&(Rp[WS(rs, 4)]), ms, &(Rp[0]));
86 Tq = LD(&(Rm[WS(rs, 4)]), -ms, &(Rm[0]));
87 To = LDW(&(W[TWVL * 16]));
88 Tr = VZMULIJ(To, VFNMSCONJ(Tq, Tp));
89 TO = LDW(&(W[TWVL * 14]));
90 TP = VZMULJ(TO, VFMACONJ(Tq, Tp));
91 Tt = LD(&(Rp[WS(rs, 9)]), ms, &(Rp[WS(rs, 1)]));
92 Tu = LD(&(Rm[WS(rs, 9)]), -ms, &(Rm[WS(rs, 1)]));
93 Ts = LDW(&(W[TWVL * 36]));
94 Tv = VZMULIJ(Ts, VFNMSCONJ(Tu, Tt));
95 TQ = LDW(&(W[TWVL * 34]));
96 TR = VZMULJ(TQ, VFMACONJ(Tu, Tt));
97 Tw = VADD(Tr, Tv);
98 T1Z = VSUB(Tv, Tr);
99 TS = VADD(TP, TR);
100 T1S = VSUB(TP, TR);
101 TE = LD(&(Rp[WS(rs, 8)]), ms, &(Rp[0]));
102 TF = LD(&(Rm[WS(rs, 8)]), -ms, &(Rm[0]));
103 TD = LDW(&(W[TWVL * 30]));
104 TG = VZMULJ(TD, VFMACONJ(TF, TE));
105 T13 = LDW(&(W[TWVL * 32]));
106 T14 = VZMULIJ(T13, VFNMSCONJ(TF, TE));
107 TI = LD(&(Rp[WS(rs, 3)]), ms, &(Rp[WS(rs, 1)]));
108 TJ = LD(&(Rm[WS(rs, 3)]), -ms, &(Rm[WS(rs, 1)]));
109 TH = LDW(&(W[TWVL * 10]));
110 TK = VZMULJ(TH, VFMACONJ(TJ, TI));
111 T15 = LDW(&(W[TWVL * 12]));
112 T16 = VZMULIJ(T15, VFNMSCONJ(TJ, TI));
113 TL = VADD(TG, TK);
114 T21 = VSUB(T16, T14);
115 T17 = VADD(T14, T16);
116 T1Q = VSUB(TK, TG);
117 TU = LD(&(Rp[WS(rs, 6)]), ms, &(Rp[0]));
118 TV = LD(&(Rm[WS(rs, 6)]), -ms, &(Rm[0]));
119 TT = LDW(&(W[TWVL * 24]));
120 TW = VZMULIJ(TT, VFNMSCONJ(TV, TU));
121 T18 = LDW(&(W[TWVL * 22]));
122 T19 = VZMULJ(T18, VFMACONJ(TV, TU));
123 TY = LD(&(Rp[WS(rs, 1)]), ms, &(Rp[WS(rs, 1)]));
124 TZ = LD(&(Rm[WS(rs, 1)]), -ms, &(Rm[WS(rs, 1)]));
125 TX = LDW(&(W[TWVL * 4]));
126 T10 = VZMULIJ(TX, VFNMSCONJ(TZ, TY));
127 T1a = LDW(&(W[TWVL * 2]));
128 T1b = VZMULJ(T1a, VFMACONJ(TZ, TY));
129 T11 = VADD(TW, T10);
130 T22 = VSUB(T10, TW);
131 T1c = VADD(T19, T1b);
132 T1T = VSUB(T1b, T19);
133 }
/* Sums and differences of the twiddled inputs; the values assigned to
 * variables declared in the enclosing scope are consumed by the store
 * phases below. */
134 T1O = VSUB(T3, T7);
135 T2j = VADD(T1S, T1T);
136 T2c = VSUB(T21, T22);
137 T2b = VSUB(T1Y, T1Z);
138 T2i = VADD(T1P, T1Q);
139 T1X = VSUB(Tg, Tc);
140 Tx = VSUB(Tn, Tw);
141 TM = VSUB(TC, TL);
142 TN = VSUB(Tx, TM);
143 T1x = VADD(TS, T11);
144 T1y = VADD(T17, T1c);
145 T1z = VADD(T1x, T1y);
146 T1u = VADD(Tn, Tw);
147 T1v = VADD(TC, TL);
148 T1w = VADD(T1u, T1v);
149 T12 = VSUB(TS, T11);
150 T1d = VSUB(T17, T1c);
151 T1e = VSUB(T12, T1d);
152 T20 = VADD(T1Y, T1Z);
153 T23 = VADD(T21, T22);
154 T24 = VADD(T20, T23);
155 T2g = VSUB(T23, T20);
156 T8 = VADD(T3, T7);
157 Th = VADD(Tc, Tg);
158 Ti = VSUB(T8, Th);
159 T1t = VADD(T8, Th);
160 T1R = VSUB(T1P, T1Q);
161 T1U = VSUB(T1S, T1T);
162 T1V = VADD(T1R, T1U);
163 T29 = VSUB(T1R, T1U);
164 }
/* Outputs Rp[5] and Rm[4]. */
165 T1W = VADD(T1O, T1V);
166 T25 = VADD(T1X, T24);
167 T26 = VMUL(LDK(KP500000000), VFNMSI(T25, T1W));
168 T27 = VCONJ(VMUL(LDK(KP500000000), VFMAI(T25, T1W)));
169 ST(&(Rp[WS(rs, 5)]), T26, ms, &(Rp[WS(rs, 1)]));
170 ST(&(Rm[WS(rs, 4)]), T27, -ms, &(Rm[0]));
/* Outputs Rm[9], Rm[5], Rm[1], Rp[2] and Rp[6]. */
171 T1F = VSUB(T1x, T1y);
172 T1G = VSUB(T1u, T1v);
173 T1H = VMUL(LDK(KP951056516), VFNMS(LDK(KP618033988), T1G, T1F));
174 T1L = VMUL(LDK(KP951056516), VFMA(LDK(KP618033988), T1F, T1G));
175 T1D = VSUB(T1w, T1z);
176 T1A = VADD(T1w, T1z);
177 T1C = VFNMS(LDK(KP250000000), T1A, T1t);
178 T1B = VCONJ(VMUL(LDK(KP500000000), VADD(T1t, T1A)));
179 T1K = VFMA(LDK(KP559016994), T1D, T1C);
180 T1E = VFNMS(LDK(KP559016994), T1D, T1C);
181 ST(&(Rm[WS(rs, 9)]), T1B, -ms, &(Rm[WS(rs, 1)]));
182 T1N = VCONJ(VMUL(LDK(KP500000000), VFMAI(T1L, T1K)));
183 ST(&(Rm[WS(rs, 5)]), T1N, -ms, &(Rm[WS(rs, 1)]));
184 T1I = VCONJ(VMUL(LDK(KP500000000), VFNMSI(T1H, T1E)));
185 ST(&(Rm[WS(rs, 1)]), T1I, -ms, &(Rm[WS(rs, 1)]));
186 T1J = VMUL(LDK(KP500000000), VFMAI(T1H, T1E));
187 ST(&(Rp[WS(rs, 2)]), T1J, ms, &(Rp[0]));
188 T1M = VMUL(LDK(KP500000000), VFNMSI(T1L, T1K));
189 ST(&(Rp[WS(rs, 6)]), T1M, ms, &(Rp[0]));
190 {
191 V T1m, T1q, T1g, T1p, T1j, T1k, T1l, T1i, T1f, T1h, T1s, T1n, T1o, T1r, T2e;
192 V T2A, T2o, T2u, T2l, T2B, T2p, T2x, T2d, T2t, T2a, T2s, T28, T2k, T2w, T2h;
193 V T2v, T2f, T2m, T2C, T2D, T2n, T2q, T2y, T2z, T2r;
/* Outputs Rp[0], Rm[7], Rm[3], Rp[4] and Rp[8]. */
194 T1k = VADD(Tx, TM);
195 T1l = VADD(T12, T1d);
196 T1m = VMUL(LDK(KP951056516), VFMA(LDK(KP618033988), T1l, T1k));
197 T1q = VMUL(LDK(KP951056516), VFNMS(LDK(KP618033988), T1k, T1l));
198 T1i = VSUB(TN, T1e);
199 T1f = VADD(TN, T1e);
200 T1h = VFNMS(LDK(KP250000000), T1f, Ti);
201 T1g = VMUL(LDK(KP500000000), VADD(Ti, T1f));
202 T1p = VFNMS(LDK(KP559016994), T1i, T1h);
203 T1j = VFMA(LDK(KP559016994), T1i, T1h);
204 ST(&(Rp[0]), T1g, ms, &(Rp[0]));
205 T1s = VCONJ(VMUL(LDK(KP500000000), VFMAI(T1q, T1p)));
206 ST(&(Rm[WS(rs, 7)]), T1s, -ms, &(Rm[WS(rs, 1)]));
207 T1n = VCONJ(VMUL(LDK(KP500000000), VFNMSI(T1m, T1j)));
208 ST(&(Rm[WS(rs, 3)]), T1n, -ms, &(Rm[WS(rs, 1)]));
209 T1o = VMUL(LDK(KP500000000), VFMAI(T1m, T1j));
210 ST(&(Rp[WS(rs, 4)]), T1o, ms, &(Rp[0]));
211 T1r = VMUL(LDK(KP500000000), VFNMSI(T1q, T1p));
212 ST(&(Rp[WS(rs, 8)]), T1r, ms, &(Rp[0]));
/* Outputs Rp[9], Rm[6], Rp[7], Rm[8], Rp[1], Rm[2], Rp[3] and Rm[0]. */
213 T2d = VFMA(LDK(KP618033988), T2c, T2b);
214 T2t = VFNMS(LDK(KP618033988), T2b, T2c);
215 T28 = VFNMS(LDK(KP250000000), T1V, T1O);
216 T2a = VFMA(LDK(KP559016994), T29, T28);
217 T2s = VFNMS(LDK(KP559016994), T29, T28);
218 T2e = VFNMS(LDK(KP951056516), T2d, T2a);
219 T2A = VFMA(LDK(KP951056516), T2t, T2s);
220 T2o = VFMA(LDK(KP951056516), T2d, T2a);
221 T2u = VFNMS(LDK(KP951056516), T2t, T2s);
222 T2k = VFMA(LDK(KP618033988), T2j, T2i);
223 T2w = VFNMS(LDK(KP618033988), T2i, T2j);
224 T2f = VFNMS(LDK(KP250000000), T24, T1X);
225 T2h = VFNMS(LDK(KP559016994), T2g, T2f);
226 T2v = VFMA(LDK(KP559016994), T2g, T2f);
227 T2l = VFNMS(LDK(KP951056516), T2k, T2h);
228 T2B = VFMA(LDK(KP951056516), T2w, T2v);
229 T2p = VFMA(LDK(KP951056516), T2k, T2h);
230 T2x = VFNMS(LDK(KP951056516), T2w, T2v);
231 T2m = VMUL(LDK(KP500000000), VFNMSI(T2l, T2e));
232 ST(&(Rp[WS(rs, 9)]), T2m, ms, &(Rp[WS(rs, 1)]));
233 T2C = VCONJ(VMUL(LDK(KP500000000), VFNMSI(T2B, T2A)));
234 ST(&(Rm[WS(rs, 6)]), T2C, -ms, &(Rm[0]));
235 T2D = VMUL(LDK(KP500000000), VFMAI(T2B, T2A));
236 ST(&(Rp[WS(rs, 7)]), T2D, ms, &(Rp[WS(rs, 1)]));
237 T2n = VCONJ(VMUL(LDK(KP500000000), VFMAI(T2l, T2e)));
238 ST(&(Rm[WS(rs, 8)]), T2n, -ms, &(Rm[0]));
239 T2q = VMUL(LDK(KP500000000), VFNMSI(T2p, T2o));
240 ST(&(Rp[WS(rs, 1)]), T2q, ms, &(Rp[WS(rs, 1)]));
241 T2y = VCONJ(VMUL(LDK(KP500000000), VFNMSI(T2x, T2u)));
242 ST(&(Rm[WS(rs, 2)]), T2y, -ms, &(Rm[0]));
243 T2z = VMUL(LDK(KP500000000), VFMAI(T2x, T2u));
244 ST(&(Rp[WS(rs, 3)]), T2z, ms, &(Rp[WS(rs, 1)]));
245 T2r = VCONJ(VMUL(LDK(KP500000000), VFMAI(T2p, T2o)));
246 ST(&(Rm[0]), T2r, -ms, &(Rm[0]));
247 }
248 }
249 }
/* NOTE(review): VLEAVE() is defined outside this file; presumably SIMD
 * state clean-up on exit -- confirm in the SIMD support headers. */
250 VLEAVE();
251 }
252
/*
 * Twiddle-instruction table for hc2cfdftv_20 (FMA variant): one VTW(1, k)
 * entry for each k = 1..19, terminated by a {TW_NEXT, VL, 0} record.
 * The 19 entries match the 19 LDW twiddle loads in the kernel.
 * NOTE(review): the expansion of VTW and the meaning of its first argument
 * are defined outside this file -- confirm there.
 */
253 static const tw_instr twinstr[] = {
254 VTW(1, 1),
255 VTW(1, 2),
256 VTW(1, 3),
257 VTW(1, 4),
258 VTW(1, 5),
259 VTW(1, 6),
260 VTW(1, 7),
261 VTW(1, 8),
262 VTW(1, 9),
263 VTW(1, 10),
264 VTW(1, 11),
265 VTW(1, 12),
266 VTW(1, 13),
267 VTW(1, 14),
268 VTW(1, 15),
269 VTW(1, 16),
270 VTW(1, 17),
271 VTW(1, 18),
272 VTW(1, 19),
273 {TW_NEXT, VL, 0}
274 };
275
/*
 * Codelet descriptor (FMA variant): size 20, the SIMD-decorated name string,
 * the twiddle table, the GENUS record, and the operation counts
 * {77, 62, 66, 0}, which repeat the "77 additions, 62 multiplications,
 * 66 fused multiply/add" figures from the generator comment for this variant.
 */
276 static const hc2c_desc desc = { 20, XSIMD_STRING("hc2cfdftv_20"), twinstr, &GENUS, {77, 62, 66, 0} };
277
/*
 * Registration entry point (FMA variant): hands the hc2cfdftv_20 kernel and
 * its descriptor to planner p through X(khc2c_register), tagged
 * HC2C_VIA_DFT.
 */
278 void XSIMD(codelet_hc2cfdftv_20) (planner *p) {
279 X(khc2c_register) (p, hc2cfdftv_20, &desc, HC2C_VIA_DFT);
280 }
281 #else
282
283 /* Generated by: ../../../genfft/gen_hc2cdft_c.native -simd -compact -variables 4 -pipeline-latency 8 -trivial-stores -variables 32 -no-generate-bytw -n 20 -dit -name hc2cfdftv_20 -include rdft/simd/hc2cfv.h */
284
285 /*
286 * This function contains 143 FP additions, 77 FP multiplications,
287 * (or, 131 additions, 65 multiplications, 12 fused multiply/add),
288 * 141 stack variables, 9 constants, and 40 memory accesses
289 */
290 #include "rdft/simd/hc2cfv.h"
291
/*
 * hc2cfdftv_20 (non-FMA variant, compiled when neither ARCH_PREFERS_FMA nor
 * ISA_EXTENSION_PREFERS_FMA is defined): generated SIMD codelet of size 20.
 *
 * For each loop iteration the body
 *   - loads the vectors Rp[WS(rs, k)] and Rm[WS(rs, k)] for k = 0..9,
 *   - conjugates each Rm vector with VCONJ, forms the sum and difference with
 *     the matching Rp vector, and multiplies them by twiddle vectors read
 *     with LDW from W[0], W[TWVL * 2], ..., W[TWVL * 36] (19 loads; only
 *     T1u = VADD(T4, T3) is used without a twiddle),
 *   - runs the generated add/sub/multiply network,
 *   - stores ten results to Rp[WS(rs, 0..9)] and ten results, each wrapped in
 *     VCONJ, to Rm[WS(rs, 0..9)].
 * All loads of an iteration are issued before its first store.
 *
 * Parameters (as used in this body):
 *   Rp, Rm - read from and written to; Rp advances by VL * ms and Rm moves
 *            back by VL * ms after every iteration.
 *   Ip, Im - only stepped alongside Rp / Rm; never dereferenced here.
 *   W      - twiddle table; offset by (mb - 1) * ((TWVL / VL) * 38) on entry
 *            and advanced by TWVL * 38 per iteration.
 *   rs     - stride handed to WS() when indexing Rp / Rm.
 *   mb, me - the loop runs for m = mb, mb + VL, ... while m < me.
 *   ms     - per-iteration pointer step (scaled by VL); also passed to LD / ST.
 *
 * NOTE(review): the exact semantics of LD, ST, LDW, VZMULJ, VZMULIJ, VBYI,
 * VFMS, MAKE_VOLATILE_STRIDE and VLEAVE come from rdft/simd/hc2cfv.h and the
 * SIMD headers, which are not visible here -- confirm there.
 */
292 static void hc2cfdftv_20(R *Rp, R *Ip, R *Rm, R *Im, const R *W, stride rs, INT mb, INT me, INT ms)
293 {
/* Vector constants. KP279508497, KP125000000, KP475528258 and KP293892626
 * are numerically one half of KP559016994, KP250000000, KP951056516 and
 * KP587785252 respectively. */
294 DVK(KP293892626, +0.293892626146236564584352977319536384298826219);
295 DVK(KP475528258, +0.475528258147576786058219666689691071702849317);
296 DVK(KP559016994, +0.559016994374947424102293417182819058860154590);
297 DVK(KP250000000, +0.250000000000000000000000000000000000000000000);
298 DVK(KP125000000, +0.125000000000000000000000000000000000000000000);
299 DVK(KP279508497, +0.279508497187473712051146708591409529430077295);
300 DVK(KP587785252, +0.587785252292473129168705954639072768597652438);
301 DVK(KP951056516, +0.951056516295153572116439333379382143405698634);
302 DVK(KP500000000, +0.500000000000000000000000000000000000000000000);
303 {
304 INT m;
/* Main loop: m steps by VL; Rp / Ip walk forward and Rm / Im walk backward
 * by VL * ms, and W advances by TWVL * 38 on every iteration. */
305 for (m = mb, W = W + ((mb - 1) * ((TWVL / VL) * 38)); m < me; m = m + VL, Rp = Rp + (VL * ms), Ip = Ip + (VL * ms), Rm = Rm - (VL * ms), Im = Im - (VL * ms), W = W + (TWVL * 38), MAKE_VOLATILE_STRIDE(80, rs)) {
306 V TW, T1x, T2i, T2A, T1r, T1s, T1a, T1y, T1l, Tn, TK, TL, T1p, T1o, T27;
307 V T2t, T2a, T2u, T2e, T2C, T20, T2w, T23, T2x, T2d, T2B, T1W, T1X, T1U, T1V;
308 V T2z, T2K, T2G, T2N, T2J, T2v, T2y, T2F, T2D, T2E, T2M, T2H, T2I, T2L;
309 {
310 V T1u, T5, Tg, T1c, TV, T13, Ta, T1w, TQ, T11, TI, T1j, Tx, T18, Tl;
311 V T1e, TD, T1h, Ts, T16, T2g, T2h, T14, T19, T1f, T1k, Tb, Tm, Ty, TJ;
312 V T25, T26, T28, T29, T1Y, T1Z, T21, T22;
313 {
314 V T4, T3, T2, T1, Tf, Te, Td, Tc, T1b, TU, TT, TS, TR, T12, T9;
315 V T8, T7, T6, T1v, TP, TO, TN, TM, T10, TH, TG, TF, TE, T1i, Tw;
316 V Tv, Tu, Tt, T17, Tk, Tj, Ti, Th, T1d, TC, TB, TA, Tz, T1g, Tr;
317 V Tq, Tp, To, T15;
/* Load phase: for each index, load Rp and Rm, conjugate the Rm vector, and
 * twiddle the difference (VZMULIJ) and the sum (VZMULJ). */
318 T4 = LD(&(Rp[0]), ms, &(Rp[0]));
319 T2 = LD(&(Rm[0]), -ms, &(Rm[0]));
320 T3 = VCONJ(T2);
321 T1u = VADD(T4, T3);
322 T1 = LDW(&(W[0]));
323 T5 = VZMULIJ(T1, VSUB(T3, T4));
324 Tf = LD(&(Rp[WS(rs, 4)]), ms, &(Rp[0]));
325 Td = LD(&(Rm[WS(rs, 4)]), -ms, &(Rm[0]));
326 Te = VCONJ(Td);
327 Tc = LDW(&(W[TWVL * 16]));
328 Tg = VZMULIJ(Tc, VSUB(Te, Tf));
329 T1b = LDW(&(W[TWVL * 14]));
330 T1c = VZMULJ(T1b, VADD(Te, Tf));
331 TU = LD(&(Rp[WS(rs, 7)]), ms, &(Rp[WS(rs, 1)]));
332 TS = LD(&(Rm[WS(rs, 7)]), -ms, &(Rm[WS(rs, 1)]));
333 TT = VCONJ(TS);
334 TR = LDW(&(W[TWVL * 28]));
335 TV = VZMULIJ(TR, VSUB(TT, TU));
336 T12 = LDW(&(W[TWVL * 26]));
337 T13 = VZMULJ(T12, VADD(TT, TU));
338 T9 = LD(&(Rp[WS(rs, 5)]), ms, &(Rp[WS(rs, 1)]));
339 T7 = LD(&(Rm[WS(rs, 5)]), -ms, &(Rm[WS(rs, 1)]));
340 T8 = VCONJ(T7);
341 T6 = LDW(&(W[TWVL * 20]));
342 Ta = VZMULIJ(T6, VSUB(T8, T9));
343 T1v = LDW(&(W[TWVL * 18]));
344 T1w = VZMULJ(T1v, VADD(T9, T8));
345 TP = LD(&(Rp[WS(rs, 2)]), ms, &(Rp[0]));
346 TN = LD(&(Rm[WS(rs, 2)]), -ms, &(Rm[0]));
347 TO = VCONJ(TN);
348 TM = LDW(&(W[TWVL * 8]));
349 TQ = VZMULIJ(TM, VSUB(TO, TP));
350 T10 = LDW(&(W[TWVL * 6]));
351 T11 = VZMULJ(T10, VADD(TO, TP));
352 TH = LD(&(Rp[WS(rs, 1)]), ms, &(Rp[WS(rs, 1)]));
353 TF = LD(&(Rm[WS(rs, 1)]), -ms, &(Rm[WS(rs, 1)]));
354 TG = VCONJ(TF);
355 TE = LDW(&(W[TWVL * 4]));
356 TI = VZMULIJ(TE, VSUB(TG, TH));
357 T1i = LDW(&(W[TWVL * 2]));
358 T1j = VZMULJ(T1i, VADD(TG, TH));
359 Tw = LD(&(Rp[WS(rs, 3)]), ms, &(Rp[WS(rs, 1)]));
360 Tu = LD(&(Rm[WS(rs, 3)]), -ms, &(Rm[WS(rs, 1)]));
361 Tv = VCONJ(Tu);
362 Tt = LDW(&(W[TWVL * 12]));
363 Tx = VZMULIJ(Tt, VSUB(Tv, Tw));
364 T17 = LDW(&(W[TWVL * 10]));
365 T18 = VZMULJ(T17, VADD(Tw, Tv));
366 Tk = LD(&(Rp[WS(rs, 9)]), ms, &(Rp[WS(rs, 1)]));
367 Ti = LD(&(Rm[WS(rs, 9)]), -ms, &(Rm[WS(rs, 1)]));
368 Tj = VCONJ(Ti);
369 Th = LDW(&(W[TWVL * 36]));
370 Tl = VZMULIJ(Th, VSUB(Tj, Tk));
371 T1d = LDW(&(W[TWVL * 34]));
372 T1e = VZMULJ(T1d, VADD(Tj, Tk));
373 TC = LD(&(Rp[WS(rs, 6)]), ms, &(Rp[0]));
374 TA = LD(&(Rm[WS(rs, 6)]), -ms, &(Rm[0]));
375 TB = VCONJ(TA);
376 Tz = LDW(&(W[TWVL * 24]));
377 TD = VZMULIJ(Tz, VSUB(TB, TC));
378 T1g = LDW(&(W[TWVL * 22]));
379 T1h = VZMULJ(T1g, VADD(TB, TC));
380 Tr = LD(&(Rp[WS(rs, 8)]), ms, &(Rp[0]));
381 Tp = LD(&(Rm[WS(rs, 8)]), -ms, &(Rm[0]));
382 Tq = VCONJ(Tp);
383 To = LDW(&(W[TWVL * 32]));
384 Ts = VZMULIJ(To, VSUB(Tq, Tr));
385 T15 = LDW(&(W[TWVL * 30]));
386 T16 = VZMULJ(T15, VADD(Tr, Tq));
387 }
/* Sums and differences of the twiddled inputs; the values assigned to
 * variables declared in the enclosing scope are consumed by the store
 * phases below. */
388 TW = VSUB(TQ, TV);
389 T1x = VSUB(T1u, T1w);
390 T2g = VADD(T1u, T1w);
391 T2h = VADD(TQ, TV);
392 T2i = VADD(T2g, T2h);
393 T2A = VSUB(T2g, T2h);
394 T14 = VSUB(T11, T13);
395 T19 = VSUB(T16, T18);
396 T1r = VADD(T14, T19);
397 T1f = VSUB(T1c, T1e);
398 T1k = VSUB(T1h, T1j);
399 T1s = VADD(T1f, T1k);
400 T1a = VSUB(T14, T19);
401 T1y = VADD(T1r, T1s);
402 T1l = VSUB(T1f, T1k);
403 Tb = VSUB(T5, Ta);
404 Tm = VSUB(Tg, Tl);
405 Tn = VADD(Tb, Tm);
406 Ty = VSUB(Ts, Tx);
407 TJ = VSUB(TD, TI);
408 TK = VADD(Ty, TJ);
409 TL = VADD(Tn, TK);
410 T1p = VSUB(Ty, TJ);
411 T1o = VSUB(Tb, Tm);
412 T25 = VADD(T1c, T1e);
413 T26 = VADD(TD, TI);
414 T27 = VADD(T25, T26);
415 T2t = VSUB(T25, T26);
416 T28 = VADD(Ts, Tx);
417 T29 = VADD(T1h, T1j);
418 T2a = VADD(T28, T29);
419 T2u = VSUB(T29, T28);
420 T2e = VADD(T27, T2a);
421 T2C = VADD(T2t, T2u);
422 T1Y = VADD(T11, T13);
423 T1Z = VADD(Tg, Tl);
424 T20 = VADD(T1Y, T1Z);
425 T2w = VSUB(T1Y, T1Z);
426 T21 = VADD(T5, Ta);
427 T22 = VADD(T16, T18);
428 T23 = VADD(T21, T22);
429 T2x = VSUB(T22, T21);
430 T2d = VADD(T20, T23);
431 T2B = VADD(T2w, T2x);
432 }
/* Outputs Rp[5] and Rm[4]. */
433 T1U = VADD(T1x, T1y);
434 T1V = VBYI(VADD(TW, TL));
435 T1W = VMUL(LDK(KP500000000), VSUB(T1U, T1V));
436 T1X = VCONJ(VMUL(LDK(KP500000000), VADD(T1V, T1U)));
437 ST(&(Rp[WS(rs, 5)]), T1W, ms, &(Rp[WS(rs, 1)]));
438 ST(&(Rm[WS(rs, 4)]), T1X, -ms, &(Rm[0]));
/* Outputs Rm[9], Rm[5], Rp[2], Rm[1] and Rp[6]. */
439 T2v = VSUB(T2t, T2u);
440 T2y = VSUB(T2w, T2x);
441 T2z = VMUL(LDK(KP500000000), VBYI(VFNMS(LDK(KP587785252), T2y, VMUL(LDK(KP951056516), T2v))));
442 T2K = VMUL(LDK(KP500000000), VBYI(VFMA(LDK(KP951056516), T2y, VMUL(LDK(KP587785252), T2v))));
443 T2F = VMUL(LDK(KP279508497), VSUB(T2B, T2C));
444 T2D = VADD(T2B, T2C);
445 T2E = VFNMS(LDK(KP125000000), T2D, VMUL(LDK(KP500000000), T2A));
446 T2G = VSUB(T2E, T2F);
447 T2N = VCONJ(VMUL(LDK(KP500000000), VADD(T2A, T2D)));
448 T2J = VADD(T2F, T2E);
449 ST(&(Rm[WS(rs, 9)]), T2N, -ms, &(Rm[WS(rs, 1)]));
450 T2M = VCONJ(VADD(T2K, T2J));
451 ST(&(Rm[WS(rs, 5)]), T2M, -ms, &(Rm[WS(rs, 1)]));
452 T2H = VADD(T2z, T2G);
453 ST(&(Rp[WS(rs, 2)]), T2H, ms, &(Rp[0]));
454 T2I = VCONJ(VSUB(T2G, T2z));
455 ST(&(Rm[WS(rs, 1)]), T2I, -ms, &(Rm[WS(rs, 1)]));
456 T2L = VSUB(T2J, T2K);
457 ST(&(Rp[WS(rs, 6)]), T2L, ms, &(Rp[0]));
458 {
459 V T2c, T2p, T2l, T2s, T2o, T24, T2b, T2f, T2j, T2k, T2r, T2m, T2n, T2q, T1n;
460 V T1Q, T1E, T1K, T1B, T1R, T1F, T1N, T1m, T1J, TZ, T1I, TX, TY, T1q, T1M;
461 V T1A, T1L, T1t, T1z, T1C, T1S, T1T, T1D, T1G, T1O, T1P, T1H;
/* Outputs Rp[0], Rm[7], Rp[4], Rm[3] and Rp[8]. */
462 T24 = VSUB(T20, T23);
463 T2b = VSUB(T27, T2a);
464 T2c = VMUL(LDK(KP500000000), VBYI(VFMA(LDK(KP951056516), T24, VMUL(LDK(KP587785252), T2b))));
465 T2p = VMUL(LDK(KP500000000), VBYI(VFNMS(LDK(KP587785252), T24, VMUL(LDK(KP951056516), T2b))));
466 T2f = VMUL(LDK(KP279508497), VSUB(T2d, T2e));
467 T2j = VADD(T2d, T2e);
468 T2k = VFNMS(LDK(KP125000000), T2j, VMUL(LDK(KP500000000), T2i));
469 T2l = VADD(T2f, T2k);
470 T2s = VMUL(LDK(KP500000000), VADD(T2i, T2j));
471 T2o = VSUB(T2k, T2f);
472 ST(&(Rp[0]), T2s, ms, &(Rp[0]));
473 T2r = VCONJ(VADD(T2p, T2o));
474 ST(&(Rm[WS(rs, 7)]), T2r, -ms, &(Rm[WS(rs, 1)]));
475 T2m = VADD(T2c, T2l);
476 ST(&(Rp[WS(rs, 4)]), T2m, ms, &(Rp[0]));
477 T2n = VCONJ(VSUB(T2l, T2c));
478 ST(&(Rm[WS(rs, 3)]), T2n, -ms, &(Rm[WS(rs, 1)]));
479 T2q = VSUB(T2o, T2p);
480 ST(&(Rp[WS(rs, 8)]), T2q, ms, &(Rp[0]));
/* Outputs Rp[1], Rp[7], Rm[6], Rm[0], Rp[9], Rp[3], Rm[2] and Rm[8]. */
481 T1m = VFMA(LDK(KP951056516), T1a, VMUL(LDK(KP587785252), T1l));
482 T1J = VFNMS(LDK(KP587785252), T1a, VMUL(LDK(KP951056516), T1l));
483 TX = VFMS(LDK(KP250000000), TL, TW);
484 TY = VMUL(LDK(KP559016994), VSUB(TK, Tn));
485 TZ = VADD(TX, TY);
486 T1I = VSUB(TY, TX);
487 T1n = VMUL(LDK(KP500000000), VBYI(VSUB(TZ, T1m)));
488 T1Q = VMUL(LDK(KP500000000), VBYI(VADD(T1I, T1J)));
489 T1E = VMUL(LDK(KP500000000), VBYI(VADD(TZ, T1m)));
490 T1K = VMUL(LDK(KP500000000), VBYI(VSUB(T1I, T1J)));
491 T1q = VFMA(LDK(KP475528258), T1o, VMUL(LDK(KP293892626), T1p));
492 T1M = VFNMS(LDK(KP293892626), T1o, VMUL(LDK(KP475528258), T1p));
493 T1t = VMUL(LDK(KP279508497), VSUB(T1r, T1s));
494 T1z = VFNMS(LDK(KP125000000), T1y, VMUL(LDK(KP500000000), T1x));
495 T1A = VADD(T1t, T1z);
496 T1L = VSUB(T1z, T1t);
497 T1B = VADD(T1q, T1A);
498 T1R = VADD(T1M, T1L);
499 T1F = VSUB(T1A, T1q);
500 T1N = VSUB(T1L, T1M);
501 T1C = VADD(T1n, T1B);
502 ST(&(Rp[WS(rs, 1)]), T1C, ms, &(Rp[WS(rs, 1)]));
503 T1S = VADD(T1Q, T1R);
504 ST(&(Rp[WS(rs, 7)]), T1S, ms, &(Rp[WS(rs, 1)]));
505 T1T = VCONJ(VSUB(T1R, T1Q));
506 ST(&(Rm[WS(rs, 6)]), T1T, -ms, &(Rm[0]));
507 T1D = VCONJ(VSUB(T1B, T1n));
508 ST(&(Rm[0]), T1D, -ms, &(Rm[0]));
509 T1G = VADD(T1E, T1F);
510 ST(&(Rp[WS(rs, 9)]), T1G, ms, &(Rp[WS(rs, 1)]));
511 T1O = VADD(T1K, T1N);
512 ST(&(Rp[WS(rs, 3)]), T1O, ms, &(Rp[WS(rs, 1)]));
513 T1P = VCONJ(VSUB(T1N, T1K));
514 ST(&(Rm[WS(rs, 2)]), T1P, -ms, &(Rm[0]));
515 T1H = VCONJ(VSUB(T1F, T1E));
516 ST(&(Rm[WS(rs, 8)]), T1H, -ms, &(Rm[0]));
517 }
518 }
519 }
/* NOTE(review): VLEAVE() is defined outside this file; presumably SIMD
 * state clean-up on exit -- confirm in the SIMD support headers. */
520 VLEAVE();
521 }
522
/*
 * Twiddle-instruction table for hc2cfdftv_20 (non-FMA variant): one
 * VTW(1, k) entry for each k = 1..19, terminated by a {TW_NEXT, VL, 0}
 * record. The 19 entries match the 19 LDW twiddle loads in the kernel.
 * NOTE(review): the expansion of VTW and the meaning of its first argument
 * are defined outside this file -- confirm there.
 */
523 static const tw_instr twinstr[] = {
524 VTW(1, 1),
525 VTW(1, 2),
526 VTW(1, 3),
527 VTW(1, 4),
528 VTW(1, 5),
529 VTW(1, 6),
530 VTW(1, 7),
531 VTW(1, 8),
532 VTW(1, 9),
533 VTW(1, 10),
534 VTW(1, 11),
535 VTW(1, 12),
536 VTW(1, 13),
537 VTW(1, 14),
538 VTW(1, 15),
539 VTW(1, 16),
540 VTW(1, 17),
541 VTW(1, 18),
542 VTW(1, 19),
543 {TW_NEXT, VL, 0}
544 };
545
/*
 * Codelet descriptor (non-FMA variant): size 20, the SIMD-decorated name
 * string, the twiddle table, the GENUS record, and the operation counts
 * {131, 65, 12, 0}, which repeat the "131 additions, 65 multiplications,
 * 12 fused multiply/add" figures from the generator comment for this variant.
 */
546 static const hc2c_desc desc = { 20, XSIMD_STRING("hc2cfdftv_20"), twinstr, &GENUS, {131, 65, 12, 0} };
547
/*
 * Registration entry point (non-FMA variant): hands the hc2cfdftv_20 kernel
 * and its descriptor to planner p through X(khc2c_register), tagged
 * HC2C_VIA_DFT.
 */
548 void XSIMD(codelet_hc2cfdftv_20) (planner *p) {
549 X(khc2c_register) (p, hc2cfdftv_20, &desc, HC2C_VIA_DFT);
550 }
551 #endif