comparison src/fftw-3.3.3/rdft/simd/common/hc2cfdftv_20.c @ 10:37bf6b4a2645

Add FFTW3
author Chris Cannam
date Wed, 20 Mar 2013 15:35:50 +0000
parents
children
comparison
equal deleted inserted replaced
9:c0fb53affa76 10:37bf6b4a2645
1 /*
2 * Copyright (c) 2003, 2007-11 Matteo Frigo
3 * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
4 *
5 * This program is free software; you can redistribute it and/or modify
6 * it under the terms of the GNU General Public License as published by
7 * the Free Software Foundation; either version 2 of the License, or
8 * (at your option) any later version.
9 *
10 * This program is distributed in the hope that it will be useful,
11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 * GNU General Public License for more details.
14 *
15 * You should have received a copy of the GNU General Public License
16 * along with this program; if not, write to the Free Software
17 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
18 *
19 */
20
21 /* This file was automatically generated --- DO NOT EDIT */
22 /* Generated on Sun Nov 25 07:42:29 EST 2012 */
23
24 #include "codelet-rdft.h"
25
26 #ifdef HAVE_FMA
27
28 /* Generated by: ../../../genfft/gen_hc2cdft_c.native -fma -reorder-insns -schedule-for-pipeline -simd -compact -variables 4 -pipeline-latency 8 -trivial-stores -variables 32 -no-generate-bytw -n 20 -dit -name hc2cfdftv_20 -include hc2cfv.h */
29
30 /*
31 * This function contains 143 FP additions, 128 FP multiplications,
32 * (or, 77 additions, 62 multiplications, 66 fused multiply/add),
33 * 130 stack variables, 5 constants, and 40 memory accesses
34 */
35 #include "hc2cfv.h"
36
/*
 * hc2cfdftv_20 (HAVE_FMA variant): genfft-generated SIMD codelet of size 20
 * (generator flags include -n 20 -dit, see the "Generated by" line above).
 *
 * Rp, Rm : data arrays, read and then overwritten in place; ten vectors at
 *          index 0 and WS(rs, 1) .. WS(rs, 9) are accessed in each of them.
 * Ip, Im : only stepped by the loop header; never dereferenced in this body.
 * W      : twiddle factors; 19 vectors are read per iteration, at
 *          W[TWVL * 0], W[TWVL * 2], ..., W[TWVL * 36].
 * rs     : stride handed to WS() to address the ten elements.
 * mb, me : the loop runs m = mb, mb + VL, ... while m < me.
 * ms     : per-iteration step (Rp/Ip advance by VL * ms, Rm/Im move back by
 *          VL * ms); also passed to LD/ST.
 *
 * The statement order below was chosen by the generator's scheduler
 * (-reorder-insns, -schedule-for-pipeline); the file is marked DO NOT EDIT.
 */
37 static void hc2cfdftv_20(R *Rp, R *Ip, R *Rm, R *Im, const R *W, stride rs, INT mb, INT me, INT ms)
38 {
/* Vector constants. To the digits shown: KP559016994 = sqrt(5)/4,
 * KP951056516 = sin(2*pi/5), KP618033988 = (sqrt(5) - 1)/2. */
39 DVK(KP559016994, +0.559016994374947424102293417182819058860154590);
40 DVK(KP500000000, +0.500000000000000000000000000000000000000000000);
41 DVK(KP951056516, +0.951056516295153572116439333379382143405698634);
42 DVK(KP250000000, +0.250000000000000000000000000000000000000000000);
43 DVK(KP618033988, +0.618033988749894848204586834365638117720309180);
44 {
45 INT m;
/* W is first offset as if (mb - 1) values of m had already been consumed:
 * it advances TWVL * 38 per iteration while m advances by VL. */
46 for (m = mb, W = W + ((mb - 1) * ((TWVL / VL) * 38)); m < me; m = m + VL, Rp = Rp + (VL * ms), Ip = Ip + (VL * ms), Rm = Rm - (VL * ms), Im = Im - (VL * ms), W = W + (TWVL * 38), MAKE_VOLATILE_STRIDE(80, rs)) {
47 V T2g, T2f, T2w, T2k, T2A, T2u, T2e, T2o, T1O, T2b, T2i, T1R, T1X, T1k, TN;
48 V T1w, T1G, T1t, Ti, T2c, T12, T1x, T2j, T1U, T1y, T1d, T24, T2v, T2h, T2x;
49 V T2B, T2p, T2l, T2z, T2y, T2D, T2C, T2r, T2q, T2n, T2m;
50 {
51 V T3, T7, TC, T1Y, Tc, Tg, Tn, T1P, T1Z, Tw, T1S, TS, TY, TZ, T1Q;
52 V TL, T17, T21, TW, T19, TX, T1a, T8, T20, Th, Tx, T1u, T1v, TM, T10;
53 V T1b, T22, T11, T1T, T1c, T23;
54 {
55 V Ta, Tb, Tz, Te, TB, Tf, Tl, T9, Td, Tk, T1, T2, Ty, T5, T6;
56 V TA, T4, Tj, Tt, Tu, Ts, TQ, Tr, TP, Tp, Tq, Tm, To, TO, TG;
57 V T14, TK, T16, TE, TF, Tv, TD, T13, TR, TI, TJ, TH, T15, TU, TV;
58 V TT, T18;
/* Input stage: load each Rp[k]/Rm[k] pair, combine it with VFMACONJ and
 * VFNMSCONJ, and multiply the results by twiddles from W (VZMULJ for the
 * VFMACONJ term, VZMULIJ for the VFNMSCONJ term). Only the index-0
 * VFMACONJ term (T3) is used without a twiddle. The index-1 pair (TY, TZ)
 * and its twiddles (TX, T1a) are loaded here but multiplied just after
 * this block. */
59 T1 = LD(&(Rp[0]), ms, &(Rp[0]));
60 T2 = LD(&(Rm[0]), -ms, &(Rm[0]));
61 Ty = LDW(&(W[0]));
62 T5 = LD(&(Rp[WS(rs, 5)]), ms, &(Rp[WS(rs, 1)]));
63 T6 = LD(&(Rm[WS(rs, 5)]), -ms, &(Rm[WS(rs, 1)]));
64 TA = LDW(&(W[TWVL * 20]));
65 T4 = LDW(&(W[TWVL * 18]));
66 Ta = LD(&(Rp[WS(rs, 2)]), ms, &(Rp[0]));
67 Tb = LD(&(Rm[WS(rs, 2)]), -ms, &(Rm[0]));
68 T3 = VFMACONJ(T2, T1);
69 Tz = VZMULIJ(Ty, VFNMSCONJ(T2, T1));
70 Tj = LDW(&(W[TWVL * 6]));
71 Te = LD(&(Rp[WS(rs, 7)]), ms, &(Rp[WS(rs, 1)]));
72 TB = VZMULIJ(TA, VFNMSCONJ(T6, T5));
73 T7 = VZMULJ(T4, VFMACONJ(T6, T5));
74 Tf = LD(&(Rm[WS(rs, 7)]), -ms, &(Rm[WS(rs, 1)]));
75 Tl = LDW(&(W[TWVL * 26]));
76 T9 = LDW(&(W[TWVL * 8]));
77 Td = LDW(&(W[TWVL * 28]));
78 Tk = VZMULJ(Tj, VFMACONJ(Tb, Ta));
79 Tp = LD(&(Rp[WS(rs, 4)]), ms, &(Rp[0]));
80 TC = VADD(Tz, TB);
81 T1Y = VSUB(TB, Tz);
82 Tq = LD(&(Rm[WS(rs, 4)]), -ms, &(Rm[0]));
83 Tm = VZMULJ(Tl, VFMACONJ(Tf, Te));
84 Tc = VZMULIJ(T9, VFNMSCONJ(Tb, Ta));
85 Tg = VZMULIJ(Td, VFNMSCONJ(Tf, Te));
86 To = LDW(&(W[TWVL * 16]));
87 TO = LDW(&(W[TWVL * 14]));
88 Tt = LD(&(Rp[WS(rs, 9)]), ms, &(Rp[WS(rs, 1)]));
89 Tu = LD(&(Rm[WS(rs, 9)]), -ms, &(Rm[WS(rs, 1)]));
90 Ts = LDW(&(W[TWVL * 36]));
91 Tn = VADD(Tk, Tm);
92 T1P = VSUB(Tk, Tm);
93 TQ = LDW(&(W[TWVL * 34]));
94 Tr = VZMULIJ(To, VFNMSCONJ(Tq, Tp));
95 TP = VZMULJ(TO, VFMACONJ(Tq, Tp));
96 TE = LD(&(Rp[WS(rs, 8)]), ms, &(Rp[0]));
97 TF = LD(&(Rm[WS(rs, 8)]), -ms, &(Rm[0]));
98 Tv = VZMULIJ(Ts, VFNMSCONJ(Tu, Tt));
99 TD = LDW(&(W[TWVL * 30]));
100 T13 = LDW(&(W[TWVL * 32]));
101 TR = VZMULJ(TQ, VFMACONJ(Tu, Tt));
102 TI = LD(&(Rp[WS(rs, 3)]), ms, &(Rp[WS(rs, 1)]));
103 TJ = LD(&(Rm[WS(rs, 3)]), -ms, &(Rm[WS(rs, 1)]));
104 TH = LDW(&(W[TWVL * 10]));
105 T15 = LDW(&(W[TWVL * 12]));
106 T1Z = VSUB(Tv, Tr);
107 Tw = VADD(Tr, Tv);
108 TG = VZMULJ(TD, VFMACONJ(TF, TE));
109 T14 = VZMULIJ(T13, VFNMSCONJ(TF, TE));
110 T1S = VSUB(TP, TR);
111 TS = VADD(TP, TR);
112 TK = VZMULJ(TH, VFMACONJ(TJ, TI));
113 T16 = VZMULIJ(T15, VFNMSCONJ(TJ, TI));
114 TU = LD(&(Rp[WS(rs, 6)]), ms, &(Rp[0]));
115 TV = LD(&(Rm[WS(rs, 6)]), -ms, &(Rm[0]));
116 TT = LDW(&(W[TWVL * 24]));
117 T18 = LDW(&(W[TWVL * 22]));
118 TY = LD(&(Rp[WS(rs, 1)]), ms, &(Rp[WS(rs, 1)]));
119 TZ = LD(&(Rm[WS(rs, 1)]), -ms, &(Rm[WS(rs, 1)]));
120 T1Q = VSUB(TK, TG);
121 TL = VADD(TG, TK);
122 T17 = VADD(T14, T16);
123 T21 = VSUB(T16, T14);
124 TW = VZMULIJ(TT, VFNMSCONJ(TV, TU));
125 T19 = VZMULJ(T18, VFMACONJ(TV, TU));
126 TX = LDW(&(W[TWVL * 4]));
127 T1a = LDW(&(W[TWVL * 2]));
128 }
/* Sums and differences of the twiddled terms; the index-1 twiddle
 * multiplications (T10, T1b) are completed here. */
129 T1O = VSUB(T3, T7);
130 T8 = VADD(T3, T7);
131 T20 = VADD(T1Y, T1Z);
132 T2b = VSUB(T1Y, T1Z);
133 T2i = VADD(T1P, T1Q);
134 T1R = VSUB(T1P, T1Q);
135 Th = VADD(Tc, Tg);
136 T1X = VSUB(Tg, Tc);
137 Tx = VSUB(Tn, Tw);
138 T1u = VADD(Tn, Tw);
139 T1v = VADD(TC, TL);
140 TM = VSUB(TC, TL);
141 T10 = VZMULIJ(TX, VFNMSCONJ(TZ, TY));
142 T1b = VZMULJ(T1a, VFMACONJ(TZ, TY));
143 T1k = VADD(Tx, TM);
144 TN = VSUB(Tx, TM);
145 T22 = VSUB(T10, TW);
146 T11 = VADD(TW, T10);
147 T1T = VSUB(T1b, T19);
148 T1c = VADD(T19, T1b);
149 T1w = VADD(T1u, T1v);
150 T1G = VSUB(T1u, T1v);
151 T1t = VADD(T8, Th);
152 Ti = VSUB(T8, Th);
153 T23 = VADD(T21, T22);
154 T2c = VSUB(T21, T22);
155 T12 = VSUB(TS, T11);
156 T1x = VADD(TS, T11);
157 T2j = VADD(T1S, T1T);
158 T1U = VSUB(T1S, T1T);
159 T1y = VADD(T17, T1c);
160 T1d = VSUB(T17, T1c);
161 T2g = VSUB(T23, T20);
162 T24 = VADD(T20, T23);
163 }
164 {
165 V T2d, T2t, T29, T25, T1m, T1q, T1i, T1H, T1L, T1D, T1A, T28, T1W, T1h, T1g;
166 V T1e, T1l, T1z, T1F, T1V, T1f, T1C, T1B, T26, T27, T2a, T2s, T1j, T1p, T1K;
167 V T1E, T1n, T1o, T1s, T1r, T1I, T1J, T1N, T1M;
/* Output stage, part 1: apply the KP* multipliers and store twelve of the
 * twenty results. Every stored value is scaled by KP500000000, and every
 * value written to Rm goes through VCONJ first; Rp values do not. */
168 T2d = VFMA(LDK(KP618033988), T2c, T2b);
169 T2t = VFNMS(LDK(KP618033988), T2b, T2c);
170 T1e = VSUB(T12, T1d);
171 T1l = VADD(T12, T1d);
172 T1z = VADD(T1x, T1y);
173 T1F = VSUB(T1x, T1y);
174 T1V = VADD(T1R, T1U);
175 T29 = VSUB(T1R, T1U);
176 T2f = VFNMS(LDK(KP250000000), T24, T1X);
177 T25 = VADD(T1X, T24);
178 T1m = VMUL(LDK(KP951056516), VFMA(LDK(KP618033988), T1l, T1k));
179 T1q = VMUL(LDK(KP951056516), VFNMS(LDK(KP618033988), T1k, T1l));
180 T1i = VSUB(TN, T1e);
181 T1f = VADD(TN, T1e);
182 T1H = VMUL(LDK(KP951056516), VFNMS(LDK(KP618033988), T1G, T1F));
183 T1L = VMUL(LDK(KP951056516), VFMA(LDK(KP618033988), T1F, T1G));
184 T1D = VSUB(T1w, T1z);
185 T1A = VADD(T1w, T1z);
186 T28 = VFNMS(LDK(KP250000000), T1V, T1O);
187 T1W = VADD(T1O, T1V);
188 T1h = VFNMS(LDK(KP250000000), T1f, Ti);
189 T1g = VMUL(LDK(KP500000000), VADD(Ti, T1f));
190 T2w = VFNMS(LDK(KP618033988), T2i, T2j);
191 T2k = VFMA(LDK(KP618033988), T2j, T2i);
192 T1C = VFNMS(LDK(KP250000000), T1A, T1t);
193 T1B = VCONJ(VMUL(LDK(KP500000000), VADD(T1t, T1A)));
194 T26 = VMUL(LDK(KP500000000), VFNMSI(T25, T1W));
195 T27 = VCONJ(VMUL(LDK(KP500000000), VFMAI(T25, T1W)));
196 T2a = VFMA(LDK(KP559016994), T29, T28);
197 T2s = VFNMS(LDK(KP559016994), T29, T28);
198 ST(&(Rp[0]), T1g, ms, &(Rp[0]));
199 T1j = VFMA(LDK(KP559016994), T1i, T1h);
200 T1p = VFNMS(LDK(KP559016994), T1i, T1h);
201 ST(&(Rm[WS(rs, 9)]), T1B, -ms, &(Rm[WS(rs, 1)]));
202 T1K = VFMA(LDK(KP559016994), T1D, T1C);
203 T1E = VFNMS(LDK(KP559016994), T1D, T1C);
204 ST(&(Rm[WS(rs, 4)]), T27, -ms, &(Rm[0]));
205 ST(&(Rp[WS(rs, 5)]), T26, ms, &(Rp[WS(rs, 1)]));
206 T2A = VFMA(LDK(KP951056516), T2t, T2s);
207 T2u = VFNMS(LDK(KP951056516), T2t, T2s);
208 T2e = VFNMS(LDK(KP951056516), T2d, T2a);
209 T2o = VFMA(LDK(KP951056516), T2d, T2a);
210 T1n = VCONJ(VMUL(LDK(KP500000000), VFNMSI(T1m, T1j)));
211 T1o = VMUL(LDK(KP500000000), VFMAI(T1m, T1j));
212 T1s = VCONJ(VMUL(LDK(KP500000000), VFMAI(T1q, T1p)));
213 T1r = VMUL(LDK(KP500000000), VFNMSI(T1q, T1p));
214 T1I = VCONJ(VMUL(LDK(KP500000000), VFNMSI(T1H, T1E)));
215 T1J = VMUL(LDK(KP500000000), VFMAI(T1H, T1E));
216 T1N = VCONJ(VMUL(LDK(KP500000000), VFMAI(T1L, T1K)));
217 T1M = VMUL(LDK(KP500000000), VFNMSI(T1L, T1K));
218 ST(&(Rp[WS(rs, 4)]), T1o, ms, &(Rp[0]));
219 ST(&(Rm[WS(rs, 3)]), T1n, -ms, &(Rm[WS(rs, 1)]));
220 ST(&(Rp[WS(rs, 8)]), T1r, ms, &(Rp[0]));
221 ST(&(Rm[WS(rs, 7)]), T1s, -ms, &(Rm[WS(rs, 1)]));
222 ST(&(Rp[WS(rs, 2)]), T1J, ms, &(Rp[0]));
223 ST(&(Rm[WS(rs, 1)]), T1I, -ms, &(Rm[WS(rs, 1)]));
224 ST(&(Rp[WS(rs, 6)]), T1M, ms, &(Rp[0]));
225 ST(&(Rm[WS(rs, 5)]), T1N, -ms, &(Rm[WS(rs, 1)]));
226 }
/* Output stage, part 2: the remaining eight results, built from T2f/T2g and
 * the T2s/T2t/T2a/T2d terms prepared above, with the same KP500000000
 * scaling and VCONJ-on-Rm pattern. */
227 T2v = VFMA(LDK(KP559016994), T2g, T2f);
228 T2h = VFNMS(LDK(KP559016994), T2g, T2f);
229 T2x = VFNMS(LDK(KP951056516), T2w, T2v);
230 T2B = VFMA(LDK(KP951056516), T2w, T2v);
231 T2p = VFMA(LDK(KP951056516), T2k, T2h);
232 T2l = VFNMS(LDK(KP951056516), T2k, T2h);
233 T2z = VMUL(LDK(KP500000000), VFMAI(T2x, T2u));
234 T2y = VCONJ(VMUL(LDK(KP500000000), VFNMSI(T2x, T2u)));
235 T2D = VMUL(LDK(KP500000000), VFMAI(T2B, T2A));
236 T2C = VCONJ(VMUL(LDK(KP500000000), VFNMSI(T2B, T2A)));
237 T2r = VCONJ(VMUL(LDK(KP500000000), VFMAI(T2p, T2o)));
238 T2q = VMUL(LDK(KP500000000), VFNMSI(T2p, T2o));
239 T2n = VCONJ(VMUL(LDK(KP500000000), VFMAI(T2l, T2e)));
240 T2m = VMUL(LDK(KP500000000), VFNMSI(T2l, T2e));
241 ST(&(Rp[WS(rs, 3)]), T2z, ms, &(Rp[WS(rs, 1)]));
242 ST(&(Rm[WS(rs, 2)]), T2y, -ms, &(Rm[0]));
243 ST(&(Rp[WS(rs, 7)]), T2D, ms, &(Rp[WS(rs, 1)]));
244 ST(&(Rm[WS(rs, 6)]), T2C, -ms, &(Rm[0]));
245 ST(&(Rm[0]), T2r, -ms, &(Rm[0]));
246 ST(&(Rp[WS(rs, 1)]), T2q, ms, &(Rp[WS(rs, 1)]));
247 ST(&(Rm[WS(rs, 8)]), T2n, -ms, &(Rm[0]));
248 ST(&(Rp[WS(rs, 9)]), T2m, ms, &(Rp[WS(rs, 1)]));
249 }
250 }
/* VLEAVE is defined outside this file; presumably the SIMD epilogue hook --
 * confirm against the SIMD support headers. */
251 VLEAVE();
252 }
253
/* Twiddle instruction table referenced by desc (HAVE_FMA build): one
 * VTW(1, k) entry for each k = 1 .. 19, closed by a {TW_NEXT, VL, 0} record.
 * The 19 entries correspond in number to the 19 LDW loads per iteration of
 * hc2cfdftv_20 (W[TWVL * 0] .. W[TWVL * 36], step 2); the exact layout that
 * VTW expands to is defined outside this file. */
254 static const tw_instr twinstr[] = {
255 VTW(1, 1),
256 VTW(1, 2),
257 VTW(1, 3),
258 VTW(1, 4),
259 VTW(1, 5),
260 VTW(1, 6),
261 VTW(1, 7),
262 VTW(1, 8),
263 VTW(1, 9),
264 VTW(1, 10),
265 VTW(1, 11),
266 VTW(1, 12),
267 VTW(1, 13),
268 VTW(1, 14),
269 VTW(1, 15),
270 VTW(1, 16),
271 VTW(1, 17),
272 VTW(1, 18),
273 VTW(1, 19),
274 {TW_NEXT, VL, 0}
275 };
276
/* Codelet descriptor (HAVE_FMA build): size 20, the name string
 * "hc2cfdftv_20", the twinstr table, a pointer to GENUS, and an operation
 * count tuple whose first three values match the generated header comment
 * (77 additions, 62 multiplications, 66 fused multiply/adds). */
277 static const hc2c_desc desc = { 20, XSIMD_STRING("hc2cfdftv_20"), twinstr, &GENUS, {77, 62, 66, 0} };
278
/* Registration entry point (HAVE_FMA build): hands the hc2cfdftv_20 codelet
 * and its descriptor to planner p through X(khc2c_register), tagged
 * HC2C_VIA_DFT. */
279 void XSIMD(codelet_hc2cfdftv_20) (planner *p) {
280 X(khc2c_register) (p, hc2cfdftv_20, &desc, HC2C_VIA_DFT);
281 }
282 #else /* HAVE_FMA */
283
284 /* Generated by: ../../../genfft/gen_hc2cdft_c.native -simd -compact -variables 4 -pipeline-latency 8 -trivial-stores -variables 32 -no-generate-bytw -n 20 -dit -name hc2cfdftv_20 -include hc2cfv.h */
285
286 /*
287 * This function contains 143 FP additions, 77 FP multiplications,
288 * (or, 131 additions, 65 multiplications, 12 fused multiply/add),
289 * 141 stack variables, 9 constants, and 40 memory accesses
290 */
291 #include "hc2cfv.h"
292
/*
 * hc2cfdftv_20 (non-FMA variant): genfft-generated SIMD codelet of size 20
 * (generator flags include -n 20 -dit, see the "Generated by" line above).
 * Same signature and same Rp/Rm/W access pattern as the HAVE_FMA variant,
 * but expressed with VADD/VSUB/VMUL/VBYI plus a few VFMA/VFNMS/VFMS calls.
 *
 * Rp, Rm : data arrays, read and then overwritten in place; ten vectors at
 *          index 0 and WS(rs, 1) .. WS(rs, 9) are accessed in each of them.
 * Ip, Im : only stepped by the loop header; never dereferenced in this body.
 * W      : twiddle factors; 19 vectors are read per iteration, at
 *          W[TWVL * 0], W[TWVL * 2], ..., W[TWVL * 36].
 * rs     : stride handed to WS() to address the ten elements.
 * mb, me : the loop runs m = mb, mb + VL, ... while m < me.
 * ms     : per-iteration step (Rp/Ip advance by VL * ms, Rm/Im move back by
 *          VL * ms); also passed to LD/ST.
 *
 * The file is marked DO NOT EDIT; regenerate rather than modify by hand.
 */
293 static void hc2cfdftv_20(R *Rp, R *Ip, R *Rm, R *Im, const R *W, stride rs, INT mb, INT me, INT ms)
294 {
/* Vector constants. To the digits shown: KP951056516 = sin(2*pi/5),
 * KP587785252 = sin(pi/5), KP559016994 = sqrt(5)/4. Four of the others are
 * exactly half of a sibling: KP475528258 = KP951056516/2,
 * KP293892626 = KP587785252/2, KP279508497 = KP559016994/2,
 * KP125000000 = KP250000000/2. */
295 DVK(KP293892626, +0.293892626146236564584352977319536384298826219);
296 DVK(KP475528258, +0.475528258147576786058219666689691071702849317);
297 DVK(KP559016994, +0.559016994374947424102293417182819058860154590);
298 DVK(KP250000000, +0.250000000000000000000000000000000000000000000);
299 DVK(KP125000000, +0.125000000000000000000000000000000000000000000);
300 DVK(KP279508497, +0.279508497187473712051146708591409529430077295);
301 DVK(KP587785252, +0.587785252292473129168705954639072768597652438);
302 DVK(KP951056516, +0.951056516295153572116439333379382143405698634);
303 DVK(KP500000000, +0.500000000000000000000000000000000000000000000);
304 {
305 INT m;
/* W is first offset as if (mb - 1) values of m had already been consumed:
 * it advances TWVL * 38 per iteration while m advances by VL. */
306 for (m = mb, W = W + ((mb - 1) * ((TWVL / VL) * 38)); m < me; m = m + VL, Rp = Rp + (VL * ms), Ip = Ip + (VL * ms), Rm = Rm - (VL * ms), Im = Im - (VL * ms), W = W + (TWVL * 38), MAKE_VOLATILE_STRIDE(80, rs)) {
307 V TW, T1x, T2i, T2A, T1r, T1s, T1a, T1y, T1l, Tn, TK, TL, T1p, T1o, T27;
308 V T2t, T2a, T2u, T2e, T2C, T20, T2w, T23, T2x, T2d, T2B, T1W, T1X, T1U, T1V;
309 V T2z, T2K, T2G, T2N, T2J, T2v, T2y, T2F, T2D, T2E, T2M, T2H, T2I, T2L;
310 {
311 V T1u, T5, Tg, T1c, TV, T13, Ta, T1w, TQ, T11, TI, T1j, Tx, T18, Tl;
312 V T1e, TD, T1h, Ts, T16, T2g, T2h, T14, T19, T1f, T1k, Tb, Tm, Ty, TJ;
313 V T25, T26, T28, T29, T1Y, T1Z, T21, T22;
314 {
315 V T4, T3, T2, T1, Tf, Te, Td, Tc, T1b, TU, TT, TS, TR, T12, T9;
316 V T8, T7, T6, T1v, TP, TO, TN, TM, T10, TH, TG, TF, TE, T1i, Tw;
317 V Tv, Tu, Tt, T17, Tk, Tj, Ti, Th, T1d, TC, TB, TA, Tz, T1g, Tr;
318 V Tq, Tp, To, T15;
/* Input stage, one group of statements per index k (order: 0, 4, 7, 5, 2,
 * 1, 3, 9, 6, 8): load Rp[k] and Rm[k], conjugate the Rm value with VCONJ,
 * then multiply (conj(Rm) - Rp) by a twiddle with VZMULIJ and
 * (conj(Rm) + Rp) by another twiddle with VZMULJ. For k = 0 the sum (T1u)
 * is used without a twiddle. */
319 T4 = LD(&(Rp[0]), ms, &(Rp[0]));
320 T2 = LD(&(Rm[0]), -ms, &(Rm[0]));
321 T3 = VCONJ(T2);
322 T1u = VADD(T4, T3);
323 T1 = LDW(&(W[0]));
324 T5 = VZMULIJ(T1, VSUB(T3, T4));
325 Tf = LD(&(Rp[WS(rs, 4)]), ms, &(Rp[0]));
326 Td = LD(&(Rm[WS(rs, 4)]), -ms, &(Rm[0]));
327 Te = VCONJ(Td);
328 Tc = LDW(&(W[TWVL * 16]));
329 Tg = VZMULIJ(Tc, VSUB(Te, Tf));
330 T1b = LDW(&(W[TWVL * 14]));
331 T1c = VZMULJ(T1b, VADD(Te, Tf));
332 TU = LD(&(Rp[WS(rs, 7)]), ms, &(Rp[WS(rs, 1)]));
333 TS = LD(&(Rm[WS(rs, 7)]), -ms, &(Rm[WS(rs, 1)]));
334 TT = VCONJ(TS);
335 TR = LDW(&(W[TWVL * 28]));
336 TV = VZMULIJ(TR, VSUB(TT, TU));
337 T12 = LDW(&(W[TWVL * 26]));
338 T13 = VZMULJ(T12, VADD(TT, TU));
339 T9 = LD(&(Rp[WS(rs, 5)]), ms, &(Rp[WS(rs, 1)]));
340 T7 = LD(&(Rm[WS(rs, 5)]), -ms, &(Rm[WS(rs, 1)]));
341 T8 = VCONJ(T7);
342 T6 = LDW(&(W[TWVL * 20]));
343 Ta = VZMULIJ(T6, VSUB(T8, T9));
344 T1v = LDW(&(W[TWVL * 18]));
345 T1w = VZMULJ(T1v, VADD(T9, T8));
346 TP = LD(&(Rp[WS(rs, 2)]), ms, &(Rp[0]));
347 TN = LD(&(Rm[WS(rs, 2)]), -ms, &(Rm[0]));
348 TO = VCONJ(TN);
349 TM = LDW(&(W[TWVL * 8]));
350 TQ = VZMULIJ(TM, VSUB(TO, TP));
351 T10 = LDW(&(W[TWVL * 6]));
352 T11 = VZMULJ(T10, VADD(TO, TP));
353 TH = LD(&(Rp[WS(rs, 1)]), ms, &(Rp[WS(rs, 1)]));
354 TF = LD(&(Rm[WS(rs, 1)]), -ms, &(Rm[WS(rs, 1)]));
355 TG = VCONJ(TF);
356 TE = LDW(&(W[TWVL * 4]));
357 TI = VZMULIJ(TE, VSUB(TG, TH));
358 T1i = LDW(&(W[TWVL * 2]));
359 T1j = VZMULJ(T1i, VADD(TG, TH));
360 Tw = LD(&(Rp[WS(rs, 3)]), ms, &(Rp[WS(rs, 1)]));
361 Tu = LD(&(Rm[WS(rs, 3)]), -ms, &(Rm[WS(rs, 1)]));
362 Tv = VCONJ(Tu);
363 Tt = LDW(&(W[TWVL * 12]));
364 Tx = VZMULIJ(Tt, VSUB(Tv, Tw));
365 T17 = LDW(&(W[TWVL * 10]));
366 T18 = VZMULJ(T17, VADD(Tw, Tv));
367 Tk = LD(&(Rp[WS(rs, 9)]), ms, &(Rp[WS(rs, 1)]));
368 Ti = LD(&(Rm[WS(rs, 9)]), -ms, &(Rm[WS(rs, 1)]));
369 Tj = VCONJ(Ti);
370 Th = LDW(&(W[TWVL * 36]));
371 Tl = VZMULIJ(Th, VSUB(Tj, Tk));
372 T1d = LDW(&(W[TWVL * 34]));
373 T1e = VZMULJ(T1d, VADD(Tj, Tk));
374 TC = LD(&(Rp[WS(rs, 6)]), ms, &(Rp[0]));
375 TA = LD(&(Rm[WS(rs, 6)]), -ms, &(Rm[0]));
376 TB = VCONJ(TA);
377 Tz = LDW(&(W[TWVL * 24]));
378 TD = VZMULIJ(Tz, VSUB(TB, TC));
379 T1g = LDW(&(W[TWVL * 22]));
380 T1h = VZMULJ(T1g, VADD(TB, TC));
381 Tr = LD(&(Rp[WS(rs, 8)]), ms, &(Rp[0]));
382 Tp = LD(&(Rm[WS(rs, 8)]), -ms, &(Rm[0]));
383 Tq = VCONJ(Tp);
384 To = LDW(&(W[TWVL * 32]));
385 Ts = VZMULIJ(To, VSUB(Tq, Tr));
386 T15 = LDW(&(W[TWVL * 30]));
387 T16 = VZMULJ(T15, VADD(Tr, Tq));
388 }
/* Sums and differences of the twiddled terms, feeding the three output
 * groups below. */
389 TW = VSUB(TQ, TV);
390 T1x = VSUB(T1u, T1w);
391 T2g = VADD(T1u, T1w);
392 T2h = VADD(TQ, TV);
393 T2i = VADD(T2g, T2h);
394 T2A = VSUB(T2g, T2h);
395 T14 = VSUB(T11, T13);
396 T19 = VSUB(T16, T18);
397 T1r = VADD(T14, T19);
398 T1f = VSUB(T1c, T1e);
399 T1k = VSUB(T1h, T1j);
400 T1s = VADD(T1f, T1k);
401 T1a = VSUB(T14, T19);
402 T1y = VADD(T1r, T1s);
403 T1l = VSUB(T1f, T1k);
404 Tb = VSUB(T5, Ta);
405 Tm = VSUB(Tg, Tl);
406 Tn = VADD(Tb, Tm);
407 Ty = VSUB(Ts, Tx);
408 TJ = VSUB(TD, TI);
409 TK = VADD(Ty, TJ);
410 TL = VADD(Tn, TK);
411 T1p = VSUB(Ty, TJ);
412 T1o = VSUB(Tb, Tm);
413 T25 = VADD(T1c, T1e);
414 T26 = VADD(TD, TI);
415 T27 = VADD(T25, T26);
416 T2t = VSUB(T25, T26);
417 T28 = VADD(Ts, Tx);
418 T29 = VADD(T1h, T1j);
419 T2a = VADD(T28, T29);
420 T2u = VSUB(T29, T28);
421 T2e = VADD(T27, T2a);
422 T2C = VADD(T2t, T2u);
423 T1Y = VADD(T11, T13);
424 T1Z = VADD(Tg, Tl);
425 T20 = VADD(T1Y, T1Z);
426 T2w = VSUB(T1Y, T1Z);
427 T21 = VADD(T5, Ta);
428 T22 = VADD(T16, T18);
429 T23 = VADD(T21, T22);
430 T2x = VSUB(T22, T21);
431 T2d = VADD(T20, T23);
432 T2B = VADD(T2w, T2x);
433 }
/* Output stage: stores are interleaved with the arithmetic that produces
 * them. Every value written to Rm goes through VCONJ first; Rp values do
 * not. First group: Rp[5]/Rm[4], then Rm[9], Rm[5], Rp[2], Rm[1], Rp[6]. */
434 T1U = VADD(T1x, T1y);
435 T1V = VBYI(VADD(TW, TL));
436 T1W = VMUL(LDK(KP500000000), VSUB(T1U, T1V));
437 T1X = VCONJ(VMUL(LDK(KP500000000), VADD(T1V, T1U)));
438 ST(&(Rp[WS(rs, 5)]), T1W, ms, &(Rp[WS(rs, 1)]));
439 ST(&(Rm[WS(rs, 4)]), T1X, -ms, &(Rm[0]));
440 T2v = VSUB(T2t, T2u);
441 T2y = VSUB(T2w, T2x);
442 T2z = VMUL(LDK(KP500000000), VBYI(VFNMS(LDK(KP587785252), T2y, VMUL(LDK(KP951056516), T2v))));
443 T2K = VMUL(LDK(KP500000000), VBYI(VFMA(LDK(KP951056516), T2y, VMUL(LDK(KP587785252), T2v))));
444 T2F = VMUL(LDK(KP279508497), VSUB(T2B, T2C));
445 T2D = VADD(T2B, T2C);
446 T2E = VFNMS(LDK(KP125000000), T2D, VMUL(LDK(KP500000000), T2A));
447 T2G = VSUB(T2E, T2F);
448 T2N = VCONJ(VMUL(LDK(KP500000000), VADD(T2A, T2D)));
449 T2J = VADD(T2F, T2E);
450 ST(&(Rm[WS(rs, 9)]), T2N, -ms, &(Rm[WS(rs, 1)]));
451 T2M = VCONJ(VADD(T2K, T2J));
452 ST(&(Rm[WS(rs, 5)]), T2M, -ms, &(Rm[WS(rs, 1)]));
453 T2H = VADD(T2z, T2G);
454 ST(&(Rp[WS(rs, 2)]), T2H, ms, &(Rp[0]));
455 T2I = VCONJ(VSUB(T2G, T2z));
456 ST(&(Rm[WS(rs, 1)]), T2I, -ms, &(Rm[WS(rs, 1)]));
457 T2L = VSUB(T2J, T2K);
458 ST(&(Rp[WS(rs, 6)]), T2L, ms, &(Rp[0]));
459 {
460 V T2c, T2p, T2l, T2s, T2o, T24, T2b, T2f, T2j, T2k, T2r, T2m, T2n, T2q, T1n;
461 V T1Q, T1E, T1K, T1B, T1R, T1F, T1N, T1m, T1J, TZ, T1I, TX, TY, T1q, T1M;
462 V T1A, T1L, T1t, T1z, T1C, T1S, T1T, T1D, T1G, T1O, T1P, T1H;
/* Second group: Rp[0], Rm[7], Rp[4], Rm[3], Rp[8], built from T20/T23,
 * T27/T2a, T2d/T2e and T2i. */
463 T24 = VSUB(T20, T23);
464 T2b = VSUB(T27, T2a);
465 T2c = VMUL(LDK(KP500000000), VBYI(VFMA(LDK(KP951056516), T24, VMUL(LDK(KP587785252), T2b))));
466 T2p = VMUL(LDK(KP500000000), VBYI(VFNMS(LDK(KP587785252), T24, VMUL(LDK(KP951056516), T2b))));
467 T2f = VMUL(LDK(KP279508497), VSUB(T2d, T2e));
468 T2j = VADD(T2d, T2e);
469 T2k = VFNMS(LDK(KP125000000), T2j, VMUL(LDK(KP500000000), T2i));
470 T2l = VADD(T2f, T2k);
471 T2s = VMUL(LDK(KP500000000), VADD(T2i, T2j));
472 T2o = VSUB(T2k, T2f);
473 ST(&(Rp[0]), T2s, ms, &(Rp[0]));
474 T2r = VCONJ(VADD(T2p, T2o));
475 ST(&(Rm[WS(rs, 7)]), T2r, -ms, &(Rm[WS(rs, 1)]));
476 T2m = VADD(T2c, T2l);
477 ST(&(Rp[WS(rs, 4)]), T2m, ms, &(Rp[0]));
478 T2n = VCONJ(VSUB(T2l, T2c));
479 ST(&(Rm[WS(rs, 3)]), T2n, -ms, &(Rm[WS(rs, 1)]));
480 T2q = VSUB(T2o, T2p);
481 ST(&(Rp[WS(rs, 8)]), T2q, ms, &(Rp[0]));
/* Third group: Rp[1], Rp[7], Rm[6], Rm[0], Rp[9], Rp[3], Rm[2], Rm[8],
 * built from TW/TL/TK/Tn, T1a/T1l, T1o/T1p, T1r/T1s and T1x/T1y. */
482 T1m = VFMA(LDK(KP951056516), T1a, VMUL(LDK(KP587785252), T1l));
483 T1J = VFNMS(LDK(KP587785252), T1a, VMUL(LDK(KP951056516), T1l));
484 TX = VFMS(LDK(KP250000000), TL, TW);
485 TY = VMUL(LDK(KP559016994), VSUB(TK, Tn));
486 TZ = VADD(TX, TY);
487 T1I = VSUB(TY, TX);
488 T1n = VMUL(LDK(KP500000000), VBYI(VSUB(TZ, T1m)));
489 T1Q = VMUL(LDK(KP500000000), VBYI(VADD(T1I, T1J)));
490 T1E = VMUL(LDK(KP500000000), VBYI(VADD(TZ, T1m)));
491 T1K = VMUL(LDK(KP500000000), VBYI(VSUB(T1I, T1J)));
492 T1q = VFMA(LDK(KP475528258), T1o, VMUL(LDK(KP293892626), T1p));
493 T1M = VFNMS(LDK(KP293892626), T1o, VMUL(LDK(KP475528258), T1p));
494 T1t = VMUL(LDK(KP279508497), VSUB(T1r, T1s));
495 T1z = VFNMS(LDK(KP125000000), T1y, VMUL(LDK(KP500000000), T1x));
496 T1A = VADD(T1t, T1z);
497 T1L = VSUB(T1z, T1t);
498 T1B = VADD(T1q, T1A);
499 T1R = VADD(T1M, T1L);
500 T1F = VSUB(T1A, T1q);
501 T1N = VSUB(T1L, T1M);
502 T1C = VADD(T1n, T1B);
503 ST(&(Rp[WS(rs, 1)]), T1C, ms, &(Rp[WS(rs, 1)]));
504 T1S = VADD(T1Q, T1R);
505 ST(&(Rp[WS(rs, 7)]), T1S, ms, &(Rp[WS(rs, 1)]));
506 T1T = VCONJ(VSUB(T1R, T1Q));
507 ST(&(Rm[WS(rs, 6)]), T1T, -ms, &(Rm[0]));
508 T1D = VCONJ(VSUB(T1B, T1n));
509 ST(&(Rm[0]), T1D, -ms, &(Rm[0]));
510 T1G = VADD(T1E, T1F);
511 ST(&(Rp[WS(rs, 9)]), T1G, ms, &(Rp[WS(rs, 1)]));
512 T1O = VADD(T1K, T1N);
513 ST(&(Rp[WS(rs, 3)]), T1O, ms, &(Rp[WS(rs, 1)]));
514 T1P = VCONJ(VSUB(T1N, T1K));
515 ST(&(Rm[WS(rs, 2)]), T1P, -ms, &(Rm[0]));
516 T1H = VCONJ(VSUB(T1F, T1E));
517 ST(&(Rm[WS(rs, 8)]), T1H, -ms, &(Rm[0]));
518 }
519 }
520 }
/* VLEAVE is defined outside this file; presumably the SIMD epilogue hook --
 * confirm against the SIMD support headers. */
521 VLEAVE();
522 }
523
/* Twiddle instruction table referenced by desc (non-FMA build): one
 * VTW(1, k) entry for each k = 1 .. 19, closed by a {TW_NEXT, VL, 0} record.
 * The 19 entries correspond in number to the 19 LDW loads per iteration of
 * hc2cfdftv_20 (W[TWVL * 0] .. W[TWVL * 36], step 2); the exact layout that
 * VTW expands to is defined outside this file. */
524 static const tw_instr twinstr[] = {
525 VTW(1, 1),
526 VTW(1, 2),
527 VTW(1, 3),
528 VTW(1, 4),
529 VTW(1, 5),
530 VTW(1, 6),
531 VTW(1, 7),
532 VTW(1, 8),
533 VTW(1, 9),
534 VTW(1, 10),
535 VTW(1, 11),
536 VTW(1, 12),
537 VTW(1, 13),
538 VTW(1, 14),
539 VTW(1, 15),
540 VTW(1, 16),
541 VTW(1, 17),
542 VTW(1, 18),
543 VTW(1, 19),
544 {TW_NEXT, VL, 0}
545 };
546
/* Codelet descriptor (non-FMA build): size 20, the name string
 * "hc2cfdftv_20", the twinstr table, a pointer to GENUS, and an operation
 * count tuple whose first three values match the generated header comment
 * (131 additions, 65 multiplications, 12 fused multiply/adds). */
547 static const hc2c_desc desc = { 20, XSIMD_STRING("hc2cfdftv_20"), twinstr, &GENUS, {131, 65, 12, 0} };
548
/* Registration entry point (non-FMA build): hands the hc2cfdftv_20 codelet
 * and its descriptor to planner p through X(khc2c_register), tagged
 * HC2C_VIA_DFT. */
549 void XSIMD(codelet_hc2cfdftv_20) (planner *p) {
550 X(khc2c_register) (p, hc2cfdftv_20, &desc, HC2C_VIA_DFT);
551 }
552 #endif /* HAVE_FMA */