comparison src/fftw-3.3.3/rdft/simd/common/hc2cfdftv_32.c @ 10:37bf6b4a2645

Add FFTW3
author Chris Cannam
date Wed, 20 Mar 2013 15:35:50 +0000
parents
children
comparison
equal deleted inserted replaced
9:c0fb53affa76 10:37bf6b4a2645
1 /*
2 * Copyright (c) 2003, 2007-11 Matteo Frigo
3 * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
4 *
5 * This program is free software; you can redistribute it and/or modify
6 * it under the terms of the GNU General Public License as published by
7 * the Free Software Foundation; either version 2 of the License, or
8 * (at your option) any later version.
9 *
10 * This program is distributed in the hope that it will be useful,
11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 * GNU General Public License for more details.
14 *
15 * You should have received a copy of the GNU General Public License
16 * along with this program; if not, write to the Free Software
17 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
18 *
19 */
20
21 /* This file was automatically generated --- DO NOT EDIT */
22 /* Generated on Sun Nov 25 07:42:29 EST 2012 */
23
24 #include "codelet-rdft.h"
25
26 #ifdef HAVE_FMA
27
28 /* Generated by: ../../../genfft/gen_hc2cdft_c.native -fma -reorder-insns -schedule-for-pipeline -simd -compact -variables 4 -pipeline-latency 8 -trivial-stores -variables 32 -no-generate-bytw -n 32 -dit -name hc2cfdftv_32 -include hc2cfv.h */
29
30 /*
31 * This function contains 249 FP additions, 224 FP multiplications,
32 * (or, 119 additions, 94 multiplications, 130 fused multiply/add),
33 * 167 stack variables, 8 constants, and 64 memory accesses
34 */
35 #include "hc2cfv.h"
36
/*
 * hc2cfdftv_32, HAVE_FMA variant: genfft-generated SIMD codelet of size 32.
 *
 * Each loop iteration:
 *   - loads 16 vectors from Rp (indices 0..15 via WS(rs, k)) and 16 from Rm;
 *   - combines every Rp/Rm pair with VFMACONJ and VFNMSCONJ;
 *   - multiplies the results by twiddles loaded from W[TWVL * 2k], k = 0..30,
 *     using VZMULJ / VZMULIJ (the index-0 VFMACONJ result T3 is used as is);
 *   - runs the butterfly network;
 *   - stores 16 vectors back to Rp and 16 back to Rm, so the data is
 *     transformed in place.
 *
 * Parameters:
 *   Rp, Rm  data arrays that are both read and written; Rp walks forward
 *           and Rm walks backward by VL * ms per iteration.
 *   Ip, Im  only advanced by the loop increment, never dereferenced here.
 *   W       twiddle table; offset by (mb - 1) * ((TWVL / VL) * 62) on entry
 *           and advanced by TWVL * 62 per iteration.
 *   rs      stride between the 16 elements addressed through WS(rs, k).
 *   mb, me  half-open iteration range [mb, me), stepped by VL.
 *   ms      stride between consecutive m iterations.
 *
 * NOTE(review): the precise semantics of LD/ST/LDW and the V* macros come
 * from the SIMD support headers, which are not visible here.
 */
37 static void hc2cfdftv_32(R *Rp, R *Ip, R *Rm, R *Im, const R *W, stride rs, INT mb, INT me, INT ms)
38 {
/* Constants declared with DVK; each name encodes the leading digits of its value. */
39 DVK(KP831469612, +0.831469612302545237078788377617905756738560812);
40 DVK(KP980785280, +0.980785280403230449126182236134239036973933731);
41 DVK(KP668178637, +0.668178637919298919997757686523080761552472251);
42 DVK(KP198912367, +0.198912367379658006911597622644676228597850501);
43 DVK(KP500000000, +0.500000000000000000000000000000000000000000000);
44 DVK(KP923879532, +0.923879532511286756128183189396788286822416626);
45 DVK(KP414213562, +0.414213562373095048801688724209698078569671875);
46 DVK(KP707106781, +0.707106781186547524400844362104849039284835938);
47 {
48 INT m;
/* One pass per group of VL values of m; all five pointers are stepped in the increment clause. */
49 for (m = mb, W = W + ((mb - 1) * ((TWVL / VL) * 62)); m < me; m = m + VL, Rp = Rp + (VL * ms), Ip = Ip + (VL * ms), Rm = Rm - (VL * ms), Im = Im - (VL * ms), W = W + (TWVL * 62), MAKE_VOLATILE_STRIDE(128, rs)) {
50 V T2m, T2b, T2c, T2d, T2v, T2r, T20, T2i, T2n, T2e, T2o, T2u, T2j, T2f, T2t;
51 V T2s, T2x, T2w, T2l, T2k, T2h, T2g;
52 {
53 V T41, T3B, T40, T3a, T2J, T27, T2y, Ts, T2C, T1X, T2B, T1Q, T3F, T3w, T4l;
54 V T49, T1b, T1s, T3c, TB, T1f, T3g, T44, T1l, T3k, T3o, T4b, T28, T14, T1d;
55 V T3b, TK;
56 {
57 V T1V, T1E, T3A, Th, T3v, T47, T1J, T3q, T8, T38, T25, T39, T3z, Tq, T1O;
58 V T3r, T3, T7, T3u, T24, T22, T3t, T1I, Tn, T1G, To, Tm, T1K, Tl, T1N;
59 V Tp, T1L, TU, T3f, T3m, T13, T3e, T3n, T1i, TH, TI, T1k, TG, TF, T1c;
60 V TJ;
61 {
62 V T1x, T1y, T1U, T1B, T1S, T1C, T1A, T23, T21, T1z, T1, T2, T1T, T5, T6;
63 V T1R, T4, T1w, Ta, Tb, T1H, Te, Tf, Td, Tc, T1F, T9, T1D, Tj, Tk;
64 V Ti, Tg, T1M;
/* Even-indexed inputs (0, 2, ..., 14) and their twiddles. */
65 T1 = LD(&(Rp[0]), ms, &(Rp[0]));
66 T2 = LD(&(Rm[0]), -ms, &(Rm[0]));
67 T1T = LDW(&(W[0]));
68 T5 = LD(&(Rp[WS(rs, 8)]), ms, &(Rp[0]));
69 T6 = LD(&(Rm[WS(rs, 8)]), -ms, &(Rm[0]));
70 T1R = LDW(&(W[TWVL * 32]));
71 T4 = LDW(&(W[TWVL * 30]));
72 T1x = LD(&(Rp[WS(rs, 12)]), ms, &(Rp[0]));
73 T1y = LD(&(Rm[WS(rs, 12)]), -ms, &(Rm[0]));
74 T3 = VFMACONJ(T2, T1);
75 T1U = VZMULIJ(T1T, VFNMSCONJ(T2, T1));
76 T1w = LDW(&(W[TWVL * 48]));
77 T1B = LD(&(Rp[WS(rs, 4)]), ms, &(Rp[0]));
78 T1S = VZMULIJ(T1R, VFNMSCONJ(T6, T5));
79 T7 = VZMULJ(T4, VFMACONJ(T6, T5));
80 T1C = LD(&(Rm[WS(rs, 4)]), -ms, &(Rm[0]));
81 T1A = LDW(&(W[TWVL * 16]));
82 T23 = LDW(&(W[TWVL * 46]));
83 T21 = LDW(&(W[TWVL * 14]));
84 T1z = VZMULIJ(T1w, VFNMSCONJ(T1y, T1x));
85 Ta = LD(&(Rp[WS(rs, 2)]), ms, &(Rp[0]));
86 T3u = VADD(T1U, T1S);
87 T1V = VSUB(T1S, T1U);
88 Tb = LD(&(Rm[WS(rs, 2)]), -ms, &(Rm[0]));
89 T9 = LDW(&(W[TWVL * 6]));
90 T1D = VZMULIJ(T1A, VFNMSCONJ(T1C, T1B));
91 T24 = VZMULJ(T23, VFMACONJ(T1y, T1x));
92 T22 = VZMULJ(T21, VFMACONJ(T1C, T1B));
93 T1H = LDW(&(W[TWVL * 8]));
94 Te = LD(&(Rp[WS(rs, 10)]), ms, &(Rp[0]));
95 Tf = LD(&(Rm[WS(rs, 10)]), -ms, &(Rm[0]));
96 Td = LDW(&(W[TWVL * 38]));
97 Tc = VZMULJ(T9, VFMACONJ(Tb, Ta));
98 T1E = VSUB(T1z, T1D);
99 T3t = VADD(T1D, T1z);
100 T1F = LDW(&(W[TWVL * 40]));
101 Tj = LD(&(Rp[WS(rs, 14)]), ms, &(Rp[0]));
102 T1I = VZMULIJ(T1H, VFNMSCONJ(Tb, Ta));
103 Tk = LD(&(Rm[WS(rs, 14)]), -ms, &(Rm[0]));
104 Ti = LDW(&(W[TWVL * 54]));
105 Tg = VZMULJ(Td, VFMACONJ(Tf, Te));
106 T1M = LDW(&(W[TWVL * 56]));
107 Tn = LD(&(Rp[WS(rs, 6)]), ms, &(Rp[0]));
108 T1G = VZMULIJ(T1F, VFNMSCONJ(Tf, Te));
109 To = LD(&(Rm[WS(rs, 6)]), -ms, &(Rm[0]));
110 Tm = LDW(&(W[TWVL * 22]));
111 T1K = LDW(&(W[TWVL * 24]));
112 Tl = VZMULJ(Ti, VFMACONJ(Tk, Tj));
113 T3A = VADD(Tc, Tg);
114 Th = VSUB(Tc, Tg);
115 T1N = VZMULIJ(T1M, VFNMSCONJ(Tk, Tj));
116 }
117 T3v = VSUB(T3t, T3u);
118 T47 = VADD(T3u, T3t);
119 T1J = VSUB(T1G, T1I);
120 T3q = VADD(T1I, T1G);
121 Tp = VZMULJ(Tm, VFMACONJ(To, Tn));
122 T1L = VZMULIJ(T1K, VFNMSCONJ(To, Tn));
123 T8 = VSUB(T3, T7);
124 T38 = VADD(T3, T7);
125 T25 = VSUB(T22, T24);
126 T39 = VADD(T22, T24);
127 T3z = VADD(Tl, Tp);
128 Tq = VSUB(Tl, Tp);
129 T1O = VSUB(T1L, T1N);
130 T3r = VADD(T1N, T1L);
131 {
132 V T10, T11, TZ, T1o, TY, T1r, TN, TO, TM, T19, TR, TS, TQ, T17, T26;
133 V Tr, T1W, T1P, T3s, T48, TW, TX, TP, T1a, TV, T1q, TT, T18, Ty, Tz;
134 V Tx, Tw, T1j, Tu, T12, T1p, Tv, Tt, T1h, TD, TA, TE, TC, T1e;
/* Odd-indexed inputs (1, 3, ..., 15), interleaved with arithmetic on the even-indexed results. */
135 TN = LD(&(Rp[WS(rs, 3)]), ms, &(Rp[WS(rs, 1)]));
136 TO = LD(&(Rm[WS(rs, 3)]), -ms, &(Rm[WS(rs, 1)]));
137 T41 = VADD(T3A, T3z);
138 T3B = VSUB(T3z, T3A);
139 T26 = VSUB(Tq, Th);
140 Tr = VADD(Th, Tq);
141 T1W = VADD(T1J, T1O);
142 T1P = VSUB(T1J, T1O);
143 T3s = VSUB(T3q, T3r);
144 T48 = VADD(T3q, T3r);
145 T40 = VADD(T38, T39);
146 T3a = VSUB(T38, T39);
147 T2J = VFNMS(LDK(KP707106781), T26, T25);
148 T27 = VFMA(LDK(KP707106781), T26, T25);
149 T2y = VFMA(LDK(KP707106781), Tr, T8);
150 Ts = VFNMS(LDK(KP707106781), Tr, T8);
151 T2C = VFMA(LDK(KP707106781), T1W, T1V);
152 T1X = VFNMS(LDK(KP707106781), T1W, T1V);
153 T2B = VFMA(LDK(KP707106781), T1P, T1E);
154 T1Q = VFNMS(LDK(KP707106781), T1P, T1E);
155 T3F = VFMA(LDK(KP414213562), T3s, T3v);
156 T3w = VFNMS(LDK(KP414213562), T3v, T3s);
157 T4l = VSUB(T48, T47);
158 T49 = VADD(T47, T48);
159 TM = LDW(&(W[TWVL * 10]));
160 T19 = LDW(&(W[TWVL * 12]));
161 TR = LD(&(Rp[WS(rs, 11)]), ms, &(Rp[WS(rs, 1)]));
162 TS = LD(&(Rm[WS(rs, 11)]), -ms, &(Rm[WS(rs, 1)]));
163 TQ = LDW(&(W[TWVL * 42]));
164 T17 = LDW(&(W[TWVL * 44]));
165 TW = LD(&(Rp[WS(rs, 15)]), ms, &(Rp[WS(rs, 1)]));
166 TX = LD(&(Rm[WS(rs, 15)]), -ms, &(Rm[WS(rs, 1)]));
167 TP = VZMULJ(TM, VFMACONJ(TO, TN));
168 T1a = VZMULIJ(T19, VFNMSCONJ(TO, TN));
169 TV = LDW(&(W[TWVL * 58]));
170 T1q = LDW(&(W[TWVL * 60]));
171 TT = VZMULJ(TQ, VFMACONJ(TS, TR));
172 T18 = VZMULIJ(T17, VFNMSCONJ(TS, TR));
173 T10 = LD(&(Rp[WS(rs, 7)]), ms, &(Rp[WS(rs, 1)]));
174 T11 = LD(&(Rm[WS(rs, 7)]), -ms, &(Rm[WS(rs, 1)]));
175 TZ = LDW(&(W[TWVL * 26]));
176 T1o = LDW(&(W[TWVL * 28]));
177 TY = VZMULJ(TV, VFMACONJ(TX, TW));
178 T1r = VZMULIJ(T1q, VFNMSCONJ(TX, TW));
179 TU = VSUB(TP, TT);
180 T3f = VADD(TP, TT);
181 T1b = VSUB(T18, T1a);
182 T3m = VADD(T1a, T18);
183 Tu = LD(&(Rp[WS(rs, 5)]), ms, &(Rp[WS(rs, 1)]));
184 T12 = VZMULJ(TZ, VFMACONJ(T11, T10));
185 T1p = VZMULIJ(T1o, VFNMSCONJ(T11, T10));
186 Tv = LD(&(Rm[WS(rs, 5)]), -ms, &(Rm[WS(rs, 1)]));
187 Tt = LDW(&(W[TWVL * 18]));
188 T1h = LDW(&(W[TWVL * 20]));
189 Ty = LD(&(Rp[WS(rs, 13)]), ms, &(Rp[WS(rs, 1)]));
190 Tz = LD(&(Rm[WS(rs, 13)]), -ms, &(Rm[WS(rs, 1)]));
191 Tx = LDW(&(W[TWVL * 50]));
192 T13 = VSUB(TY, T12);
193 T3e = VADD(TY, T12);
194 T1s = VSUB(T1p, T1r);
195 T3n = VADD(T1r, T1p);
196 Tw = VZMULJ(Tt, VFMACONJ(Tv, Tu));
197 T1i = VZMULIJ(T1h, VFNMSCONJ(Tv, Tu));
198 T1j = LDW(&(W[TWVL * 52]));
199 TD = LD(&(Rp[WS(rs, 1)]), ms, &(Rp[WS(rs, 1)]));
200 TA = VZMULJ(Tx, VFMACONJ(Tz, Ty));
201 TE = LD(&(Rm[WS(rs, 1)]), -ms, &(Rm[WS(rs, 1)]));
202 TC = LDW(&(W[TWVL * 2]));
203 T1e = LDW(&(W[TWVL * 4]));
204 TH = LD(&(Rp[WS(rs, 9)]), ms, &(Rp[WS(rs, 1)]));
205 TI = LD(&(Rm[WS(rs, 9)]), -ms, &(Rm[WS(rs, 1)]));
206 T1k = VZMULIJ(T1j, VFNMSCONJ(Tz, Ty));
207 TG = LDW(&(W[TWVL * 34]));
208 T3c = VADD(Tw, TA);
209 TB = VSUB(Tw, TA);
210 TF = VZMULJ(TC, VFMACONJ(TE, TD));
211 T1f = VZMULIJ(T1e, VFNMSCONJ(TE, TD));
212 T1c = LDW(&(W[TWVL * 36]));
213 }
214 T3g = VSUB(T3e, T3f);
215 T44 = VADD(T3e, T3f);
216 T1l = VSUB(T1i, T1k);
217 T3k = VADD(T1i, T1k);
218 TJ = VZMULJ(TG, VFMACONJ(TI, TH));
219 T3o = VSUB(T3m, T3n);
220 T4b = VADD(T3n, T3m);
221 T28 = VFMA(LDK(KP414213562), TU, T13);
222 T14 = VFNMS(LDK(KP414213562), T13, TU);
223 T1d = VZMULIJ(T1c, VFNMSCONJ(TI, TH));
224 T3b = VADD(TF, TJ);
225 TK = VSUB(TF, TJ);
226 }
227 {
228 V T4k, T4p, T2z, T2a, T2K, T15, T2E, T1n, T2F, T1u, T4c, T3R, T3D, T3i, T3O;
229 V T46, T4g, T3G, T3P, T3S, T3x, T4q, T4n, T42, T1g, T3j, T3E, T3p, T4m, T3d;
230 V T43, T29, TL, T1m, T1t, T3l, T4a, T3C, T3h, T45, T3Q, T3W, T4d, T4h, T3H;
231 V T3L, T3y, T3K, T4r, T4v, T4o, T4u, T4j, T4i, T4e, T4f, T3N, T3M, T3I, T3J;
232 V T4x, T4w, T4s, T4t;
233 T42 = VADD(T40, T41);
234 T4k = VSUB(T40, T41);
235 T1g = VSUB(T1d, T1f);
236 T3j = VADD(T1f, T1d);
237 T3d = VSUB(T3b, T3c);
238 T43 = VADD(T3b, T3c);
239 T29 = VFNMS(LDK(KP414213562), TB, TK);
240 TL = VFMA(LDK(KP414213562), TK, TB);
241 T1m = VSUB(T1g, T1l);
242 T1t = VADD(T1g, T1l);
243 T3l = VSUB(T3j, T3k);
244 T4a = VADD(T3j, T3k);
245 T3C = VSUB(T3g, T3d);
246 T3h = VADD(T3d, T3g);
247 T45 = VADD(T43, T44);
248 T4p = VSUB(T44, T43);
249 T2z = VADD(T29, T28);
250 T2a = VSUB(T28, T29);
251 T2K = VADD(TL, T14);
252 T15 = VSUB(TL, T14);
253 T2E = VFMA(LDK(KP707106781), T1m, T1b);
254 T1n = VFNMS(LDK(KP707106781), T1m, T1b);
255 T2F = VFMA(LDK(KP707106781), T1t, T1s);
256 T1u = VFNMS(LDK(KP707106781), T1t, T1s);
257 T3E = VFNMS(LDK(KP414213562), T3l, T3o);
258 T3p = VFMA(LDK(KP414213562), T3o, T3l);
259 T4m = VSUB(T4a, T4b);
260 T4c = VADD(T4a, T4b);
261 T3R = VFMA(LDK(KP707106781), T3C, T3B);
262 T3D = VFNMS(LDK(KP707106781), T3C, T3B);
263 T3i = VFNMS(LDK(KP707106781), T3h, T3a);
264 T3O = VFMA(LDK(KP707106781), T3h, T3a);
265 T46 = VSUB(T42, T45);
266 T4g = VADD(T42, T45);
267 T3G = VSUB(T3E, T3F);
268 T3P = VADD(T3F, T3E);
269 T3S = VADD(T3w, T3p);
270 T3x = VSUB(T3p, T3w);
271 T4q = VSUB(T4m, T4l);
272 T4n = VADD(T4l, T4m);
273 T4d = VSUB(T49, T4c);
274 T4h = VADD(T49, T4c);
275 T3H = VFNMS(LDK(KP923879532), T3G, T3D);
276 T3L = VFMA(LDK(KP923879532), T3G, T3D);
277 T3y = VFMA(LDK(KP923879532), T3x, T3i);
278 T3K = VFNMS(LDK(KP923879532), T3x, T3i);
279 T4r = VFMA(LDK(KP707106781), T4q, T4p);
280 T4v = VFNMS(LDK(KP707106781), T4q, T4p);
281 T4o = VFMA(LDK(KP707106781), T4n, T4k);
282 T4u = VFNMS(LDK(KP707106781), T4n, T4k);
283 T3Q = VFMA(LDK(KP923879532), T3P, T3O);
284 T3W = VFNMS(LDK(KP923879532), T3P, T3O);
/* Every output below is multiplied by KP500000000; values destined for Rm are also wrapped in VCONJ. */
285 T4j = VCONJ(VMUL(LDK(KP500000000), VADD(T4h, T4g)));
286 T4i = VMUL(LDK(KP500000000), VSUB(T4g, T4h));
287 T4e = VMUL(LDK(KP500000000), VFMAI(T4d, T46));
288 T4f = VCONJ(VMUL(LDK(KP500000000), VFNMSI(T4d, T46)));
289 T3N = VMUL(LDK(KP500000000), VFMAI(T3L, T3K));
290 T3M = VCONJ(VMUL(LDK(KP500000000), VFNMSI(T3L, T3K)));
291 T3I = VMUL(LDK(KP500000000), VFNMSI(T3H, T3y));
292 T3J = VCONJ(VMUL(LDK(KP500000000), VFMAI(T3H, T3y)));
293 T4x = VCONJ(VMUL(LDK(KP500000000), VFMAI(T4v, T4u)));
294 T4w = VMUL(LDK(KP500000000), VFNMSI(T4v, T4u));
295 T4s = VCONJ(VMUL(LDK(KP500000000), VFNMSI(T4r, T4o)));
296 T4t = VMUL(LDK(KP500000000), VFMAI(T4r, T4o));
297 ST(&(Rp[0]), T4i, ms, &(Rp[0]));
298 ST(&(Rm[WS(rs, 15)]), T4j, -ms, &(Rm[WS(rs, 1)]));
299 ST(&(Rm[WS(rs, 7)]), T4f, -ms, &(Rm[WS(rs, 1)]));
300 ST(&(Rp[WS(rs, 8)]), T4e, ms, &(Rp[0]));
301 ST(&(Rm[WS(rs, 9)]), T3M, -ms, &(Rm[WS(rs, 1)]));
302 ST(&(Rp[WS(rs, 10)]), T3N, ms, &(Rp[0]));
303 ST(&(Rm[WS(rs, 5)]), T3J, -ms, &(Rm[WS(rs, 1)]));
304 ST(&(Rp[WS(rs, 6)]), T3I, ms, &(Rp[0]));
305 ST(&(Rp[WS(rs, 12)]), T4w, ms, &(Rp[0]));
306 ST(&(Rm[WS(rs, 11)]), T4x, -ms, &(Rm[WS(rs, 1)]));
307 ST(&(Rp[WS(rs, 4)]), T4t, ms, &(Rp[0]));
308 ST(&(Rm[WS(rs, 3)]), T4s, -ms, &(Rm[WS(rs, 1)]));
309 {
310 V T2A, T2W, T2L, T2Z, T2D, T2N, T2M, T2G, T3T, T3X, T16, T2p, T1v, T35, T31;
311 V T2I, T2S, T34, T2Y, T2P, T2T, T1Y, T2H, T30, T3Z, T3Y, T3U, T3V, T2O, T2X;
312 V T32, T33, T36, T37, T2U, T2V, T2Q, T2R, T1Z, T2q;
313 T2A = VFNMS(LDK(KP923879532), T2z, T2y);
314 T2W = VFMA(LDK(KP923879532), T2z, T2y);
315 T2L = VFNMS(LDK(KP923879532), T2K, T2J);
316 T2Z = VFMA(LDK(KP923879532), T2K, T2J);
317 T2D = VFMA(LDK(KP198912367), T2C, T2B);
318 T2N = VFNMS(LDK(KP198912367), T2B, T2C);
319 T2M = VFMA(LDK(KP198912367), T2E, T2F);
320 T2G = VFNMS(LDK(KP198912367), T2F, T2E);
321 T3T = VFMA(LDK(KP923879532), T3S, T3R);
322 T3X = VFNMS(LDK(KP923879532), T3S, T3R);
323 T16 = VFNMS(LDK(KP923879532), T15, Ts);
324 T2m = VFMA(LDK(KP923879532), T15, Ts);
325 T2H = VSUB(T2D, T2G);
326 T30 = VADD(T2D, T2G);
327 T2b = VFNMS(LDK(KP923879532), T2a, T27);
328 T2p = VFMA(LDK(KP923879532), T2a, T27);
329 T1v = VFMA(LDK(KP668178637), T1u, T1n);
330 T2c = VFNMS(LDK(KP668178637), T1n, T1u);
331 T3Z = VCONJ(VMUL(LDK(KP500000000), VFMAI(T3X, T3W)));
332 T3Y = VMUL(LDK(KP500000000), VFNMSI(T3X, T3W));
333 T3U = VCONJ(VMUL(LDK(KP500000000), VFNMSI(T3T, T3Q)));
334 T3V = VMUL(LDK(KP500000000), VFMAI(T3T, T3Q));
335 T2O = VSUB(T2M, T2N);
336 T2X = VADD(T2N, T2M);
337 T35 = VFNMS(LDK(KP980785280), T30, T2Z);
338 T31 = VFMA(LDK(KP980785280), T30, T2Z);
339 T2I = VFMA(LDK(KP980785280), T2H, T2A);
340 T2S = VFNMS(LDK(KP980785280), T2H, T2A);
341 ST(&(Rp[WS(rs, 14)]), T3Y, ms, &(Rp[0]));
342 ST(&(Rm[WS(rs, 13)]), T3Z, -ms, &(Rm[WS(rs, 1)]));
343 ST(&(Rp[WS(rs, 2)]), T3V, ms, &(Rp[0]));
344 ST(&(Rm[WS(rs, 1)]), T3U, -ms, &(Rm[WS(rs, 1)]));
345 T34 = VFNMS(LDK(KP980785280), T2X, T2W);
346 T2Y = VFMA(LDK(KP980785280), T2X, T2W);
347 T2P = VFMA(LDK(KP980785280), T2O, T2L);
348 T2T = VFNMS(LDK(KP980785280), T2O, T2L);
349 T2d = VFMA(LDK(KP668178637), T1Q, T1X);
350 T1Y = VFNMS(LDK(KP668178637), T1X, T1Q);
351 T32 = VMUL(LDK(KP500000000), VFNMSI(T31, T2Y));
352 T33 = VCONJ(VMUL(LDK(KP500000000), VFMAI(T31, T2Y)));
353 T36 = VCONJ(VMUL(LDK(KP500000000), VFNMSI(T35, T34)));
354 T37 = VMUL(LDK(KP500000000), VFMAI(T35, T34));
355 T2U = VMUL(LDK(KP500000000), VFNMSI(T2T, T2S));
356 T2V = VCONJ(VMUL(LDK(KP500000000), VFMAI(T2T, T2S)));
357 T2Q = VCONJ(VMUL(LDK(KP500000000), VFNMSI(T2P, T2I)));
358 T2R = VMUL(LDK(KP500000000), VFMAI(T2P, T2I));
359 T1Z = VSUB(T1v, T1Y);
360 T2q = VADD(T1Y, T1v);
361 ST(&(Rm[0]), T33, -ms, &(Rm[0]));
362 ST(&(Rp[WS(rs, 1)]), T32, ms, &(Rp[WS(rs, 1)]));
363 ST(&(Rp[WS(rs, 15)]), T37, ms, &(Rp[WS(rs, 1)]));
364 ST(&(Rm[WS(rs, 14)]), T36, -ms, &(Rm[0]));
365 ST(&(Rm[WS(rs, 8)]), T2V, -ms, &(Rm[0]));
366 ST(&(Rp[WS(rs, 9)]), T2U, ms, &(Rp[WS(rs, 1)]));
367 ST(&(Rp[WS(rs, 7)]), T2R, ms, &(Rp[WS(rs, 1)]));
368 ST(&(Rm[WS(rs, 6)]), T2Q, -ms, &(Rm[0]));
369 T2v = VFNMS(LDK(KP831469612), T2q, T2p);
370 T2r = VFMA(LDK(KP831469612), T2q, T2p);
371 T20 = VFMA(LDK(KP831469612), T1Z, T16);
372 T2i = VFNMS(LDK(KP831469612), T1Z, T16);
373 }
374 }
375 }
/* Final eight outputs, built from the temporaries declared at the top of the loop body. */
376 T2n = VADD(T2d, T2c);
377 T2e = VSUB(T2c, T2d);
378 T2o = VFMA(LDK(KP831469612), T2n, T2m);
379 T2u = VFNMS(LDK(KP831469612), T2n, T2m);
380 T2j = VFMA(LDK(KP831469612), T2e, T2b);
381 T2f = VFNMS(LDK(KP831469612), T2e, T2b);
382 T2t = VCONJ(VMUL(LDK(KP500000000), VFNMSI(T2r, T2o)));
383 T2s = VMUL(LDK(KP500000000), VFMAI(T2r, T2o));
384 T2x = VCONJ(VMUL(LDK(KP500000000), VFMAI(T2v, T2u)));
385 T2w = VMUL(LDK(KP500000000), VFNMSI(T2v, T2u));
386 T2l = VCONJ(VMUL(LDK(KP500000000), VFNMSI(T2j, T2i)));
387 T2k = VMUL(LDK(KP500000000), VFMAI(T2j, T2i));
388 T2h = VCONJ(VMUL(LDK(KP500000000), VFMAI(T2f, T20)));
389 T2g = VMUL(LDK(KP500000000), VFNMSI(T2f, T20));
390 ST(&(Rm[WS(rs, 2)]), T2t, -ms, &(Rm[0]));
391 ST(&(Rp[WS(rs, 3)]), T2s, ms, &(Rp[WS(rs, 1)]));
392 ST(&(Rm[WS(rs, 12)]), T2x, -ms, &(Rm[0]));
393 ST(&(Rp[WS(rs, 13)]), T2w, ms, &(Rp[WS(rs, 1)]));
394 ST(&(Rm[WS(rs, 10)]), T2l, -ms, &(Rm[0]));
395 ST(&(Rp[WS(rs, 11)]), T2k, ms, &(Rp[WS(rs, 1)]));
396 ST(&(Rm[WS(rs, 4)]), T2h, -ms, &(Rm[0]));
397 ST(&(Rp[WS(rs, 5)]), T2g, ms, &(Rp[WS(rs, 1)]));
398 }
399 }
400 VLEAVE();
401 }
402
/*
 * Twiddle instruction table for the HAVE_FMA build: one VTW(1, k) entry for
 * each k = 1..31, terminated by a {TW_NEXT, VL, 0} entry. It is referenced by
 * the codelet descriptor `desc`. The 31 entries match the 31 twiddle vectors
 * the kernel loads from W per iteration.
 * NOTE(review): VTW and TW_NEXT are defined outside this file.
 */
403 static const tw_instr twinstr[] = {
404 VTW(1, 1),
405 VTW(1, 2),
406 VTW(1, 3),
407 VTW(1, 4),
408 VTW(1, 5),
409 VTW(1, 6),
410 VTW(1, 7),
411 VTW(1, 8),
412 VTW(1, 9),
413 VTW(1, 10),
414 VTW(1, 11),
415 VTW(1, 12),
416 VTW(1, 13),
417 VTW(1, 14),
418 VTW(1, 15),
419 VTW(1, 16),
420 VTW(1, 17),
421 VTW(1, 18),
422 VTW(1, 19),
423 VTW(1, 20),
424 VTW(1, 21),
425 VTW(1, 22),
426 VTW(1, 23),
427 VTW(1, 24),
428 VTW(1, 25),
429 VTW(1, 26),
430 VTW(1, 27),
431 VTW(1, 28),
432 VTW(1, 29),
433 VTW(1, 30),
434 VTW(1, 31),
435 {TW_NEXT, VL, 0}
436 };
437
/*
 * Codelet descriptor for the HAVE_FMA build: size 32, the codelet name string,
 * the twiddle table, &GENUS, and the counts {119, 94, 130, 0}. The first three
 * counts equal the additions / multiplications / fused multiply-adds quoted in
 * the generator comment above the kernel.
 * NOTE(review): the meaning of the fourth count (0) is not shown here.
 */
438 static const hc2c_desc desc = { 32, XSIMD_STRING("hc2cfdftv_32"), twinstr, &GENUS, {119, 94, 130, 0} };
439
/*
 * Registration entry point (HAVE_FMA build): hands the kernel hc2cfdftv_32 and
 * its descriptor to planner p through X(khc2c_register), tagged HC2C_VIA_DFT.
 */
440 void XSIMD(codelet_hc2cfdftv_32) (planner *p) {
441 X(khc2c_register) (p, hc2cfdftv_32, &desc, HC2C_VIA_DFT);
442 }
443 #else /* HAVE_FMA */
444
445 /* Generated by: ../../../genfft/gen_hc2cdft_c.native -simd -compact -variables 4 -pipeline-latency 8 -trivial-stores -variables 32 -no-generate-bytw -n 32 -dit -name hc2cfdftv_32 -include hc2cfv.h */
446
447 /*
448 * This function contains 249 FP additions, 133 FP multiplications,
449 * (or, 233 additions, 117 multiplications, 16 fused multiply/add),
450 * 130 stack variables, 9 constants, and 64 memory accesses
451 */
452 #include "hc2cfv.h"
453
/*
 * hc2cfdftv_32, non-FMA variant: genfft-generated SIMD codelet of size 32.
 * It is compiled only when HAVE_FMA is not defined and has the same signature
 * as the FMA variant.
 *
 * Each loop iteration:
 *   - loads 16 vectors from Rp (indices 0..15 via WS(rs, k)) and 16 from Rm;
 *   - conjugates each Rm vector with VCONJ and forms its sum and difference
 *     with the matching Rp vector;
 *   - multiplies those by twiddles loaded from W[TWVL * 2k], k = 0..30, using
 *     VZMULJ / VZMULIJ (the index-0 sum T4 is used as is);
 *   - runs the butterfly network;
 *   - stores 16 vectors back to Rp and 16 VCONJ-wrapped vectors back to Rm,
 *     so the data is transformed in place.
 *
 * Parameters:
 *   Rp, Rm  data arrays that are both read and written; Rp walks forward
 *           and Rm walks backward by VL * ms per iteration.
 *   Ip, Im  only advanced by the loop increment, never dereferenced here.
 *   W       twiddle table; offset by (mb - 1) * ((TWVL / VL) * 62) on entry
 *           and advanced by TWVL * 62 per iteration.
 *   rs      stride between the 16 elements addressed through WS(rs, k).
 *   mb, me  half-open iteration range [mb, me), stepped by VL.
 *   ms      stride between consecutive m iterations.
 *
 * NOTE(review): the precise semantics of LD/ST/LDW and the V* macros come
 * from the SIMD support headers, which are not visible here.
 */
454 static void hc2cfdftv_32(R *Rp, R *Ip, R *Rm, R *Im, const R *W, stride rs, INT mb, INT me, INT ms)
455 {
/* Constants declared with DVK; each name encodes the leading digits of its value. */
456 DVK(KP555570233, +0.555570233019602224742830813948532874374937191);
457 DVK(KP831469612, +0.831469612302545237078788377617905756738560812);
458 DVK(KP195090322, +0.195090322016128267848284868477022240927691618);
459 DVK(KP980785280, +0.980785280403230449126182236134239036973933731);
460 DVK(KP382683432, +0.382683432365089771728459984030398866761344562);
461 DVK(KP923879532, +0.923879532511286756128183189396788286822416626);
462 DVK(KP707106781, +0.707106781186547524400844362104849039284835938);
463 DVK(KP353553390, +0.353553390593273762200422181052424519642417969);
464 DVK(KP500000000, +0.500000000000000000000000000000000000000000000);
465 {
466 INT m;
/* One pass per group of VL values of m; all five pointers are stepped in the increment clause. */
467 for (m = mb, W = W + ((mb - 1) * ((TWVL / VL) * 62)); m < me; m = m + VL, Rp = Rp + (VL * ms), Ip = Ip + (VL * ms), Rm = Rm - (VL * ms), Im = Im - (VL * ms), W = W + (TWVL * 62), MAKE_VOLATILE_STRIDE(128, rs)) {
468 V Ta, T2m, Tx, T2h, T3R, T4h, T3q, T4g, T3B, T4n, T3E, T4o, T1B, T2S, T1O;
469 V T2R, TV, T2p, T1i, T2o, T3L, T4q, T3I, T4r, T3w, T4k, T3t, T4j, T26, T2V;
470 V T2d, T2U;
471 {
472 V T4, T1m, T1H, T2j, T1M, T2l, T9, T1o, Tf, T1r, Tq, T1w, Tv, T1y, Tk;
473 V T1t, Tl, Tw, T3P, T3Q, T3o, T3p, T3z, T3A, T3C, T3D, T1p, T1N, T1A, T1C;
474 V T1u, T1z;
475 {
476 V T1, T3, T2, T1l, T1G, T1F, T1E, T1D, T2i, T1L, T1K, T1J, T1I, T2k, T6;
477 V T8, T7, T5, T1n, Tc, Te, Td, Tb, T1q, Tn, Tp, To, Tm, T1v, Ts;
478 V Tu, Tt, Tr, T1x, Th, Tj, Ti, Tg, T1s;
/* Even-indexed inputs (0, 2, ..., 14): load, conjugate the Rm half, twiddle the sum and difference. */
479 T1 = LD(&(Rp[0]), ms, &(Rp[0]));
480 T2 = LD(&(Rm[0]), -ms, &(Rm[0]));
481 T3 = VCONJ(T2);
482 T4 = VADD(T1, T3);
483 T1l = LDW(&(W[0]));
484 T1m = VZMULIJ(T1l, VSUB(T3, T1));
485 T1G = LD(&(Rp[WS(rs, 4)]), ms, &(Rp[0]));
486 T1E = LD(&(Rm[WS(rs, 4)]), -ms, &(Rm[0]));
487 T1F = VCONJ(T1E);
488 T1D = LDW(&(W[TWVL * 16]));
489 T1H = VZMULIJ(T1D, VSUB(T1F, T1G));
490 T2i = LDW(&(W[TWVL * 14]));
491 T2j = VZMULJ(T2i, VADD(T1G, T1F));
492 T1L = LD(&(Rp[WS(rs, 12)]), ms, &(Rp[0]));
493 T1J = LD(&(Rm[WS(rs, 12)]), -ms, &(Rm[0]));
494 T1K = VCONJ(T1J);
495 T1I = LDW(&(W[TWVL * 48]));
496 T1M = VZMULIJ(T1I, VSUB(T1K, T1L));
497 T2k = LDW(&(W[TWVL * 46]));
498 T2l = VZMULJ(T2k, VADD(T1L, T1K));
499 T6 = LD(&(Rp[WS(rs, 8)]), ms, &(Rp[0]));
500 T7 = LD(&(Rm[WS(rs, 8)]), -ms, &(Rm[0]));
501 T8 = VCONJ(T7);
502 T5 = LDW(&(W[TWVL * 30]));
503 T9 = VZMULJ(T5, VADD(T6, T8));
504 T1n = LDW(&(W[TWVL * 32]));
505 T1o = VZMULIJ(T1n, VSUB(T8, T6));
506 Tc = LD(&(Rp[WS(rs, 2)]), ms, &(Rp[0]));
507 Td = LD(&(Rm[WS(rs, 2)]), -ms, &(Rm[0]));
508 Te = VCONJ(Td);
509 Tb = LDW(&(W[TWVL * 6]));
510 Tf = VZMULJ(Tb, VADD(Tc, Te));
511 T1q = LDW(&(W[TWVL * 8]));
512 T1r = VZMULIJ(T1q, VSUB(Te, Tc));
513 Tn = LD(&(Rp[WS(rs, 14)]), ms, &(Rp[0]));
514 To = LD(&(Rm[WS(rs, 14)]), -ms, &(Rm[0]));
515 Tp = VCONJ(To);
516 Tm = LDW(&(W[TWVL * 54]));
517 Tq = VZMULJ(Tm, VADD(Tn, Tp));
518 T1v = LDW(&(W[TWVL * 56]));
519 T1w = VZMULIJ(T1v, VSUB(Tp, Tn));
520 Ts = LD(&(Rp[WS(rs, 6)]), ms, &(Rp[0]));
521 Tt = LD(&(Rm[WS(rs, 6)]), -ms, &(Rm[0]));
522 Tu = VCONJ(Tt);
523 Tr = LDW(&(W[TWVL * 22]));
524 Tv = VZMULJ(Tr, VADD(Ts, Tu));
525 T1x = LDW(&(W[TWVL * 24]));
526 T1y = VZMULIJ(T1x, VSUB(Tu, Ts));
527 Th = LD(&(Rp[WS(rs, 10)]), ms, &(Rp[0]));
528 Ti = LD(&(Rm[WS(rs, 10)]), -ms, &(Rm[0]));
529 Tj = VCONJ(Ti);
530 Tg = LDW(&(W[TWVL * 38]));
531 Tk = VZMULJ(Tg, VADD(Th, Tj));
532 T1s = LDW(&(W[TWVL * 40]));
533 T1t = VZMULIJ(T1s, VSUB(Tj, Th));
534 }
535 Ta = VMUL(LDK(KP500000000), VSUB(T4, T9));
536 T2m = VSUB(T2j, T2l);
537 Tl = VSUB(Tf, Tk);
538 Tw = VSUB(Tq, Tv);
539 Tx = VMUL(LDK(KP353553390), VADD(Tl, Tw));
540 T2h = VMUL(LDK(KP707106781), VSUB(Tw, Tl));
541 T3P = VADD(Tq, Tv);
542 T3Q = VADD(Tf, Tk);
543 T3R = VSUB(T3P, T3Q);
544 T4h = VADD(T3Q, T3P);
545 T3o = VADD(T4, T9);
546 T3p = VADD(T2j, T2l);
547 T3q = VMUL(LDK(KP500000000), VSUB(T3o, T3p));
548 T4g = VADD(T3o, T3p);
549 T3z = VADD(T1m, T1o);
550 T3A = VADD(T1H, T1M);
551 T3B = VSUB(T3z, T3A);
552 T4n = VADD(T3z, T3A);
553 T3C = VADD(T1w, T1y);
554 T3D = VADD(T1r, T1t);
555 T3E = VSUB(T3C, T3D);
556 T4o = VADD(T3D, T3C);
557 T1p = VSUB(T1m, T1o);
558 T1N = VSUB(T1H, T1M);
559 T1u = VSUB(T1r, T1t);
560 T1z = VSUB(T1w, T1y);
561 T1A = VMUL(LDK(KP707106781), VADD(T1u, T1z));
562 T1C = VMUL(LDK(KP707106781), VSUB(T1z, T1u));
563 T1B = VADD(T1p, T1A);
564 T2S = VADD(T1N, T1C);
565 T1O = VSUB(T1C, T1N);
566 T2R = VSUB(T1p, T1A);
567 }
568 {
569 V TD, T1R, T1b, T29, T1g, T2b, TI, T1T, TO, T1Y, T10, T22, T15, T24, TT;
570 V T1W, TJ, TU, T16, T1h, T3J, T3K, T3G, T3H, T3u, T3v, T3r, T3s, T25, T2c;
571 V T20, T27, T1U, T1Z;
572 {
573 V TA, TC, TB, Tz, T1Q, T18, T1a, T19, T17, T28, T1d, T1f, T1e, T1c, T2a;
574 V TF, TH, TG, TE, T1S, TL, TN, TM, TK, T1X, TX, TZ, TY, TW, T21;
575 V T12, T14, T13, T11, T23, TQ, TS, TR, TP, T1V;
/* Odd-indexed inputs (1, 3, ..., 15): same load / conjugate / twiddle pattern. */
576 TA = LD(&(Rp[WS(rs, 1)]), ms, &(Rp[WS(rs, 1)]));
577 TB = LD(&(Rm[WS(rs, 1)]), -ms, &(Rm[WS(rs, 1)]));
578 TC = VCONJ(TB);
579 Tz = LDW(&(W[TWVL * 2]));
580 TD = VZMULJ(Tz, VADD(TA, TC));
581 T1Q = LDW(&(W[TWVL * 4]));
582 T1R = VZMULIJ(T1Q, VSUB(TC, TA));
583 T18 = LD(&(Rp[WS(rs, 3)]), ms, &(Rp[WS(rs, 1)]));
584 T19 = LD(&(Rm[WS(rs, 3)]), -ms, &(Rm[WS(rs, 1)]));
585 T1a = VCONJ(T19);
586 T17 = LDW(&(W[TWVL * 10]));
587 T1b = VZMULJ(T17, VADD(T18, T1a));
588 T28 = LDW(&(W[TWVL * 12]));
589 T29 = VZMULIJ(T28, VSUB(T1a, T18));
590 T1d = LD(&(Rp[WS(rs, 11)]), ms, &(Rp[WS(rs, 1)]));
591 T1e = LD(&(Rm[WS(rs, 11)]), -ms, &(Rm[WS(rs, 1)]));
592 T1f = VCONJ(T1e);
593 T1c = LDW(&(W[TWVL * 42]));
594 T1g = VZMULJ(T1c, VADD(T1d, T1f));
595 T2a = LDW(&(W[TWVL * 44]));
596 T2b = VZMULIJ(T2a, VSUB(T1f, T1d));
597 TF = LD(&(Rp[WS(rs, 9)]), ms, &(Rp[WS(rs, 1)]));
598 TG = LD(&(Rm[WS(rs, 9)]), -ms, &(Rm[WS(rs, 1)]));
599 TH = VCONJ(TG);
600 TE = LDW(&(W[TWVL * 34]));
601 TI = VZMULJ(TE, VADD(TF, TH));
602 T1S = LDW(&(W[TWVL * 36]));
603 T1T = VZMULIJ(T1S, VSUB(TH, TF));
604 TL = LD(&(Rp[WS(rs, 5)]), ms, &(Rp[WS(rs, 1)]));
605 TM = LD(&(Rm[WS(rs, 5)]), -ms, &(Rm[WS(rs, 1)]));
606 TN = VCONJ(TM);
607 TK = LDW(&(W[TWVL * 18]));
608 TO = VZMULJ(TK, VADD(TL, TN));
609 T1X = LDW(&(W[TWVL * 20]));
610 T1Y = VZMULIJ(T1X, VSUB(TN, TL));
611 TX = LD(&(Rp[WS(rs, 15)]), ms, &(Rp[WS(rs, 1)]));
612 TY = LD(&(Rm[WS(rs, 15)]), -ms, &(Rm[WS(rs, 1)]));
613 TZ = VCONJ(TY);
614 TW = LDW(&(W[TWVL * 58]));
615 T10 = VZMULJ(TW, VADD(TX, TZ));
616 T21 = LDW(&(W[TWVL * 60]));
617 T22 = VZMULIJ(T21, VSUB(TZ, TX));
618 T12 = LD(&(Rp[WS(rs, 7)]), ms, &(Rp[WS(rs, 1)]));
619 T13 = LD(&(Rm[WS(rs, 7)]), -ms, &(Rm[WS(rs, 1)]));
620 T14 = VCONJ(T13);
621 T11 = LDW(&(W[TWVL * 26]));
622 T15 = VZMULJ(T11, VADD(T12, T14));
623 T23 = LDW(&(W[TWVL * 28]));
624 T24 = VZMULIJ(T23, VSUB(T14, T12));
625 TQ = LD(&(Rp[WS(rs, 13)]), ms, &(Rp[WS(rs, 1)]));
626 TR = LD(&(Rm[WS(rs, 13)]), -ms, &(Rm[WS(rs, 1)]));
627 TS = VCONJ(TR);
628 TP = LDW(&(W[TWVL * 50]));
629 TT = VZMULJ(TP, VADD(TQ, TS));
630 T1V = LDW(&(W[TWVL * 52]));
631 T1W = VZMULIJ(T1V, VSUB(TS, TQ));
632 }
633 TJ = VSUB(TD, TI);
634 TU = VSUB(TO, TT);
635 TV = VFNMS(LDK(KP382683432), TU, VMUL(LDK(KP923879532), TJ));
636 T2p = VFMA(LDK(KP382683432), TJ, VMUL(LDK(KP923879532), TU));
637 T16 = VSUB(T10, T15);
638 T1h = VSUB(T1b, T1g);
639 T1i = VFMA(LDK(KP923879532), T16, VMUL(LDK(KP382683432), T1h));
640 T2o = VFNMS(LDK(KP923879532), T1h, VMUL(LDK(KP382683432), T16));
641 T3J = VADD(T1Y, T1W);
642 T3K = VADD(T1R, T1T);
643 T3L = VSUB(T3J, T3K);
644 T4q = VADD(T3K, T3J);
645 T3G = VADD(T22, T24);
646 T3H = VADD(T29, T2b);
647 T3I = VSUB(T3G, T3H);
648 T4r = VADD(T3G, T3H);
649 T3u = VADD(T10, T15);
650 T3v = VADD(T1b, T1g);
651 T3w = VSUB(T3u, T3v);
652 T4k = VADD(T3u, T3v);
653 T3r = VADD(TD, TI);
654 T3s = VADD(TO, TT);
655 T3t = VSUB(T3r, T3s);
656 T4j = VADD(T3r, T3s);
657 T25 = VSUB(T22, T24);
658 T2c = VSUB(T29, T2b);
659 T1U = VSUB(T1R, T1T);
660 T1Z = VSUB(T1W, T1Y);
661 T20 = VMUL(LDK(KP707106781), VADD(T1U, T1Z));
662 T27 = VMUL(LDK(KP707106781), VSUB(T1Z, T1U));
663 T26 = VADD(T20, T25);
664 T2V = VADD(T27, T2c);
665 T2d = VSUB(T27, T2c);
666 T2U = VSUB(T25, T20);
667 }
668 {
669 V T4m, T4w, T4t, T4x, T4i, T4l, T4p, T4s, T4u, T4z, T4v, T4y, T4E, T4L, T4H;
670 V T4K, T4A, T4F, T4D, T4G, T4B, T4C, T4I, T4N, T4J, T4M, T3O, T4c, T4d, T3X;
671 V T40, T46, T49, T41, T3y, T47, T3T, T45, T3N, T44, T3W, T48, T3x, T3S, T3F;
672 V T3M, T3U, T3V, T3Y, T4e, T4f, T3Z, T42, T4a, T4b, T43;
/* Combine and store: outputs at even Rp indices and odd Rm indices come first. */
673 T4i = VADD(T4g, T4h);
674 T4l = VADD(T4j, T4k);
675 T4m = VADD(T4i, T4l);
676 T4w = VSUB(T4i, T4l);
677 T4p = VADD(T4n, T4o);
678 T4s = VADD(T4q, T4r);
679 T4t = VADD(T4p, T4s);
680 T4x = VBYI(VSUB(T4s, T4p));
681 T4u = VCONJ(VMUL(LDK(KP500000000), VSUB(T4m, T4t)));
682 ST(&(Rm[WS(rs, 15)]), T4u, -ms, &(Rm[WS(rs, 1)]));
683 T4z = VMUL(LDK(KP500000000), VADD(T4w, T4x));
684 ST(&(Rp[WS(rs, 8)]), T4z, ms, &(Rp[0]));
685 T4v = VMUL(LDK(KP500000000), VADD(T4m, T4t));
686 ST(&(Rp[0]), T4v, ms, &(Rp[0]));
687 T4y = VCONJ(VMUL(LDK(KP500000000), VSUB(T4w, T4x)));
688 ST(&(Rm[WS(rs, 7)]), T4y, -ms, &(Rm[WS(rs, 1)]));
689 T4A = VMUL(LDK(KP500000000), VSUB(T4g, T4h));
690 T4F = VSUB(T4k, T4j);
691 T4B = VSUB(T4n, T4o);
692 T4C = VSUB(T4r, T4q);
693 T4D = VMUL(LDK(KP353553390), VADD(T4B, T4C));
694 T4G = VMUL(LDK(KP707106781), VSUB(T4C, T4B));
695 T4E = VADD(T4A, T4D);
696 T4L = VMUL(LDK(KP500000000), VBYI(VSUB(T4G, T4F)));
697 T4H = VMUL(LDK(KP500000000), VBYI(VADD(T4F, T4G)));
698 T4K = VSUB(T4A, T4D);
699 T4I = VCONJ(VSUB(T4E, T4H));
700 ST(&(Rm[WS(rs, 3)]), T4I, -ms, &(Rm[WS(rs, 1)]));
701 T4N = VADD(T4K, T4L);
702 ST(&(Rp[WS(rs, 12)]), T4N, ms, &(Rp[0]));
703 T4J = VADD(T4E, T4H);
704 ST(&(Rp[WS(rs, 4)]), T4J, ms, &(Rp[0]));
705 T4M = VCONJ(VSUB(T4K, T4L));
706 ST(&(Rm[WS(rs, 11)]), T4M, -ms, &(Rm[WS(rs, 1)]));
707 T3x = VMUL(LDK(KP353553390), VADD(T3t, T3w));
708 T3y = VADD(T3q, T3x);
709 T47 = VSUB(T3q, T3x);
710 T3S = VMUL(LDK(KP707106781), VSUB(T3w, T3t));
711 T3T = VADD(T3R, T3S);
712 T45 = VSUB(T3S, T3R);
713 T3F = VFMA(LDK(KP923879532), T3B, VMUL(LDK(KP382683432), T3E));
714 T3M = VFNMS(LDK(KP382683432), T3L, VMUL(LDK(KP923879532), T3I));
715 T3N = VMUL(LDK(KP500000000), VADD(T3F, T3M));
716 T44 = VSUB(T3M, T3F);
717 T3U = VFNMS(LDK(KP382683432), T3B, VMUL(LDK(KP923879532), T3E));
718 T3V = VFMA(LDK(KP923879532), T3L, VMUL(LDK(KP382683432), T3I));
719 T3W = VADD(T3U, T3V);
720 T48 = VMUL(LDK(KP500000000), VSUB(T3V, T3U));
721 T3O = VADD(T3y, T3N);
722 T4c = VMUL(LDK(KP500000000), VBYI(VADD(T45, T44)));
723 T4d = VADD(T47, T48);
724 T3X = VMUL(LDK(KP500000000), VBYI(VADD(T3T, T3W)));
725 T40 = VSUB(T3y, T3N);
726 T46 = VMUL(LDK(KP500000000), VBYI(VSUB(T44, T45)));
727 T49 = VSUB(T47, T48);
728 T41 = VMUL(LDK(KP500000000), VBYI(VSUB(T3W, T3T)));
729 T3Y = VCONJ(VSUB(T3O, T3X));
730 ST(&(Rm[WS(rs, 1)]), T3Y, -ms, &(Rm[WS(rs, 1)]));
731 T4e = VADD(T4c, T4d);
732 ST(&(Rp[WS(rs, 6)]), T4e, ms, &(Rp[0]));
733 T4f = VCONJ(VSUB(T4d, T4c));
734 ST(&(Rm[WS(rs, 5)]), T4f, -ms, &(Rm[WS(rs, 1)]));
735 T3Z = VADD(T3O, T3X);
736 ST(&(Rp[WS(rs, 2)]), T3Z, ms, &(Rp[0]));
737 T42 = VCONJ(VSUB(T40, T41));
738 ST(&(Rm[WS(rs, 13)]), T42, -ms, &(Rm[WS(rs, 1)]));
739 T4a = VADD(T46, T49);
740 ST(&(Rp[WS(rs, 10)]), T4a, ms, &(Rp[0]));
741 T4b = VCONJ(VSUB(T49, T46));
742 ST(&(Rm[WS(rs, 9)]), T4b, -ms, &(Rm[WS(rs, 1)]));
743 T43 = VADD(T40, T41);
744 ST(&(Rp[WS(rs, 14)]), T43, ms, &(Rp[0]));
745 {
746 V T2g, T2K, T2L, T2v, T2y, T2E, T2H, T2z, T1k, T2F, T2u, T2G, T2f, T2C, T2r;
747 V T2D, Ty, T1j, T2s, T2t, T1P, T2e, T2n, T2q, T2w, T2M, T2N, T2x, T2A, T2I;
748 V T2J, T2B;
/* Outputs Rp[1, 7, 9, 15] and Rm[0, 6, 8, 14]. */
749 Ty = VADD(Ta, Tx);
750 T1j = VMUL(LDK(KP500000000), VADD(TV, T1i));
751 T1k = VADD(Ty, T1j);
752 T2F = VSUB(Ty, T1j);
753 T2s = VFNMS(LDK(KP195090322), T1B, VMUL(LDK(KP980785280), T1O));
754 T2t = VFMA(LDK(KP195090322), T26, VMUL(LDK(KP980785280), T2d));
755 T2u = VADD(T2s, T2t);
756 T2G = VMUL(LDK(KP500000000), VSUB(T2t, T2s));
757 T1P = VFMA(LDK(KP980785280), T1B, VMUL(LDK(KP195090322), T1O));
758 T2e = VFNMS(LDK(KP195090322), T2d, VMUL(LDK(KP980785280), T26));
759 T2f = VMUL(LDK(KP500000000), VADD(T1P, T2e));
760 T2C = VSUB(T2e, T1P);
761 T2n = VSUB(T2h, T2m);
762 T2q = VSUB(T2o, T2p);
763 T2r = VADD(T2n, T2q);
764 T2D = VSUB(T2q, T2n);
765 T2g = VADD(T1k, T2f);
766 T2K = VMUL(LDK(KP500000000), VBYI(VADD(T2D, T2C)));
767 T2L = VADD(T2F, T2G);
768 T2v = VMUL(LDK(KP500000000), VBYI(VADD(T2r, T2u)));
769 T2y = VSUB(T1k, T2f);
770 T2E = VMUL(LDK(KP500000000), VBYI(VSUB(T2C, T2D)));
771 T2H = VSUB(T2F, T2G);
772 T2z = VMUL(LDK(KP500000000), VBYI(VSUB(T2u, T2r)));
773 T2w = VCONJ(VSUB(T2g, T2v));
774 ST(&(Rm[0]), T2w, -ms, &(Rm[0]));
775 T2M = VADD(T2K, T2L);
776 ST(&(Rp[WS(rs, 7)]), T2M, ms, &(Rp[WS(rs, 1)]));
777 T2N = VCONJ(VSUB(T2L, T2K));
778 ST(&(Rm[WS(rs, 6)]), T2N, -ms, &(Rm[0]));
779 T2x = VADD(T2g, T2v);
780 ST(&(Rp[WS(rs, 1)]), T2x, ms, &(Rp[WS(rs, 1)]));
781 T2A = VCONJ(VSUB(T2y, T2z));
782 ST(&(Rm[WS(rs, 14)]), T2A, -ms, &(Rm[0]));
783 T2I = VADD(T2E, T2H);
784 ST(&(Rp[WS(rs, 9)]), T2I, ms, &(Rp[WS(rs, 1)]));
785 T2J = VCONJ(VSUB(T2H, T2E));
786 ST(&(Rm[WS(rs, 8)]), T2J, -ms, &(Rm[0]));
787 T2B = VADD(T2y, T2z);
788 ST(&(Rp[WS(rs, 15)]), T2B, ms, &(Rp[WS(rs, 1)]));
789 }
790 {
791 V T2Y, T3k, T3l, T35, T38, T3e, T3h, T39, T2Q, T3f, T34, T3g, T2X, T3c, T31;
792 V T3d, T2O, T2P, T32, T33, T2T, T2W, T2Z, T30, T36, T3m, T3n, T37, T3a, T3i;
793 V T3j, T3b;
/* Outputs Rp[3, 5, 11, 13] and Rm[2, 4, 10, 12]. */
794 T2O = VSUB(Ta, Tx);
795 T2P = VMUL(LDK(KP500000000), VADD(T2p, T2o));
796 T2Q = VADD(T2O, T2P);
797 T3f = VSUB(T2O, T2P);
798 T32 = VFNMS(LDK(KP555570233), T2R, VMUL(LDK(KP831469612), T2S));
799 T33 = VFMA(LDK(KP555570233), T2U, VMUL(LDK(KP831469612), T2V));
800 T34 = VADD(T32, T33);
801 T3g = VMUL(LDK(KP500000000), VSUB(T33, T32));
802 T2T = VFMA(LDK(KP831469612), T2R, VMUL(LDK(KP555570233), T2S));
803 T2W = VFNMS(LDK(KP555570233), T2V, VMUL(LDK(KP831469612), T2U));
804 T2X = VMUL(LDK(KP500000000), VADD(T2T, T2W));
805 T3c = VSUB(T2W, T2T);
806 T2Z = VADD(T2m, T2h);
807 T30 = VSUB(T1i, TV);
808 T31 = VADD(T2Z, T30);
809 T3d = VSUB(T30, T2Z);
810 T2Y = VADD(T2Q, T2X);
811 T3k = VMUL(LDK(KP500000000), VBYI(VADD(T3d, T3c)));
812 T3l = VADD(T3f, T3g);
813 T35 = VMUL(LDK(KP500000000), VBYI(VADD(T31, T34)));
814 T38 = VSUB(T2Q, T2X);
815 T3e = VMUL(LDK(KP500000000), VBYI(VSUB(T3c, T3d)));
816 T3h = VSUB(T3f, T3g);
817 T39 = VMUL(LDK(KP500000000), VBYI(VSUB(T34, T31)));
818 T36 = VCONJ(VSUB(T2Y, T35));
819 ST(&(Rm[WS(rs, 2)]), T36, -ms, &(Rm[0]));
820 T3m = VADD(T3k, T3l);
821 ST(&(Rp[WS(rs, 5)]), T3m, ms, &(Rp[WS(rs, 1)]));
822 T3n = VCONJ(VSUB(T3l, T3k));
823 ST(&(Rm[WS(rs, 4)]), T3n, -ms, &(Rm[0]));
824 T37 = VADD(T2Y, T35);
825 ST(&(Rp[WS(rs, 3)]), T37, ms, &(Rp[WS(rs, 1)]));
826 T3a = VCONJ(VSUB(T38, T39));
827 ST(&(Rm[WS(rs, 12)]), T3a, -ms, &(Rm[0]));
828 T3i = VADD(T3e, T3h);
829 ST(&(Rp[WS(rs, 11)]), T3i, ms, &(Rp[WS(rs, 1)]));
830 T3j = VCONJ(VSUB(T3h, T3e));
831 ST(&(Rm[WS(rs, 10)]), T3j, -ms, &(Rm[0]));
832 T3b = VADD(T38, T39);
833 ST(&(Rp[WS(rs, 13)]), T3b, ms, &(Rp[WS(rs, 1)]));
834 }
835 }
836 }
837 }
838 VLEAVE();
839 }
840
/*
 * Twiddle instruction table for the non-FMA build: one VTW(1, k) entry for
 * each k = 1..31, terminated by a {TW_NEXT, VL, 0} entry. It is referenced by
 * the codelet descriptor `desc`. The 31 entries match the 31 twiddle vectors
 * the kernel loads from W per iteration.
 * NOTE(review): VTW and TW_NEXT are defined outside this file.
 */
841 static const tw_instr twinstr[] = {
842 VTW(1, 1),
843 VTW(1, 2),
844 VTW(1, 3),
845 VTW(1, 4),
846 VTW(1, 5),
847 VTW(1, 6),
848 VTW(1, 7),
849 VTW(1, 8),
850 VTW(1, 9),
851 VTW(1, 10),
852 VTW(1, 11),
853 VTW(1, 12),
854 VTW(1, 13),
855 VTW(1, 14),
856 VTW(1, 15),
857 VTW(1, 16),
858 VTW(1, 17),
859 VTW(1, 18),
860 VTW(1, 19),
861 VTW(1, 20),
862 VTW(1, 21),
863 VTW(1, 22),
864 VTW(1, 23),
865 VTW(1, 24),
866 VTW(1, 25),
867 VTW(1, 26),
868 VTW(1, 27),
869 VTW(1, 28),
870 VTW(1, 29),
871 VTW(1, 30),
872 VTW(1, 31),
873 {TW_NEXT, VL, 0}
874 };
875
/*
 * Codelet descriptor for the non-FMA build: size 32, the codelet name string,
 * the twiddle table, &GENUS, and the counts {233, 117, 16, 0}. The first three
 * counts equal the additions / multiplications / fused multiply-adds quoted in
 * the generator comment above the kernel.
 * NOTE(review): the meaning of the fourth count (0) is not shown here.
 */
876 static const hc2c_desc desc = { 32, XSIMD_STRING("hc2cfdftv_32"), twinstr, &GENUS, {233, 117, 16, 0} };
877
/*
 * Registration entry point (non-FMA build): hands the kernel hc2cfdftv_32 and
 * its descriptor to planner p through X(khc2c_register), tagged HC2C_VIA_DFT.
 */
878 void XSIMD(codelet_hc2cfdftv_32) (planner *p) {
879 X(khc2c_register) (p, hc2cfdftv_32, &desc, HC2C_VIA_DFT);
880 }
881 #endif /* HAVE_FMA */