comparison src/fftw-3.3.8/rdft/simd/common/hc2cfdftv_32.c @ 167:bd3cc4d1df30

Add FFTW 3.3.8 source, and a Linux build
author Chris Cannam <cannam@all-day-breakfast.com>
date Tue, 19 Nov 2019 14:52:55 +0000
parents
children
comparison
equal deleted inserted replaced
166:cbd6d7e562c7 167:bd3cc4d1df30
1 /*
2 * Copyright (c) 2003, 2007-14 Matteo Frigo
3 * Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
4 *
5 * This program is free software; you can redistribute it and/or modify
6 * it under the terms of the GNU General Public License as published by
7 * the Free Software Foundation; either version 2 of the License, or
8 * (at your option) any later version.
9 *
10 * This program is distributed in the hope that it will be useful,
11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 * GNU General Public License for more details.
14 *
15 * You should have received a copy of the GNU General Public License
16 * along with this program; if not, write to the Free Software
17 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
18 *
19 */
20
21 /* This file was automatically generated --- DO NOT EDIT */
22 /* Generated on Thu May 24 08:08:11 EDT 2018 */
23
24 #include "rdft/codelet-rdft.h"
25
26 #if defined(ARCH_PREFERS_FMA) || defined(ISA_EXTENSION_PREFERS_FMA)
27
28 /* Generated by: ../../../genfft/gen_hc2cdft_c.native -fma -simd -compact -variables 4 -pipeline-latency 8 -trivial-stores -variables 32 -no-generate-bytw -n 32 -dit -name hc2cfdftv_32 -include rdft/simd/hc2cfv.h */
29
30 /*
31 * This function contains 249 FP additions, 224 FP multiplications,
32 * (or, 119 additions, 94 multiplications, 130 fused multiply/add),
33 * 154 stack variables, 8 constants, and 64 memory accesses
34 */
35 #include "rdft/simd/hc2cfv.h"
36
/*
 * hc2cfdftv_32, FMA variant (this branch is compiled only when
 * ARCH_PREFERS_FMA or ISA_EXTENSION_PREFERS_FMA is defined).
 * Machine-generated by genfft with "-n 32 -dit" (see the command line above);
 * do not hand-edit the arithmetic.
 *
 * Parameters, as used by the visible code:
 *   Rp, Rm  read and then overwritten in place at offsets WS(rs, 0..15).
 *   Ip, Im  stepped in the loop header but never dereferenced in this body.
 *   W       twiddle table; first offset by (mb - 1) * ((TWVL / VL) * 62),
 *           then advanced by TWVL * 62 per iteration.
 *   rs      stride handed to WS() to address the 16 elements of each array.
 *   mb, me  half-open range [mb, me) for the loop counter m (step VL).
 *   ms      per-iteration pointer step: Rp/Ip move by +VL * ms,
 *           Rm/Im move by -VL * ms.
 *
 * NOTE(review): the exact semantics of LD/ST/LDW/LDK and the V* arithmetic
 * macros come from rdft/simd/hc2cfv.h and the SIMD support headers, which are
 * not visible here -- confirm there before relying on the notes below.
 */
37 static void hc2cfdftv_32(R *Rp, R *Ip, R *Rm, R *Im, const R *W, stride rs, INT mb, INT me, INT ms)
38 {
/* Named constants; each KPxxxxxxxxx name spells the leading digits of its value. */
39 DVK(KP831469612, +0.831469612302545237078788377617905756738560812);
40 DVK(KP668178637, +0.668178637919298919997757686523080761552472251);
41 DVK(KP980785280, +0.980785280403230449126182236134239036973933731);
42 DVK(KP198912367, +0.198912367379658006911597622644676228597850501);
43 DVK(KP923879532, +0.923879532511286756128183189396788286822416626);
44 DVK(KP500000000, +0.500000000000000000000000000000000000000000000);
45 DVK(KP707106781, +0.707106781186547524400844362104849039284835938);
46 DVK(KP414213562, +0.414213562373095048801688724209698078569671875);
47 {
48 INT m;
/* Each iteration consumes VL values of m and one TWVL * 62 slice of W. */
49 for (m = mb, W = W + ((mb - 1) * ((TWVL / VL) * 62)); m < me; m = m + VL, Rp = Rp + (VL * ms), Ip = Ip + (VL * ms), Rm = Rm - (VL * ms), Im = Im - (VL * ms), W = W + (TWVL * 62), MAKE_VOLATILE_STRIDE(128, rs)) {
50 V T47, T48, T4l, T3w, T3F, T3B, T41, Ts, T2y, T1Q, T2B, T27, T2J, T3a, T40;
51 V T1X, T2C, T43, T44, T4a, T4b, T4m, T3p, T3E, T15, T2K, T1u, T2F, T3h, T3C;
52 V T1n, T2E, T2a, T2z, T1a, T18, TU, T3m, T3f, T1r, T1p, T13, T3n, T3e, TB;
53 V T3k, T1l, T3c, TK, T3j, T1g, T3b, T3l, T3o, TL, T14, T1s, T1t, T3d, T3g;
54 V T1b, T1m, T28, T29, T3Q, T3W, T3T, T3X, T3O, T3P, T3R, T3S, T3U, T3Z, T3V;
55 V T3Y;
/*
 * Even-indexed inputs: WS(rs, 0), 8, 12, 4, 2, 10, 14, 6.
 * For each index the Rp/Rm pair is combined with VFMACONJ and VFNMSCONJ and
 * multiplied by a twiddle from W (VZMULJ / VZMULIJ); index 0 only uses the
 * VFNMSCONJ product with W[0] and the untwiddled VFMACONJ value T3.
 */
56 {
57 V T1U, T1S, T3, T3u, T7, T1z, T1D, T3t, T24, T22, Tc, Tg, Th, T3q, T1J;
58 V Tl, Tp, Tq, T3r, T1O, T3s, T3v, T3z, T3A, T8, Tr, T1E, T1P, T25, T26;
59 V T38, T39, T1V, T1W;
60 {
61 V T1, T2, T5, T6, T1T, T1R, T4, T1x, T1y, T1B, T1C, T1w, T1A, T23, T21;
62 V T1I, T1G, Ta, Tb, T9, T1H, Te, Tf, Td, T1F, T1N, T1L, Tj, Tk, Ti;
63 V T1M, Tn, To, Tm, T1K;
64 T1 = LD(&(Rp[0]), ms, &(Rp[0]));
65 T2 = LD(&(Rm[0]), -ms, &(Rm[0]));
66 T1T = LDW(&(W[0]));
67 T1U = VZMULIJ(T1T, VFNMSCONJ(T2, T1));
68 T5 = LD(&(Rp[WS(rs, 8)]), ms, &(Rp[0]));
69 T6 = LD(&(Rm[WS(rs, 8)]), -ms, &(Rm[0]));
70 T1R = LDW(&(W[TWVL * 32]));
71 T1S = VZMULIJ(T1R, VFNMSCONJ(T6, T5));
72 T3 = VFMACONJ(T2, T1);
73 T3u = VADD(T1U, T1S);
74 T4 = LDW(&(W[TWVL * 30]));
75 T7 = VZMULJ(T4, VFMACONJ(T6, T5));
76 T1x = LD(&(Rp[WS(rs, 12)]), ms, &(Rp[0]));
77 T1y = LD(&(Rm[WS(rs, 12)]), -ms, &(Rm[0]));
78 T1w = LDW(&(W[TWVL * 48]));
79 T1z = VZMULIJ(T1w, VFNMSCONJ(T1y, T1x));
80 T1B = LD(&(Rp[WS(rs, 4)]), ms, &(Rp[0]));
81 T1C = LD(&(Rm[WS(rs, 4)]), -ms, &(Rm[0]));
82 T1A = LDW(&(W[TWVL * 16]));
83 T1D = VZMULIJ(T1A, VFNMSCONJ(T1C, T1B));
84 T3t = VADD(T1D, T1z);
85 T23 = LDW(&(W[TWVL * 46]));
86 T24 = VZMULJ(T23, VFMACONJ(T1y, T1x));
87 T21 = LDW(&(W[TWVL * 14]));
88 T22 = VZMULJ(T21, VFMACONJ(T1C, T1B));
89 Ta = LD(&(Rp[WS(rs, 2)]), ms, &(Rp[0]));
90 Tb = LD(&(Rm[WS(rs, 2)]), -ms, &(Rm[0]));
91 T9 = LDW(&(W[TWVL * 6]));
92 Tc = VZMULJ(T9, VFMACONJ(Tb, Ta));
93 T1H = LDW(&(W[TWVL * 8]));
94 T1I = VZMULIJ(T1H, VFNMSCONJ(Tb, Ta));
95 Te = LD(&(Rp[WS(rs, 10)]), ms, &(Rp[0]));
96 Tf = LD(&(Rm[WS(rs, 10)]), -ms, &(Rm[0]));
97 Td = LDW(&(W[TWVL * 38]));
98 Tg = VZMULJ(Td, VFMACONJ(Tf, Te));
99 T1F = LDW(&(W[TWVL * 40]));
100 T1G = VZMULIJ(T1F, VFNMSCONJ(Tf, Te));
101 Th = VSUB(Tc, Tg);
102 T3q = VADD(T1I, T1G);
103 T1J = VSUB(T1G, T1I);
104 Tj = LD(&(Rp[WS(rs, 14)]), ms, &(Rp[0]));
105 Tk = LD(&(Rm[WS(rs, 14)]), -ms, &(Rm[0]));
106 Ti = LDW(&(W[TWVL * 54]));
107 Tl = VZMULJ(Ti, VFMACONJ(Tk, Tj));
108 T1M = LDW(&(W[TWVL * 56]));
109 T1N = VZMULIJ(T1M, VFNMSCONJ(Tk, Tj));
110 Tn = LD(&(Rp[WS(rs, 6)]), ms, &(Rp[0]));
111 To = LD(&(Rm[WS(rs, 6)]), -ms, &(Rm[0]));
112 Tm = LDW(&(W[TWVL * 22]));
113 Tp = VZMULJ(Tm, VFMACONJ(To, Tn));
114 T1K = LDW(&(W[TWVL * 24]));
115 T1L = VZMULIJ(T1K, VFNMSCONJ(To, Tn));
116 Tq = VSUB(Tl, Tp);
117 T3r = VADD(T1N, T1L);
118 T1O = VSUB(T1L, T1N);
119 }
/* Sums and differences of the even-index products, kept for the output stages. */
120 T47 = VADD(T3u, T3t);
121 T48 = VADD(T3q, T3r);
122 T4l = VSUB(T48, T47);
123 T3s = VSUB(T3q, T3r);
124 T3v = VSUB(T3t, T3u);
125 T3w = VFNMS(LDK(KP414213562), T3v, T3s);
126 T3F = VFMA(LDK(KP414213562), T3s, T3v);
127 T3z = VADD(Tl, Tp);
128 T3A = VADD(Tc, Tg);
129 T3B = VSUB(T3z, T3A);
130 T41 = VADD(T3A, T3z);
131 T8 = VSUB(T3, T7);
132 Tr = VADD(Th, Tq);
133 Ts = VFNMS(LDK(KP707106781), Tr, T8);
134 T2y = VFMA(LDK(KP707106781), Tr, T8);
135 T1E = VSUB(T1z, T1D);
136 T1P = VSUB(T1J, T1O);
137 T1Q = VFNMS(LDK(KP707106781), T1P, T1E);
138 T2B = VFMA(LDK(KP707106781), T1P, T1E);
139 T25 = VSUB(T22, T24);
140 T26 = VSUB(Tq, Th);
141 T27 = VFMA(LDK(KP707106781), T26, T25);
142 T2J = VFNMS(LDK(KP707106781), T26, T25);
143 T38 = VADD(T3, T7);
144 T39 = VADD(T22, T24);
145 T3a = VSUB(T38, T39);
146 T40 = VADD(T38, T39);
147 T1V = VSUB(T1S, T1U);
148 T1W = VADD(T1J, T1O);
149 T1X = VFNMS(LDK(KP707106781), T1W, T1V);
150 T2C = VFMA(LDK(KP707106781), T1W, T1V);
151 }
/*
 * Odd-indexed inputs: WS(rs, 3), 11, 15, 7, 5, 13, 1, 9.
 * Same load / combine / twiddle pattern as the even-indexed block above.
 */
152 {
153 V TP, TT, TN, TO, TM, T19, TR, TS, TQ, T17, TY, T12, TW, TX, TV;
154 V T1q, T10, T11, TZ, T1o, Tw, T1i, TA, T1k, Tu, Tv, Tt, T1h, Ty, Tz;
155 V Tx, T1j, TF, T1f, TJ, T1d, TD, TE, TC, T1e, TH, TI, TG, T1c;
156 TN = LD(&(Rp[WS(rs, 3)]), ms, &(Rp[WS(rs, 1)]));
157 TO = LD(&(Rm[WS(rs, 3)]), -ms, &(Rm[WS(rs, 1)]));
158 TM = LDW(&(W[TWVL * 10]));
159 TP = VZMULJ(TM, VFMACONJ(TO, TN));
160 T19 = LDW(&(W[TWVL * 12]));
161 T1a = VZMULIJ(T19, VFNMSCONJ(TO, TN));
162 TR = LD(&(Rp[WS(rs, 11)]), ms, &(Rp[WS(rs, 1)]));
163 TS = LD(&(Rm[WS(rs, 11)]), -ms, &(Rm[WS(rs, 1)]));
164 TQ = LDW(&(W[TWVL * 42]));
165 TT = VZMULJ(TQ, VFMACONJ(TS, TR));
166 T17 = LDW(&(W[TWVL * 44]));
167 T18 = VZMULIJ(T17, VFNMSCONJ(TS, TR));
168 TU = VSUB(TP, TT);
169 T3m = VADD(T1a, T18);
170 T3f = VADD(TP, TT);
171 TW = LD(&(Rp[WS(rs, 15)]), ms, &(Rp[WS(rs, 1)]));
172 TX = LD(&(Rm[WS(rs, 15)]), -ms, &(Rm[WS(rs, 1)]));
173 TV = LDW(&(W[TWVL * 58]));
174 TY = VZMULJ(TV, VFMACONJ(TX, TW));
175 T1q = LDW(&(W[TWVL * 60]));
176 T1r = VZMULIJ(T1q, VFNMSCONJ(TX, TW));
177 T10 = LD(&(Rp[WS(rs, 7)]), ms, &(Rp[WS(rs, 1)]));
178 T11 = LD(&(Rm[WS(rs, 7)]), -ms, &(Rm[WS(rs, 1)]));
179 TZ = LDW(&(W[TWVL * 26]));
180 T12 = VZMULJ(TZ, VFMACONJ(T11, T10));
181 T1o = LDW(&(W[TWVL * 28]));
182 T1p = VZMULIJ(T1o, VFNMSCONJ(T11, T10));
183 T13 = VSUB(TY, T12);
184 T3n = VADD(T1r, T1p);
185 T3e = VADD(TY, T12);
186 Tu = LD(&(Rp[WS(rs, 5)]), ms, &(Rp[WS(rs, 1)]));
187 Tv = LD(&(Rm[WS(rs, 5)]), -ms, &(Rm[WS(rs, 1)]));
188 Tt = LDW(&(W[TWVL * 18]));
189 Tw = VZMULJ(Tt, VFMACONJ(Tv, Tu));
190 T1h = LDW(&(W[TWVL * 20]));
191 T1i = VZMULIJ(T1h, VFNMSCONJ(Tv, Tu));
192 Ty = LD(&(Rp[WS(rs, 13)]), ms, &(Rp[WS(rs, 1)]));
193 Tz = LD(&(Rm[WS(rs, 13)]), -ms, &(Rm[WS(rs, 1)]));
194 Tx = LDW(&(W[TWVL * 50]));
195 TA = VZMULJ(Tx, VFMACONJ(Tz, Ty));
196 T1j = LDW(&(W[TWVL * 52]));
197 T1k = VZMULIJ(T1j, VFNMSCONJ(Tz, Ty));
198 TB = VSUB(Tw, TA);
199 T3k = VADD(T1i, T1k);
200 T1l = VSUB(T1i, T1k);
201 T3c = VADD(Tw, TA);
202 TD = LD(&(Rp[WS(rs, 1)]), ms, &(Rp[WS(rs, 1)]));
203 TE = LD(&(Rm[WS(rs, 1)]), -ms, &(Rm[WS(rs, 1)]));
204 TC = LDW(&(W[TWVL * 2]));
205 TF = VZMULJ(TC, VFMACONJ(TE, TD));
206 T1e = LDW(&(W[TWVL * 4]));
207 T1f = VZMULIJ(T1e, VFNMSCONJ(TE, TD));
208 TH = LD(&(Rp[WS(rs, 9)]), ms, &(Rp[WS(rs, 1)]));
209 TI = LD(&(Rm[WS(rs, 9)]), -ms, &(Rm[WS(rs, 1)]));
210 TG = LDW(&(W[TWVL * 34]));
211 TJ = VZMULJ(TG, VFMACONJ(TI, TH));
212 T1c = LDW(&(W[TWVL * 36]));
213 T1d = VZMULIJ(T1c, VFNMSCONJ(TI, TH));
214 TK = VSUB(TF, TJ);
215 T3j = VADD(T1f, T1d);
216 T1g = VSUB(T1d, T1f);
217 T3b = VADD(TF, TJ);
218 }
/* Sums and differences of the odd-index products, kept for the output stages. */
219 T43 = VADD(T3b, T3c);
220 T44 = VADD(T3e, T3f);
221 T4a = VADD(T3j, T3k);
222 T4b = VADD(T3n, T3m);
223 T4m = VSUB(T4a, T4b);
224 T3l = VSUB(T3j, T3k);
225 T3o = VSUB(T3m, T3n);
226 T3p = VFMA(LDK(KP414213562), T3o, T3l);
227 T3E = VFNMS(LDK(KP414213562), T3l, T3o);
228 TL = VFMA(LDK(KP414213562), TK, TB);
229 T14 = VFNMS(LDK(KP414213562), T13, TU);
230 T15 = VSUB(TL, T14);
231 T2K = VADD(TL, T14);
232 T1s = VSUB(T1p, T1r);
233 T1t = VADD(T1g, T1l);
234 T1u = VFNMS(LDK(KP707106781), T1t, T1s);
235 T2F = VFMA(LDK(KP707106781), T1t, T1s);
236 T3d = VSUB(T3b, T3c);
237 T3g = VSUB(T3e, T3f);
238 T3h = VADD(T3d, T3g);
239 T3C = VSUB(T3g, T3d);
240 T1b = VSUB(T18, T1a);
241 T1m = VSUB(T1g, T1l);
242 T1n = VFNMS(LDK(KP707106781), T1m, T1b);
243 T2E = VFMA(LDK(KP707106781), T1m, T1b);
244 T28 = VFMA(LDK(KP414213562), TU, T13);
245 T29 = VFNMS(LDK(KP414213562), TB, TK);
246 T2a = VSUB(T28, T29);
247 T2z = VADD(T29, T28);
/*
 * Output stage 1: stores Rp[WS(rs, 4)], 12, 6, 10, 8, 0 and
 * Rm[WS(rs, 3)], 11, 5, 9, 15, 7.  Every stored value is scaled by
 * KP500000000; values written to Rm are additionally wrapped in VCONJ.
 */
248 {
249 V T4o, T4u, T4r, T4v, T4k, T4n, T4p, T4q, T4s, T4x, T4t, T4w, T3y, T3K, T3H;
250 V T3L, T3i, T3x, T3D, T3G, T3I, T3N, T3J, T3M, T46, T4g, T4d, T4h, T42, T45;
251 V T49, T4c, T4e, T4j, T4f, T4i;
252 T4k = VSUB(T40, T41);
253 T4n = VADD(T4l, T4m);
254 T4o = VFMA(LDK(KP707106781), T4n, T4k);
255 T4u = VFNMS(LDK(KP707106781), T4n, T4k);
256 T4p = VSUB(T44, T43);
257 T4q = VSUB(T4m, T4l);
258 T4r = VFMA(LDK(KP707106781), T4q, T4p);
259 T4v = VFNMS(LDK(KP707106781), T4q, T4p);
260 T4s = VCONJ(VMUL(LDK(KP500000000), VFNMSI(T4r, T4o)));
261 ST(&(Rm[WS(rs, 3)]), T4s, -ms, &(Rm[WS(rs, 1)]));
262 T4x = VCONJ(VMUL(LDK(KP500000000), VFMAI(T4v, T4u)));
263 ST(&(Rm[WS(rs, 11)]), T4x, -ms, &(Rm[WS(rs, 1)]));
264 T4t = VMUL(LDK(KP500000000), VFMAI(T4r, T4o));
265 ST(&(Rp[WS(rs, 4)]), T4t, ms, &(Rp[0]));
266 T4w = VMUL(LDK(KP500000000), VFNMSI(T4v, T4u));
267 ST(&(Rp[WS(rs, 12)]), T4w, ms, &(Rp[0]));
268 T3i = VFNMS(LDK(KP707106781), T3h, T3a);
269 T3x = VSUB(T3p, T3w);
270 T3y = VFMA(LDK(KP923879532), T3x, T3i);
271 T3K = VFNMS(LDK(KP923879532), T3x, T3i);
272 T3D = VFNMS(LDK(KP707106781), T3C, T3B);
273 T3G = VSUB(T3E, T3F);
274 T3H = VFNMS(LDK(KP923879532), T3G, T3D);
275 T3L = VFMA(LDK(KP923879532), T3G, T3D);
276 T3I = VMUL(LDK(KP500000000), VFNMSI(T3H, T3y));
277 ST(&(Rp[WS(rs, 6)]), T3I, ms, &(Rp[0]));
278 T3N = VMUL(LDK(KP500000000), VFMAI(T3L, T3K));
279 ST(&(Rp[WS(rs, 10)]), T3N, ms, &(Rp[0]));
280 T3J = VCONJ(VMUL(LDK(KP500000000), VFMAI(T3H, T3y)));
281 ST(&(Rm[WS(rs, 5)]), T3J, -ms, &(Rm[WS(rs, 1)]));
282 T3M = VCONJ(VMUL(LDK(KP500000000), VFNMSI(T3L, T3K)));
283 ST(&(Rm[WS(rs, 9)]), T3M, -ms, &(Rm[WS(rs, 1)]));
284 T42 = VADD(T40, T41);
285 T45 = VADD(T43, T44);
286 T46 = VSUB(T42, T45);
287 T4g = VADD(T42, T45);
288 T49 = VADD(T47, T48);
289 T4c = VADD(T4a, T4b);
290 T4d = VSUB(T49, T4c);
291 T4h = VADD(T49, T4c);
292 T4e = VMUL(LDK(KP500000000), VFMAI(T4d, T46));
293 ST(&(Rp[WS(rs, 8)]), T4e, ms, &(Rp[0]));
294 T4j = VCONJ(VMUL(LDK(KP500000000), VADD(T4h, T4g)));
295 ST(&(Rm[WS(rs, 15)]), T4j, -ms, &(Rm[WS(rs, 1)]));
296 T4f = VCONJ(VMUL(LDK(KP500000000), VFNMSI(T4d, T46)));
297 ST(&(Rm[WS(rs, 7)]), T4f, -ms, &(Rm[WS(rs, 1)]));
298 T4i = VMUL(LDK(KP500000000), VSUB(T4g, T4h));
299 ST(&(Rp[0]), T4i, ms, &(Rp[0]));
300 }
/* Output stage 2: stores Rm[WS(rs, 1)], 13 and Rp[WS(rs, 2)], 14. */
301 T3O = VFMA(LDK(KP707106781), T3h, T3a);
302 T3P = VADD(T3F, T3E);
303 T3Q = VFMA(LDK(KP923879532), T3P, T3O);
304 T3W = VFNMS(LDK(KP923879532), T3P, T3O);
305 T3R = VFMA(LDK(KP707106781), T3C, T3B);
306 T3S = VADD(T3w, T3p);
307 T3T = VFMA(LDK(KP923879532), T3S, T3R);
308 T3X = VFNMS(LDK(KP923879532), T3S, T3R);
309 T3U = VCONJ(VMUL(LDK(KP500000000), VFNMSI(T3T, T3Q)));
310 ST(&(Rm[WS(rs, 1)]), T3U, -ms, &(Rm[WS(rs, 1)]));
311 T3Z = VCONJ(VMUL(LDK(KP500000000), VFMAI(T3X, T3W)));
312 ST(&(Rm[WS(rs, 13)]), T3Z, -ms, &(Rm[WS(rs, 1)]));
313 T3V = VMUL(LDK(KP500000000), VFMAI(T3T, T3Q));
314 ST(&(Rp[WS(rs, 2)]), T3V, ms, &(Rp[0]));
315 T3Y = VMUL(LDK(KP500000000), VFNMSI(T3X, T3W));
316 ST(&(Rp[WS(rs, 14)]), T3Y, ms, &(Rp[0]));
/*
 * Output stage 3: stores the remaining slots -- every odd-indexed Rp offset
 * and every even-indexed Rm offset -- using the KP198912367 / KP980785280 and
 * KP668178637 / KP831469612 constant pairs.
 */
317 {
318 V T2I, T35, T2S, T31, T2P, T34, T2T, T2Y, T2A, T2Z, T2H, T30, T2D, T2G, T2L;
319 V T2W, T2O, T2X, T2M, T2N, T2Q, T36, T37, T2R, T2U, T32, T33, T2V, T20, T2v;
320 V T2i, T2r, T2f, T2u, T2j, T2o, T16, T2p, T1Z, T2q, T1v, T1Y, T2b, T2m, T2e;
321 V T2n, T2c, T2d, T2g, T2w, T2x, T2h, T2k, T2s, T2t, T2l;
322 T2A = VFNMS(LDK(KP923879532), T2z, T2y);
323 T2Z = VFMA(LDK(KP923879532), T2K, T2J);
324 T2D = VFMA(LDK(KP198912367), T2C, T2B);
325 T2G = VFNMS(LDK(KP198912367), T2F, T2E);
326 T2H = VSUB(T2D, T2G);
327 T30 = VADD(T2D, T2G);
328 T2I = VFMA(LDK(KP980785280), T2H, T2A);
329 T35 = VFNMS(LDK(KP980785280), T30, T2Z);
330 T2S = VFNMS(LDK(KP980785280), T2H, T2A);
331 T31 = VFMA(LDK(KP980785280), T30, T2Z);
332 T2L = VFNMS(LDK(KP923879532), T2K, T2J);
333 T2W = VFMA(LDK(KP923879532), T2z, T2y);
334 T2M = VFMA(LDK(KP198912367), T2E, T2F);
335 T2N = VFNMS(LDK(KP198912367), T2B, T2C);
336 T2O = VSUB(T2M, T2N);
337 T2X = VADD(T2N, T2M);
338 T2P = VFMA(LDK(KP980785280), T2O, T2L);
339 T34 = VFNMS(LDK(KP980785280), T2X, T2W);
340 T2T = VFNMS(LDK(KP980785280), T2O, T2L);
341 T2Y = VFMA(LDK(KP980785280), T2X, T2W);
342 T2Q = VCONJ(VMUL(LDK(KP500000000), VFNMSI(T2P, T2I)));
343 ST(&(Rm[WS(rs, 6)]), T2Q, -ms, &(Rm[0]));
344 T36 = VCONJ(VMUL(LDK(KP500000000), VFNMSI(T35, T34)));
345 ST(&(Rm[WS(rs, 14)]), T36, -ms, &(Rm[0]));
346 T37 = VMUL(LDK(KP500000000), VFMAI(T35, T34));
347 ST(&(Rp[WS(rs, 15)]), T37, ms, &(Rp[WS(rs, 1)]));
348 T2R = VMUL(LDK(KP500000000), VFMAI(T2P, T2I));
349 ST(&(Rp[WS(rs, 7)]), T2R, ms, &(Rp[WS(rs, 1)]));
350 T2U = VMUL(LDK(KP500000000), VFNMSI(T2T, T2S));
351 ST(&(Rp[WS(rs, 9)]), T2U, ms, &(Rp[WS(rs, 1)]));
352 T32 = VMUL(LDK(KP500000000), VFNMSI(T31, T2Y));
353 ST(&(Rp[WS(rs, 1)]), T32, ms, &(Rp[WS(rs, 1)]));
354 T33 = VCONJ(VMUL(LDK(KP500000000), VFMAI(T31, T2Y)));
355 ST(&(Rm[0]), T33, -ms, &(Rm[0]));
356 T2V = VCONJ(VMUL(LDK(KP500000000), VFMAI(T2T, T2S)));
357 ST(&(Rm[WS(rs, 8)]), T2V, -ms, &(Rm[0]));
358 T16 = VFNMS(LDK(KP923879532), T15, Ts);
359 T2p = VFMA(LDK(KP923879532), T2a, T27);
360 T1v = VFMA(LDK(KP668178637), T1u, T1n);
361 T1Y = VFNMS(LDK(KP668178637), T1X, T1Q);
362 T1Z = VSUB(T1v, T1Y);
363 T2q = VADD(T1Y, T1v);
364 T20 = VFMA(LDK(KP831469612), T1Z, T16);
365 T2v = VFNMS(LDK(KP831469612), T2q, T2p);
366 T2i = VFNMS(LDK(KP831469612), T1Z, T16);
367 T2r = VFMA(LDK(KP831469612), T2q, T2p);
368 T2b = VFNMS(LDK(KP923879532), T2a, T27);
369 T2m = VFMA(LDK(KP923879532), T15, Ts);
370 T2c = VFNMS(LDK(KP668178637), T1n, T1u);
371 T2d = VFMA(LDK(KP668178637), T1Q, T1X);
372 T2e = VSUB(T2c, T2d);
373 T2n = VADD(T2d, T2c);
374 T2f = VFNMS(LDK(KP831469612), T2e, T2b);
375 T2u = VFNMS(LDK(KP831469612), T2n, T2m);
376 T2j = VFMA(LDK(KP831469612), T2e, T2b);
377 T2o = VFMA(LDK(KP831469612), T2n, T2m);
378 T2g = VMUL(LDK(KP500000000), VFNMSI(T2f, T20));
379 ST(&(Rp[WS(rs, 5)]), T2g, ms, &(Rp[WS(rs, 1)]));
380 T2w = VMUL(LDK(KP500000000), VFNMSI(T2v, T2u));
381 ST(&(Rp[WS(rs, 13)]), T2w, ms, &(Rp[WS(rs, 1)]));
382 T2x = VCONJ(VMUL(LDK(KP500000000), VFMAI(T2v, T2u)));
383 ST(&(Rm[WS(rs, 12)]), T2x, -ms, &(Rm[0]));
384 T2h = VCONJ(VMUL(LDK(KP500000000), VFMAI(T2f, T20)));
385 ST(&(Rm[WS(rs, 4)]), T2h, -ms, &(Rm[0]));
386 T2k = VMUL(LDK(KP500000000), VFMAI(T2j, T2i));
387 ST(&(Rp[WS(rs, 11)]), T2k, ms, &(Rp[WS(rs, 1)]));
388 T2s = VMUL(LDK(KP500000000), VFMAI(T2r, T2o));
389 ST(&(Rp[WS(rs, 3)]), T2s, ms, &(Rp[WS(rs, 1)]));
390 T2t = VCONJ(VMUL(LDK(KP500000000), VFNMSI(T2r, T2o)));
391 ST(&(Rm[WS(rs, 2)]), T2t, -ms, &(Rm[0]));
392 T2l = VCONJ(VMUL(LDK(KP500000000), VFNMSI(T2j, T2i)));
393 ST(&(Rm[WS(rs, 10)]), T2l, -ms, &(Rm[0]));
394 }
395 }
396 }
/* NOTE(review): VLEAVE() is defined by the SIMD support headers; presumably per-ISA cleanup -- confirm. */
397 VLEAVE();
398 }
399
/*
 * Twiddle instruction table for the FMA variant: VTW(1, k) for k = 1 .. 31,
 * closed by a {TW_NEXT, VL, 0} entry.  The 31 VTW entries line up with the 31
 * LDW twiddle loads per iteration in hc2cfdftv_32 (W[0] .. W[TWVL * 60]).
 * NOTE(review): the expansion of VTW is defined in the SIMD headers, not here.
 */
400 static const tw_instr twinstr[] = {
401 VTW(1, 1),
402 VTW(1, 2),
403 VTW(1, 3),
404 VTW(1, 4),
405 VTW(1, 5),
406 VTW(1, 6),
407 VTW(1, 7),
408 VTW(1, 8),
409 VTW(1, 9),
410 VTW(1, 10),
411 VTW(1, 11),
412 VTW(1, 12),
413 VTW(1, 13),
414 VTW(1, 14),
415 VTW(1, 15),
416 VTW(1, 16),
417 VTW(1, 17),
418 VTW(1, 18),
419 VTW(1, 19),
420 VTW(1, 20),
421 VTW(1, 21),
422 VTW(1, 22),
423 VTW(1, 23),
424 VTW(1, 24),
425 VTW(1, 25),
426 VTW(1, 26),
427 VTW(1, 27),
428 VTW(1, 28),
429 VTW(1, 29),
430 VTW(1, 30),
431 VTW(1, 31),
432 {TW_NEXT, VL, 0}
433 };
434
/*
 * Codelet descriptor for the FMA variant.  32 matches the "-n 32" on the
 * generator command line, and {119, 94, 130, 0} repeats the addition /
 * multiplication / fused multiply-add counts quoted in this variant's header
 * comment.  NOTE(review): hc2c_desc field meanings are declared elsewhere --
 * confirm against its definition.
 */
435 static const hc2c_desc desc = { 32, XSIMD_STRING("hc2cfdftv_32"), twinstr, &GENUS, {119, 94, 130, 0} };
436
/*
 * Registration entry point (FMA variant): hands the hc2cfdftv_32 kernel and
 * its descriptor `desc` to X(khc2c_register) for planner `p`, with
 * HC2C_VIA_DFT as the final argument.
 */
437 void XSIMD(codelet_hc2cfdftv_32) (planner *p) {
438 X(khc2c_register) (p, hc2cfdftv_32, &desc, HC2C_VIA_DFT);
439 }
440 #else
441
442 /* Generated by: ../../../genfft/gen_hc2cdft_c.native -simd -compact -variables 4 -pipeline-latency 8 -trivial-stores -variables 32 -no-generate-bytw -n 32 -dit -name hc2cfdftv_32 -include rdft/simd/hc2cfv.h */
443
444 /*
445 * This function contains 249 FP additions, 133 FP multiplications,
446 * (or, 233 additions, 117 multiplications, 16 fused multiply/add),
447 * 130 stack variables, 9 constants, and 64 memory accesses
448 */
449 #include "rdft/simd/hc2cfv.h"
450
/*
 * hc2cfdftv_32, non-FMA variant (the #else branch, used when neither
 * ARCH_PREFERS_FMA nor ISA_EXTENSION_PREFERS_FMA is defined).
 * Machine-generated by genfft with "-n 32 -dit" (see the command line above);
 * do not hand-edit the arithmetic.
 *
 * Parameters, as used by the visible code:
 *   Rp, Rm  read and then overwritten in place at offsets WS(rs, 0..15).
 *   Ip, Im  stepped in the loop header but never dereferenced in this body.
 *   W       twiddle table; first offset by (mb - 1) * ((TWVL / VL) * 62),
 *           then advanced by TWVL * 62 per iteration.
 *   rs      stride handed to WS() to address the 16 elements of each array.
 *   mb, me  half-open range [mb, me) for the loop counter m (step VL).
 *   ms      per-iteration pointer step: Rp/Ip move by +VL * ms,
 *           Rm/Im move by -VL * ms.
 *
 * NOTE(review): the exact semantics of LD/ST/LDW/LDK and the V* arithmetic
 * macros come from rdft/simd/hc2cfv.h and the SIMD support headers, which are
 * not visible here -- confirm there before relying on the notes below.
 */
451 static void hc2cfdftv_32(R *Rp, R *Ip, R *Rm, R *Im, const R *W, stride rs, INT mb, INT me, INT ms)
452 {
/* Named constants; each KPxxxxxxxxx name spells the leading digits of its value. */
453 DVK(KP555570233, +0.555570233019602224742830813948532874374937191);
454 DVK(KP831469612, +0.831469612302545237078788377617905756738560812);
455 DVK(KP195090322, +0.195090322016128267848284868477022240927691618);
456 DVK(KP980785280, +0.980785280403230449126182236134239036973933731);
457 DVK(KP382683432, +0.382683432365089771728459984030398866761344562);
458 DVK(KP923879532, +0.923879532511286756128183189396788286822416626);
459 DVK(KP707106781, +0.707106781186547524400844362104849039284835938);
460 DVK(KP353553390, +0.353553390593273762200422181052424519642417969);
461 DVK(KP500000000, +0.500000000000000000000000000000000000000000000);
462 {
463 INT m;
/* Each iteration consumes VL values of m and one TWVL * 62 slice of W. */
464 for (m = mb, W = W + ((mb - 1) * ((TWVL / VL) * 62)); m < me; m = m + VL, Rp = Rp + (VL * ms), Ip = Ip + (VL * ms), Rm = Rm - (VL * ms), Im = Im - (VL * ms), W = W + (TWVL * 62), MAKE_VOLATILE_STRIDE(128, rs)) {
465 V Ta, T2m, Tx, T2h, T3R, T4h, T3q, T4g, T3B, T4n, T3E, T4o, T1B, T2S, T1O;
466 V T2R, TV, T2p, T1i, T2o, T3L, T4q, T3I, T4r, T3w, T4k, T3t, T4j, T26, T2V;
467 V T2d, T2U;
/*
 * Even-indexed inputs: WS(rs, 0), 4, 12, 8, 2, 14, 6, 10.
 * Each Rm load is passed through VCONJ, then added to and subtracted from the
 * matching Rp load; the sum is multiplied by a twiddle with VZMULJ and the
 * difference with VZMULIJ.  Index 0 keeps its sum (T4) untwiddled.
 */
468 {
469 V T4, T1m, T1H, T2j, T1M, T2l, T9, T1o, Tf, T1r, Tq, T1w, Tv, T1y, Tk;
470 V T1t, Tl, Tw, T3P, T3Q, T3o, T3p, T3z, T3A, T3C, T3D, T1p, T1N, T1A, T1C;
471 V T1u, T1z;
472 {
473 V T1, T3, T2, T1l, T1G, T1F, T1E, T1D, T2i, T1L, T1K, T1J, T1I, T2k, T6;
474 V T8, T7, T5, T1n, Tc, Te, Td, Tb, T1q, Tn, Tp, To, Tm, T1v, Ts;
475 V Tu, Tt, Tr, T1x, Th, Tj, Ti, Tg, T1s;
476 T1 = LD(&(Rp[0]), ms, &(Rp[0]));
477 T2 = LD(&(Rm[0]), -ms, &(Rm[0]));
478 T3 = VCONJ(T2);
479 T4 = VADD(T1, T3);
480 T1l = LDW(&(W[0]));
481 T1m = VZMULIJ(T1l, VSUB(T3, T1));
482 T1G = LD(&(Rp[WS(rs, 4)]), ms, &(Rp[0]));
483 T1E = LD(&(Rm[WS(rs, 4)]), -ms, &(Rm[0]));
484 T1F = VCONJ(T1E);
485 T1D = LDW(&(W[TWVL * 16]));
486 T1H = VZMULIJ(T1D, VSUB(T1F, T1G));
487 T2i = LDW(&(W[TWVL * 14]));
488 T2j = VZMULJ(T2i, VADD(T1G, T1F));
489 T1L = LD(&(Rp[WS(rs, 12)]), ms, &(Rp[0]));
490 T1J = LD(&(Rm[WS(rs, 12)]), -ms, &(Rm[0]));
491 T1K = VCONJ(T1J);
492 T1I = LDW(&(W[TWVL * 48]));
493 T1M = VZMULIJ(T1I, VSUB(T1K, T1L));
494 T2k = LDW(&(W[TWVL * 46]));
495 T2l = VZMULJ(T2k, VADD(T1L, T1K));
496 T6 = LD(&(Rp[WS(rs, 8)]), ms, &(Rp[0]));
497 T7 = LD(&(Rm[WS(rs, 8)]), -ms, &(Rm[0]));
498 T8 = VCONJ(T7);
499 T5 = LDW(&(W[TWVL * 30]));
500 T9 = VZMULJ(T5, VADD(T6, T8));
501 T1n = LDW(&(W[TWVL * 32]));
502 T1o = VZMULIJ(T1n, VSUB(T8, T6));
503 Tc = LD(&(Rp[WS(rs, 2)]), ms, &(Rp[0]));
504 Td = LD(&(Rm[WS(rs, 2)]), -ms, &(Rm[0]));
505 Te = VCONJ(Td);
506 Tb = LDW(&(W[TWVL * 6]));
507 Tf = VZMULJ(Tb, VADD(Tc, Te));
508 T1q = LDW(&(W[TWVL * 8]));
509 T1r = VZMULIJ(T1q, VSUB(Te, Tc));
510 Tn = LD(&(Rp[WS(rs, 14)]), ms, &(Rp[0]));
511 To = LD(&(Rm[WS(rs, 14)]), -ms, &(Rm[0]));
512 Tp = VCONJ(To);
513 Tm = LDW(&(W[TWVL * 54]));
514 Tq = VZMULJ(Tm, VADD(Tn, Tp));
515 T1v = LDW(&(W[TWVL * 56]));
516 T1w = VZMULIJ(T1v, VSUB(Tp, Tn));
517 Ts = LD(&(Rp[WS(rs, 6)]), ms, &(Rp[0]));
518 Tt = LD(&(Rm[WS(rs, 6)]), -ms, &(Rm[0]));
519 Tu = VCONJ(Tt);
520 Tr = LDW(&(W[TWVL * 22]));
521 Tv = VZMULJ(Tr, VADD(Ts, Tu));
522 T1x = LDW(&(W[TWVL * 24]));
523 T1y = VZMULIJ(T1x, VSUB(Tu, Ts));
524 Th = LD(&(Rp[WS(rs, 10)]), ms, &(Rp[0]));
525 Ti = LD(&(Rm[WS(rs, 10)]), -ms, &(Rm[0]));
526 Tj = VCONJ(Ti);
527 Tg = LDW(&(W[TWVL * 38]));
528 Tk = VZMULJ(Tg, VADD(Th, Tj));
529 T1s = LDW(&(W[TWVL * 40]));
530 T1t = VZMULIJ(T1s, VSUB(Tj, Th));
531 }
/*
 * Sums and differences of the even-index products.  Some of them (Ta, Tx,
 * T3q) are already multiplied by KP500000000 or KP353553390 here rather than
 * at store time.
 */
532 Ta = VMUL(LDK(KP500000000), VSUB(T4, T9));
533 T2m = VSUB(T2j, T2l);
534 Tl = VSUB(Tf, Tk);
535 Tw = VSUB(Tq, Tv);
536 Tx = VMUL(LDK(KP353553390), VADD(Tl, Tw));
537 T2h = VMUL(LDK(KP707106781), VSUB(Tw, Tl));
538 T3P = VADD(Tq, Tv);
539 T3Q = VADD(Tf, Tk);
540 T3R = VSUB(T3P, T3Q);
541 T4h = VADD(T3Q, T3P);
542 T3o = VADD(T4, T9);
543 T3p = VADD(T2j, T2l);
544 T3q = VMUL(LDK(KP500000000), VSUB(T3o, T3p));
545 T4g = VADD(T3o, T3p);
546 T3z = VADD(T1m, T1o);
547 T3A = VADD(T1H, T1M);
548 T3B = VSUB(T3z, T3A);
549 T4n = VADD(T3z, T3A);
550 T3C = VADD(T1w, T1y);
551 T3D = VADD(T1r, T1t);
552 T3E = VSUB(T3C, T3D);
553 T4o = VADD(T3D, T3C);
554 T1p = VSUB(T1m, T1o);
555 T1N = VSUB(T1H, T1M);
556 T1u = VSUB(T1r, T1t);
557 T1z = VSUB(T1w, T1y);
558 T1A = VMUL(LDK(KP707106781), VADD(T1u, T1z));
559 T1C = VMUL(LDK(KP707106781), VSUB(T1z, T1u));
560 T1B = VADD(T1p, T1A);
561 T2S = VADD(T1N, T1C);
562 T1O = VSUB(T1C, T1N);
563 T2R = VSUB(T1p, T1A);
564 }
/*
 * Odd-indexed inputs: WS(rs, 1), 3, 11, 9, 5, 15, 7, 13.
 * Same load / conjugate / twiddle pattern as the even-indexed block above.
 */
565 {
566 V TD, T1R, T1b, T29, T1g, T2b, TI, T1T, TO, T1Y, T10, T22, T15, T24, TT;
567 V T1W, TJ, TU, T16, T1h, T3J, T3K, T3G, T3H, T3u, T3v, T3r, T3s, T25, T2c;
568 V T20, T27, T1U, T1Z;
569 {
570 V TA, TC, TB, Tz, T1Q, T18, T1a, T19, T17, T28, T1d, T1f, T1e, T1c, T2a;
571 V TF, TH, TG, TE, T1S, TL, TN, TM, TK, T1X, TX, TZ, TY, TW, T21;
572 V T12, T14, T13, T11, T23, TQ, TS, TR, TP, T1V;
573 TA = LD(&(Rp[WS(rs, 1)]), ms, &(Rp[WS(rs, 1)]));
574 TB = LD(&(Rm[WS(rs, 1)]), -ms, &(Rm[WS(rs, 1)]));
575 TC = VCONJ(TB);
576 Tz = LDW(&(W[TWVL * 2]));
577 TD = VZMULJ(Tz, VADD(TA, TC));
578 T1Q = LDW(&(W[TWVL * 4]));
579 T1R = VZMULIJ(T1Q, VSUB(TC, TA));
580 T18 = LD(&(Rp[WS(rs, 3)]), ms, &(Rp[WS(rs, 1)]));
581 T19 = LD(&(Rm[WS(rs, 3)]), -ms, &(Rm[WS(rs, 1)]));
582 T1a = VCONJ(T19);
583 T17 = LDW(&(W[TWVL * 10]));
584 T1b = VZMULJ(T17, VADD(T18, T1a));
585 T28 = LDW(&(W[TWVL * 12]));
586 T29 = VZMULIJ(T28, VSUB(T1a, T18));
587 T1d = LD(&(Rp[WS(rs, 11)]), ms, &(Rp[WS(rs, 1)]));
588 T1e = LD(&(Rm[WS(rs, 11)]), -ms, &(Rm[WS(rs, 1)]));
589 T1f = VCONJ(T1e);
590 T1c = LDW(&(W[TWVL * 42]));
591 T1g = VZMULJ(T1c, VADD(T1d, T1f));
592 T2a = LDW(&(W[TWVL * 44]));
593 T2b = VZMULIJ(T2a, VSUB(T1f, T1d));
594 TF = LD(&(Rp[WS(rs, 9)]), ms, &(Rp[WS(rs, 1)]));
595 TG = LD(&(Rm[WS(rs, 9)]), -ms, &(Rm[WS(rs, 1)]));
596 TH = VCONJ(TG);
597 TE = LDW(&(W[TWVL * 34]));
598 TI = VZMULJ(TE, VADD(TF, TH));
599 T1S = LDW(&(W[TWVL * 36]));
600 T1T = VZMULIJ(T1S, VSUB(TH, TF));
601 TL = LD(&(Rp[WS(rs, 5)]), ms, &(Rp[WS(rs, 1)]));
602 TM = LD(&(Rm[WS(rs, 5)]), -ms, &(Rm[WS(rs, 1)]));
603 TN = VCONJ(TM);
604 TK = LDW(&(W[TWVL * 18]));
605 TO = VZMULJ(TK, VADD(TL, TN));
606 T1X = LDW(&(W[TWVL * 20]));
607 T1Y = VZMULIJ(T1X, VSUB(TN, TL));
608 TX = LD(&(Rp[WS(rs, 15)]), ms, &(Rp[WS(rs, 1)]));
609 TY = LD(&(Rm[WS(rs, 15)]), -ms, &(Rm[WS(rs, 1)]));
610 TZ = VCONJ(TY);
611 TW = LDW(&(W[TWVL * 58]));
612 T10 = VZMULJ(TW, VADD(TX, TZ));
613 T21 = LDW(&(W[TWVL * 60]));
614 T22 = VZMULIJ(T21, VSUB(TZ, TX));
615 T12 = LD(&(Rp[WS(rs, 7)]), ms, &(Rp[WS(rs, 1)]));
616 T13 = LD(&(Rm[WS(rs, 7)]), -ms, &(Rm[WS(rs, 1)]));
617 T14 = VCONJ(T13);
618 T11 = LDW(&(W[TWVL * 26]));
619 T15 = VZMULJ(T11, VADD(T12, T14));
620 T23 = LDW(&(W[TWVL * 28]));
621 T24 = VZMULIJ(T23, VSUB(T14, T12));
622 TQ = LD(&(Rp[WS(rs, 13)]), ms, &(Rp[WS(rs, 1)]));
623 TR = LD(&(Rm[WS(rs, 13)]), -ms, &(Rm[WS(rs, 1)]));
624 TS = VCONJ(TR);
625 TP = LDW(&(W[TWVL * 50]));
626 TT = VZMULJ(TP, VADD(TQ, TS));
627 T1V = LDW(&(W[TWVL * 52]));
628 T1W = VZMULIJ(T1V, VSUB(TS, TQ));
629 }
/* Sums, differences and KP923879532 / KP382683432 combinations of the odd-index products. */
630 TJ = VSUB(TD, TI);
631 TU = VSUB(TO, TT);
632 TV = VFNMS(LDK(KP382683432), TU, VMUL(LDK(KP923879532), TJ));
633 T2p = VFMA(LDK(KP382683432), TJ, VMUL(LDK(KP923879532), TU));
634 T16 = VSUB(T10, T15);
635 T1h = VSUB(T1b, T1g);
636 T1i = VFMA(LDK(KP923879532), T16, VMUL(LDK(KP382683432), T1h));
637 T2o = VFNMS(LDK(KP923879532), T1h, VMUL(LDK(KP382683432), T16));
638 T3J = VADD(T1Y, T1W);
639 T3K = VADD(T1R, T1T);
640 T3L = VSUB(T3J, T3K);
641 T4q = VADD(T3K, T3J);
642 T3G = VADD(T22, T24);
643 T3H = VADD(T29, T2b);
644 T3I = VSUB(T3G, T3H);
645 T4r = VADD(T3G, T3H);
646 T3u = VADD(T10, T15);
647 T3v = VADD(T1b, T1g);
648 T3w = VSUB(T3u, T3v);
649 T4k = VADD(T3u, T3v);
650 T3r = VADD(TD, TI);
651 T3s = VADD(TO, TT);
652 T3t = VSUB(T3r, T3s);
653 T4j = VADD(T3r, T3s);
654 T25 = VSUB(T22, T24);
655 T2c = VSUB(T29, T2b);
656 T1U = VSUB(T1R, T1T);
657 T1Z = VSUB(T1W, T1Y);
658 T20 = VMUL(LDK(KP707106781), VADD(T1U, T1Z));
659 T27 = VMUL(LDK(KP707106781), VSUB(T1Z, T1U));
660 T26 = VADD(T20, T25);
661 T2V = VADD(T27, T2c);
662 T2d = VSUB(T27, T2c);
663 T2U = VSUB(T25, T20);
664 }
/*
 * Output stage.  The statements directly inside this block store every
 * even-indexed Rp offset and every odd-indexed Rm offset; the two nested
 * blocks at the end store the odd-indexed Rp and even-indexed Rm offsets.
 * Every value written to Rm is wrapped in VCONJ.
 */
665 {
666 V T4m, T4w, T4t, T4x, T4i, T4l, T4p, T4s, T4u, T4z, T4v, T4y, T4E, T4L, T4H;
667 V T4K, T4A, T4F, T4D, T4G, T4B, T4C, T4I, T4N, T4J, T4M, T3O, T4c, T4d, T3X;
668 V T40, T46, T49, T41, T3y, T47, T3T, T45, T3N, T44, T3W, T48, T3x, T3S, T3F;
669 V T3M, T3U, T3V, T3Y, T4e, T4f, T3Z, T42, T4a, T4b, T43;
670 T4i = VADD(T4g, T4h);
671 T4l = VADD(T4j, T4k);
672 T4m = VADD(T4i, T4l);
673 T4w = VSUB(T4i, T4l);
674 T4p = VADD(T4n, T4o);
675 T4s = VADD(T4q, T4r);
676 T4t = VADD(T4p, T4s);
677 T4x = VBYI(VSUB(T4s, T4p));
678 T4u = VCONJ(VMUL(LDK(KP500000000), VSUB(T4m, T4t)));
679 ST(&(Rm[WS(rs, 15)]), T4u, -ms, &(Rm[WS(rs, 1)]));
680 T4z = VMUL(LDK(KP500000000), VADD(T4w, T4x));
681 ST(&(Rp[WS(rs, 8)]), T4z, ms, &(Rp[0]));
682 T4v = VMUL(LDK(KP500000000), VADD(T4m, T4t));
683 ST(&(Rp[0]), T4v, ms, &(Rp[0]));
684 T4y = VCONJ(VMUL(LDK(KP500000000), VSUB(T4w, T4x)));
685 ST(&(Rm[WS(rs, 7)]), T4y, -ms, &(Rm[WS(rs, 1)]));
686 T4A = VMUL(LDK(KP500000000), VSUB(T4g, T4h));
687 T4F = VSUB(T4k, T4j);
688 T4B = VSUB(T4n, T4o);
689 T4C = VSUB(T4r, T4q);
690 T4D = VMUL(LDK(KP353553390), VADD(T4B, T4C));
691 T4G = VMUL(LDK(KP707106781), VSUB(T4C, T4B));
692 T4E = VADD(T4A, T4D);
693 T4L = VMUL(LDK(KP500000000), VBYI(VSUB(T4G, T4F)));
694 T4H = VMUL(LDK(KP500000000), VBYI(VADD(T4F, T4G)));
695 T4K = VSUB(T4A, T4D);
696 T4I = VCONJ(VSUB(T4E, T4H));
697 ST(&(Rm[WS(rs, 3)]), T4I, -ms, &(Rm[WS(rs, 1)]));
698 T4N = VADD(T4K, T4L);
699 ST(&(Rp[WS(rs, 12)]), T4N, ms, &(Rp[0]));
700 T4J = VADD(T4E, T4H);
701 ST(&(Rp[WS(rs, 4)]), T4J, ms, &(Rp[0]));
702 T4M = VCONJ(VSUB(T4K, T4L));
703 ST(&(Rm[WS(rs, 11)]), T4M, -ms, &(Rm[WS(rs, 1)]));
704 T3x = VMUL(LDK(KP353553390), VADD(T3t, T3w));
705 T3y = VADD(T3q, T3x);
706 T47 = VSUB(T3q, T3x);
707 T3S = VMUL(LDK(KP707106781), VSUB(T3w, T3t));
708 T3T = VADD(T3R, T3S);
709 T45 = VSUB(T3S, T3R);
710 T3F = VFMA(LDK(KP923879532), T3B, VMUL(LDK(KP382683432), T3E));
711 T3M = VFNMS(LDK(KP382683432), T3L, VMUL(LDK(KP923879532), T3I));
712 T3N = VMUL(LDK(KP500000000), VADD(T3F, T3M));
713 T44 = VSUB(T3M, T3F);
714 T3U = VFNMS(LDK(KP382683432), T3B, VMUL(LDK(KP923879532), T3E));
715 T3V = VFMA(LDK(KP923879532), T3L, VMUL(LDK(KP382683432), T3I));
716 T3W = VADD(T3U, T3V);
717 T48 = VMUL(LDK(KP500000000), VSUB(T3V, T3U));
718 T3O = VADD(T3y, T3N);
719 T4c = VMUL(LDK(KP500000000), VBYI(VADD(T45, T44)));
720 T4d = VADD(T47, T48);
721 T3X = VMUL(LDK(KP500000000), VBYI(VADD(T3T, T3W)));
722 T40 = VSUB(T3y, T3N);
723 T46 = VMUL(LDK(KP500000000), VBYI(VSUB(T44, T45)));
724 T49 = VSUB(T47, T48);
725 T41 = VMUL(LDK(KP500000000), VBYI(VSUB(T3W, T3T)));
726 T3Y = VCONJ(VSUB(T3O, T3X));
727 ST(&(Rm[WS(rs, 1)]), T3Y, -ms, &(Rm[WS(rs, 1)]));
728 T4e = VADD(T4c, T4d);
729 ST(&(Rp[WS(rs, 6)]), T4e, ms, &(Rp[0]));
730 T4f = VCONJ(VSUB(T4d, T4c));
731 ST(&(Rm[WS(rs, 5)]), T4f, -ms, &(Rm[WS(rs, 1)]));
732 T3Z = VADD(T3O, T3X);
733 ST(&(Rp[WS(rs, 2)]), T3Z, ms, &(Rp[0]));
734 T42 = VCONJ(VSUB(T40, T41));
735 ST(&(Rm[WS(rs, 13)]), T42, -ms, &(Rm[WS(rs, 1)]));
736 T4a = VADD(T46, T49);
737 ST(&(Rp[WS(rs, 10)]), T4a, ms, &(Rp[0]));
738 T4b = VCONJ(VSUB(T49, T46));
739 ST(&(Rm[WS(rs, 9)]), T4b, -ms, &(Rm[WS(rs, 1)]));
740 T43 = VADD(T40, T41);
741 ST(&(Rp[WS(rs, 14)]), T43, ms, &(Rp[0]));
/* Stores Rp[WS(rs, 7)], 1, 9, 15 and Rm[WS(rs, 0)], 6, 14, 8 (KP980785280 / KP195090322 pair). */
742 {
743 V T2g, T2K, T2L, T2v, T2y, T2E, T2H, T2z, T1k, T2F, T2u, T2G, T2f, T2C, T2r;
744 V T2D, Ty, T1j, T2s, T2t, T1P, T2e, T2n, T2q, T2w, T2M, T2N, T2x, T2A, T2I;
745 V T2J, T2B;
746 Ty = VADD(Ta, Tx);
747 T1j = VMUL(LDK(KP500000000), VADD(TV, T1i));
748 T1k = VADD(Ty, T1j);
749 T2F = VSUB(Ty, T1j);
750 T2s = VFNMS(LDK(KP195090322), T1B, VMUL(LDK(KP980785280), T1O));
751 T2t = VFMA(LDK(KP195090322), T26, VMUL(LDK(KP980785280), T2d));
752 T2u = VADD(T2s, T2t);
753 T2G = VMUL(LDK(KP500000000), VSUB(T2t, T2s));
754 T1P = VFMA(LDK(KP980785280), T1B, VMUL(LDK(KP195090322), T1O));
755 T2e = VFNMS(LDK(KP195090322), T2d, VMUL(LDK(KP980785280), T26));
756 T2f = VMUL(LDK(KP500000000), VADD(T1P, T2e));
757 T2C = VSUB(T2e, T1P);
758 T2n = VSUB(T2h, T2m);
759 T2q = VSUB(T2o, T2p);
760 T2r = VADD(T2n, T2q);
761 T2D = VSUB(T2q, T2n);
762 T2g = VADD(T1k, T2f);
763 T2K = VMUL(LDK(KP500000000), VBYI(VADD(T2D, T2C)));
764 T2L = VADD(T2F, T2G);
765 T2v = VMUL(LDK(KP500000000), VBYI(VADD(T2r, T2u)));
766 T2y = VSUB(T1k, T2f);
767 T2E = VMUL(LDK(KP500000000), VBYI(VSUB(T2C, T2D)));
768 T2H = VSUB(T2F, T2G);
769 T2z = VMUL(LDK(KP500000000), VBYI(VSUB(T2u, T2r)));
770 T2w = VCONJ(VSUB(T2g, T2v));
771 ST(&(Rm[0]), T2w, -ms, &(Rm[0]));
772 T2M = VADD(T2K, T2L);
773 ST(&(Rp[WS(rs, 7)]), T2M, ms, &(Rp[WS(rs, 1)]));
774 T2N = VCONJ(VSUB(T2L, T2K));
775 ST(&(Rm[WS(rs, 6)]), T2N, -ms, &(Rm[0]));
776 T2x = VADD(T2g, T2v);
777 ST(&(Rp[WS(rs, 1)]), T2x, ms, &(Rp[WS(rs, 1)]));
778 T2A = VCONJ(VSUB(T2y, T2z));
779 ST(&(Rm[WS(rs, 14)]), T2A, -ms, &(Rm[0]));
780 T2I = VADD(T2E, T2H);
781 ST(&(Rp[WS(rs, 9)]), T2I, ms, &(Rp[WS(rs, 1)]));
782 T2J = VCONJ(VSUB(T2H, T2E));
783 ST(&(Rm[WS(rs, 8)]), T2J, -ms, &(Rm[0]));
784 T2B = VADD(T2y, T2z);
785 ST(&(Rp[WS(rs, 15)]), T2B, ms, &(Rp[WS(rs, 1)]));
786 }
/* Stores Rp[WS(rs, 5)], 3, 11, 13 and Rm[WS(rs, 2)], 4, 12, 10 (KP831469612 / KP555570233 pair). */
787 {
788 V T2Y, T3k, T3l, T35, T38, T3e, T3h, T39, T2Q, T3f, T34, T3g, T2X, T3c, T31;
789 V T3d, T2O, T2P, T32, T33, T2T, T2W, T2Z, T30, T36, T3m, T3n, T37, T3a, T3i;
790 V T3j, T3b;
791 T2O = VSUB(Ta, Tx);
792 T2P = VMUL(LDK(KP500000000), VADD(T2p, T2o));
793 T2Q = VADD(T2O, T2P);
794 T3f = VSUB(T2O, T2P);
795 T32 = VFNMS(LDK(KP555570233), T2R, VMUL(LDK(KP831469612), T2S));
796 T33 = VFMA(LDK(KP555570233), T2U, VMUL(LDK(KP831469612), T2V));
797 T34 = VADD(T32, T33);
798 T3g = VMUL(LDK(KP500000000), VSUB(T33, T32));
799 T2T = VFMA(LDK(KP831469612), T2R, VMUL(LDK(KP555570233), T2S));
800 T2W = VFNMS(LDK(KP555570233), T2V, VMUL(LDK(KP831469612), T2U));
801 T2X = VMUL(LDK(KP500000000), VADD(T2T, T2W));
802 T3c = VSUB(T2W, T2T);
803 T2Z = VADD(T2m, T2h);
804 T30 = VSUB(T1i, TV);
805 T31 = VADD(T2Z, T30);
806 T3d = VSUB(T30, T2Z);
807 T2Y = VADD(T2Q, T2X);
808 T3k = VMUL(LDK(KP500000000), VBYI(VADD(T3d, T3c)));
809 T3l = VADD(T3f, T3g);
810 T35 = VMUL(LDK(KP500000000), VBYI(VADD(T31, T34)));
811 T38 = VSUB(T2Q, T2X);
812 T3e = VMUL(LDK(KP500000000), VBYI(VSUB(T3c, T3d)));
813 T3h = VSUB(T3f, T3g);
814 T39 = VMUL(LDK(KP500000000), VBYI(VSUB(T34, T31)));
815 T36 = VCONJ(VSUB(T2Y, T35));
816 ST(&(Rm[WS(rs, 2)]), T36, -ms, &(Rm[0]));
817 T3m = VADD(T3k, T3l);
818 ST(&(Rp[WS(rs, 5)]), T3m, ms, &(Rp[WS(rs, 1)]));
819 T3n = VCONJ(VSUB(T3l, T3k));
820 ST(&(Rm[WS(rs, 4)]), T3n, -ms, &(Rm[0]));
821 T37 = VADD(T2Y, T35);
822 ST(&(Rp[WS(rs, 3)]), T37, ms, &(Rp[WS(rs, 1)]));
823 T3a = VCONJ(VSUB(T38, T39));
824 ST(&(Rm[WS(rs, 12)]), T3a, -ms, &(Rm[0]));
825 T3i = VADD(T3e, T3h);
826 ST(&(Rp[WS(rs, 11)]), T3i, ms, &(Rp[WS(rs, 1)]));
827 T3j = VCONJ(VSUB(T3h, T3e));
828 ST(&(Rm[WS(rs, 10)]), T3j, -ms, &(Rm[0]));
829 T3b = VADD(T38, T39);
830 ST(&(Rp[WS(rs, 13)]), T3b, ms, &(Rp[WS(rs, 1)]));
831 }
832 }
833 }
834 }
/* NOTE(review): VLEAVE() is defined by the SIMD support headers; presumably per-ISA cleanup -- confirm. */
835 VLEAVE();
836 }
837
/*
 * Twiddle instruction table for the non-FMA variant: VTW(1, k) for
 * k = 1 .. 31, closed by a {TW_NEXT, VL, 0} entry.  The 31 VTW entries line
 * up with the 31 LDW twiddle loads per iteration in hc2cfdftv_32
 * (W[0] .. W[TWVL * 60]).
 * NOTE(review): the expansion of VTW is defined in the SIMD headers, not here.
 */
838 static const tw_instr twinstr[] = {
839 VTW(1, 1),
840 VTW(1, 2),
841 VTW(1, 3),
842 VTW(1, 4),
843 VTW(1, 5),
844 VTW(1, 6),
845 VTW(1, 7),
846 VTW(1, 8),
847 VTW(1, 9),
848 VTW(1, 10),
849 VTW(1, 11),
850 VTW(1, 12),
851 VTW(1, 13),
852 VTW(1, 14),
853 VTW(1, 15),
854 VTW(1, 16),
855 VTW(1, 17),
856 VTW(1, 18),
857 VTW(1, 19),
858 VTW(1, 20),
859 VTW(1, 21),
860 VTW(1, 22),
861 VTW(1, 23),
862 VTW(1, 24),
863 VTW(1, 25),
864 VTW(1, 26),
865 VTW(1, 27),
866 VTW(1, 28),
867 VTW(1, 29),
868 VTW(1, 30),
869 VTW(1, 31),
870 {TW_NEXT, VL, 0}
871 };
872
/*
 * Codelet descriptor for the non-FMA variant.  32 matches the "-n 32" on the
 * generator command line, and {233, 117, 16, 0} repeats the addition /
 * multiplication / fused multiply-add counts quoted in this variant's header
 * comment.  NOTE(review): hc2c_desc field meanings are declared elsewhere --
 * confirm against its definition.
 */
873 static const hc2c_desc desc = { 32, XSIMD_STRING("hc2cfdftv_32"), twinstr, &GENUS, {233, 117, 16, 0} };
874
/*
 * Registration entry point (non-FMA variant): hands the hc2cfdftv_32 kernel
 * and its descriptor `desc` to X(khc2c_register) for planner `p`, with
 * HC2C_VIA_DFT as the final argument.
 */
875 void XSIMD(codelet_hc2cfdftv_32) (planner *p) {
876 X(khc2c_register) (p, hc2cfdftv_32, &desc, HC2C_VIA_DFT);
877 }
878 #endif