annotate src/fftw-3.3.8/dft/simd/common/t2bv_32.c @ 83:ae30d91d2ffe

Replace these with versions built using an older toolset (so as to avoid ABI incompatibilities when linking on Ubuntu 14.04 for packaging purposes)
author Chris Cannam
date Fri, 07 Feb 2020 11:51:13 +0000
parents d0c2a83c1364
children
rev   line source
Chris@82 1 /*
Chris@82 2 * Copyright (c) 2003, 2007-14 Matteo Frigo
Chris@82 3 * Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
Chris@82 4 *
Chris@82 5 * This program is free software; you can redistribute it and/or modify
Chris@82 6 * it under the terms of the GNU General Public License as published by
Chris@82 7 * the Free Software Foundation; either version 2 of the License, or
Chris@82 8 * (at your option) any later version.
Chris@82 9 *
Chris@82 10 * This program is distributed in the hope that it will be useful,
Chris@82 11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
Chris@82 12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
Chris@82 13 * GNU General Public License for more details.
Chris@82 14 *
Chris@82 15 * You should have received a copy of the GNU General Public License
Chris@82 16 * along with this program; if not, write to the Free Software
Chris@82 17 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
Chris@82 18 *
Chris@82 19 */
Chris@82 20
Chris@82 21 /* This file was automatically generated --- DO NOT EDIT */
Chris@82 22 /* Generated on Thu May 24 08:06:02 EDT 2018 */
Chris@82 23
Chris@82 24 #include "dft/codelet-dft.h"
Chris@82 25
Chris@82 26 #if defined(ARCH_PREFERS_FMA) || defined(ISA_EXTENSION_PREFERS_FMA)
Chris@82 27
Chris@82 28 /* Generated by: ../../../genfft/gen_twiddle_c.native -fma -simd -compact -variables 4 -pipeline-latency 8 -n 32 -name t2bv_32 -include dft/simd/t2b.h -sign 1 */
Chris@82 29
Chris@82 30 /*
Chris@82 31 * This function contains 217 FP additions, 160 FP multiplications,
Chris@82 32 * (or, 119 additions, 62 multiplications, 98 fused multiply/add),
Chris@82 33 * 59 stack variables, 7 constants, and 64 memory accesses
Chris@82 34 */
Chris@82 35 #include "dft/simd/t2b.h"
Chris@82 36
/*
 * t2bv_32 (FMA variant): one size-32 twiddle pass computed in place on the
 * data addressed through ii.
 *
 * ri      - not referenced anywhere in this body.
 * ii      - base of the data; every load and store goes through x = ii.
 * W       - twiddle table; offset by mb * ((TWVL / VL) * 62) before the loop
 *           and advanced by TWVL * 62 on every iteration.
 * rs      - stride between the 32 points, applied through WS(rs, k).
 * mb, me  - the loop runs m = mb, mb + VL, ... while m < me.
 * ms      - passed to every LD/ST; x advances by VL * ms per iteration.
 *
 * Each iteration loads x[WS(rs, 0..31)], multiplies every point except
 * index 0 by a twiddle through BYTW (point k uses W[TWVL * 2 * (k - 1)]),
 * combines the values with the VADD/VSUB/VFMA/VFNMS macros, and stores all
 * 32 results back to the same x[WS(rs, k)] slots via VFMAI/VFNMSI/VADD/VSUB.
 */
Chris@82 37 static void t2bv_32(R *ri, R *ii, const R *W, stride rs, INT mb, INT me, INT ms)
Chris@82 38 {
/*
 * Numeric constants used below. Their values equal, to the printed precision:
 *   KP831469612 = cos(3*pi/16)    KP668178637 = tan(3*pi/16)
 *   KP980785280 = cos(pi/16)      KP198912367 = tan(pi/16)
 *   KP923879532 = cos(pi/8)       KP414213562 = tan(pi/8) = sqrt(2) - 1
 *   KP707106781 = sqrt(1/2)
 */
Chris@82 39 DVK(KP831469612, +0.831469612302545237078788377617905756738560812);
Chris@82 40 DVK(KP668178637, +0.668178637919298919997757686523080761552472251);
Chris@82 41 DVK(KP980785280, +0.980785280403230449126182236134239036973933731);
Chris@82 42 DVK(KP198912367, +0.198912367379658006911597622644676228597850501);
Chris@82 43 DVK(KP923879532, +0.923879532511286756128183189396788286822416626);
Chris@82 44 DVK(KP707106781, +0.707106781186547524400844362104849039284835938);
Chris@82 45 DVK(KP414213562, +0.414213562373095048801688724209698078569671875);
Chris@82 46 {
Chris@82 47 INT m;
Chris@82 48 R *x;
Chris@82 49 x = ii;
/* The loop header also steps x and W and invokes MAKE_VOLATILE_STRIDE(32, rs) on every pass. */
Chris@82 50 for (m = mb, W = W + (mb * ((TWVL / VL) * 62)); m < me; m = m + VL, x = x + (VL * ms), W = W + (TWVL * 62), MAKE_VOLATILE_STRIDE(32, rs)) {
/* Temporaries that carry values from the load stage to the store stages. */
Chris@82 51 V T4, T1z, T2o, T32, Tf, T1A, T2r, T3f, TC, T1D, T2O, T34, Tr, T1C, T2L;
Chris@82 52 V T33, T1k, T20, T2F, T3b, T1r, T21, T2C, T3a, TV, T1X, T2y, T38, T12, T1Y;
Chris@82 53 V T2v, T37;
/* Inputs 0, 8, 16, 24 (point 0 is used without a twiddle). */
Chris@82 54 {
Chris@82 55 V T1, T1y, T3, T1w, T1x, T2, T1v, T2m, T2n;
Chris@82 56 T1 = LD(&(x[0]), ms, &(x[0]));
Chris@82 57 T1x = LD(&(x[WS(rs, 24)]), ms, &(x[0]));
Chris@82 58 T1y = BYTW(&(W[TWVL * 46]), T1x);
Chris@82 59 T2 = LD(&(x[WS(rs, 16)]), ms, &(x[0]));
Chris@82 60 T3 = BYTW(&(W[TWVL * 30]), T2);
Chris@82 61 T1v = LD(&(x[WS(rs, 8)]), ms, &(x[0]));
Chris@82 62 T1w = BYTW(&(W[TWVL * 14]), T1v);
Chris@82 63 T4 = VSUB(T1, T3);
Chris@82 64 T1z = VSUB(T1w, T1y);
Chris@82 65 T2m = VADD(T1, T3);
Chris@82 66 T2n = VADD(T1w, T1y);
Chris@82 67 T2o = VADD(T2m, T2n);
Chris@82 68 T32 = VSUB(T2m, T2n);
Chris@82 69 }
/* Inputs 4, 12, 20, 28. */
Chris@82 70 {
Chris@82 71 V T6, Td, T8, Tb;
Chris@82 72 {
Chris@82 73 V T5, Tc, T7, Ta;
Chris@82 74 T5 = LD(&(x[WS(rs, 4)]), ms, &(x[0]));
Chris@82 75 T6 = BYTW(&(W[TWVL * 6]), T5);
Chris@82 76 Tc = LD(&(x[WS(rs, 12)]), ms, &(x[0]));
Chris@82 77 Td = BYTW(&(W[TWVL * 22]), Tc);
Chris@82 78 T7 = LD(&(x[WS(rs, 20)]), ms, &(x[0]));
Chris@82 79 T8 = BYTW(&(W[TWVL * 38]), T7);
Chris@82 80 Ta = LD(&(x[WS(rs, 28)]), ms, &(x[0]));
Chris@82 81 Tb = BYTW(&(W[TWVL * 54]), Ta);
Chris@82 82 }
Chris@82 83 {
Chris@82 84 V T9, Te, T2p, T2q;
Chris@82 85 T9 = VSUB(T6, T8);
Chris@82 86 Te = VSUB(Tb, Td);
Chris@82 87 Tf = VADD(T9, Te);
Chris@82 88 T1A = VSUB(T9, Te);
Chris@82 89 T2p = VADD(T6, T8);
Chris@82 90 T2q = VADD(Tb, Td);
Chris@82 91 T2r = VADD(T2p, T2q);
Chris@82 92 T3f = VSUB(T2p, T2q);
Chris@82 93 }
Chris@82 94 }
/* Inputs 6, 14, 22, 30. */
Chris@82 95 {
Chris@82 96 V Tt, TA, Tv, Ty;
Chris@82 97 {
Chris@82 98 V Ts, Tz, Tu, Tx;
Chris@82 99 Ts = LD(&(x[WS(rs, 30)]), ms, &(x[0]));
Chris@82 100 Tt = BYTW(&(W[TWVL * 58]), Ts);
Chris@82 101 Tz = LD(&(x[WS(rs, 6)]), ms, &(x[0]));
Chris@82 102 TA = BYTW(&(W[TWVL * 10]), Tz);
Chris@82 103 Tu = LD(&(x[WS(rs, 14)]), ms, &(x[0]));
Chris@82 104 Tv = BYTW(&(W[TWVL * 26]), Tu);
Chris@82 105 Tx = LD(&(x[WS(rs, 22)]), ms, &(x[0]));
Chris@82 106 Ty = BYTW(&(W[TWVL * 42]), Tx);
Chris@82 107 }
Chris@82 108 {
Chris@82 109 V Tw, TB, T2M, T2N;
Chris@82 110 Tw = VSUB(Tt, Tv);
Chris@82 111 TB = VSUB(Ty, TA);
Chris@82 112 TC = VFNMS(LDK(KP414213562), TB, Tw);
Chris@82 113 T1D = VFMA(LDK(KP414213562), Tw, TB);
Chris@82 114 T2M = VADD(Tt, Tv);
Chris@82 115 T2N = VADD(TA, Ty);
Chris@82 116 T2O = VADD(T2M, T2N);
Chris@82 117 T34 = VSUB(T2M, T2N);
Chris@82 118 }
Chris@82 119 }
/* Inputs 2, 10, 18, 26. */
Chris@82 120 {
Chris@82 121 V Ti, Tp, Tk, Tn;
Chris@82 122 {
Chris@82 123 V Th, To, Tj, Tm;
Chris@82 124 Th = LD(&(x[WS(rs, 2)]), ms, &(x[0]));
Chris@82 125 Ti = BYTW(&(W[TWVL * 2]), Th);
Chris@82 126 To = LD(&(x[WS(rs, 26)]), ms, &(x[0]));
Chris@82 127 Tp = BYTW(&(W[TWVL * 50]), To);
Chris@82 128 Tj = LD(&(x[WS(rs, 18)]), ms, &(x[0]));
Chris@82 129 Tk = BYTW(&(W[TWVL * 34]), Tj);
Chris@82 130 Tm = LD(&(x[WS(rs, 10)]), ms, &(x[0]));
Chris@82 131 Tn = BYTW(&(W[TWVL * 18]), Tm);
Chris@82 132 }
Chris@82 133 {
Chris@82 134 V Tl, Tq, T2J, T2K;
Chris@82 135 Tl = VSUB(Ti, Tk);
Chris@82 136 Tq = VSUB(Tn, Tp);
Chris@82 137 Tr = VFNMS(LDK(KP414213562), Tq, Tl);
Chris@82 138 T1C = VFMA(LDK(KP414213562), Tl, Tq);
Chris@82 139 T2J = VADD(Ti, Tk);
Chris@82 140 T2K = VADD(Tn, Tp);
Chris@82 141 T2L = VADD(T2J, T2K);
Chris@82 142 T33 = VSUB(T2J, T2K);
Chris@82 143 }
Chris@82 144 }
/* Odd inputs 3, 7, 11, 15, 19, 23, 27, 31. */
Chris@82 145 {
Chris@82 146 V T15, T17, T1o, T1m, T1f, T1h, T1i, T1a, T1c, T1d;
Chris@82 147 {
Chris@82 148 V T14, T16, T1n, T1l;
Chris@82 149 T14 = LD(&(x[WS(rs, 31)]), ms, &(x[WS(rs, 1)]));
Chris@82 150 T15 = BYTW(&(W[TWVL * 60]), T14);
Chris@82 151 T16 = LD(&(x[WS(rs, 15)]), ms, &(x[WS(rs, 1)]));
Chris@82 152 T17 = BYTW(&(W[TWVL * 28]), T16);
Chris@82 153 T1n = LD(&(x[WS(rs, 7)]), ms, &(x[WS(rs, 1)]));
Chris@82 154 T1o = BYTW(&(W[TWVL * 12]), T1n);
Chris@82 155 T1l = LD(&(x[WS(rs, 23)]), ms, &(x[WS(rs, 1)]));
Chris@82 156 T1m = BYTW(&(W[TWVL * 44]), T1l);
Chris@82 157 {
Chris@82 158 V T1e, T1g, T19, T1b;
Chris@82 159 T1e = LD(&(x[WS(rs, 27)]), ms, &(x[WS(rs, 1)]));
Chris@82 160 T1f = BYTW(&(W[TWVL * 52]), T1e);
Chris@82 161 T1g = LD(&(x[WS(rs, 11)]), ms, &(x[WS(rs, 1)]));
Chris@82 162 T1h = BYTW(&(W[TWVL * 20]), T1g);
Chris@82 163 T1i = VSUB(T1f, T1h);
Chris@82 164 T19 = LD(&(x[WS(rs, 3)]), ms, &(x[WS(rs, 1)]));
Chris@82 165 T1a = BYTW(&(W[TWVL * 4]), T19);
Chris@82 166 T1b = LD(&(x[WS(rs, 19)]), ms, &(x[WS(rs, 1)]));
Chris@82 167 T1c = BYTW(&(W[TWVL * 36]), T1b);
Chris@82 168 T1d = VSUB(T1a, T1c);
Chris@82 169 }
Chris@82 170 }
Chris@82 171 {
Chris@82 172 V T18, T1j, T2D, T2E;
Chris@82 173 T18 = VSUB(T15, T17);
Chris@82 174 T1j = VADD(T1d, T1i);
Chris@82 175 T1k = VFMA(LDK(KP707106781), T1j, T18);
Chris@82 176 T20 = VFNMS(LDK(KP707106781), T1j, T18);
Chris@82 177 T2D = VADD(T1a, T1c);
Chris@82 178 T2E = VADD(T1f, T1h);
Chris@82 179 T2F = VADD(T2D, T2E);
Chris@82 180 T3b = VSUB(T2E, T2D);
Chris@82 181 }
Chris@82 182 {
Chris@82 183 V T1p, T1q, T2A, T2B;
Chris@82 184 T1p = VSUB(T1m, T1o);
Chris@82 185 T1q = VSUB(T1i, T1d);
Chris@82 186 T1r = VFMA(LDK(KP707106781), T1q, T1p);
Chris@82 187 T21 = VFNMS(LDK(KP707106781), T1q, T1p);
Chris@82 188 T2A = VADD(T15, T17);
Chris@82 189 T2B = VADD(T1o, T1m);
Chris@82 190 T2C = VADD(T2A, T2B);
Chris@82 191 T3a = VSUB(T2A, T2B);
Chris@82 192 }
Chris@82 193 }
/* Odd inputs 1, 5, 9, 13, 17, 21, 25, 29. */
Chris@82 194 {
Chris@82 195 V TG, TI, TZ, TX, TQ, TS, TT, TL, TN, TO;
Chris@82 196 {
Chris@82 197 V TF, TH, TY, TW;
Chris@82 198 TF = LD(&(x[WS(rs, 1)]), ms, &(x[WS(rs, 1)]));
Chris@82 199 TG = BYTW(&(W[0]), TF);
Chris@82 200 TH = LD(&(x[WS(rs, 17)]), ms, &(x[WS(rs, 1)]));
Chris@82 201 TI = BYTW(&(W[TWVL * 32]), TH);
Chris@82 202 TY = LD(&(x[WS(rs, 25)]), ms, &(x[WS(rs, 1)]));
Chris@82 203 TZ = BYTW(&(W[TWVL * 48]), TY);
Chris@82 204 TW = LD(&(x[WS(rs, 9)]), ms, &(x[WS(rs, 1)]));
Chris@82 205 TX = BYTW(&(W[TWVL * 16]), TW);
Chris@82 206 {
Chris@82 207 V TP, TR, TK, TM;
Chris@82 208 TP = LD(&(x[WS(rs, 29)]), ms, &(x[WS(rs, 1)]));
Chris@82 209 TQ = BYTW(&(W[TWVL * 56]), TP);
Chris@82 210 TR = LD(&(x[WS(rs, 13)]), ms, &(x[WS(rs, 1)]));
Chris@82 211 TS = BYTW(&(W[TWVL * 24]), TR);
Chris@82 212 TT = VSUB(TQ, TS);
Chris@82 213 TK = LD(&(x[WS(rs, 5)]), ms, &(x[WS(rs, 1)]));
Chris@82 214 TL = BYTW(&(W[TWVL * 8]), TK);
Chris@82 215 TM = LD(&(x[WS(rs, 21)]), ms, &(x[WS(rs, 1)]));
Chris@82 216 TN = BYTW(&(W[TWVL * 40]), TM);
Chris@82 217 TO = VSUB(TL, TN);
Chris@82 218 }
Chris@82 219 }
Chris@82 220 {
Chris@82 221 V TJ, TU, T2w, T2x;
Chris@82 222 TJ = VSUB(TG, TI);
Chris@82 223 TU = VADD(TO, TT);
Chris@82 224 TV = VFMA(LDK(KP707106781), TU, TJ);
Chris@82 225 T1X = VFNMS(LDK(KP707106781), TU, TJ);
Chris@82 226 T2w = VADD(TL, TN);
Chris@82 227 T2x = VADD(TQ, TS);
Chris@82 228 T2y = VADD(T2w, T2x);
Chris@82 229 T38 = VSUB(T2w, T2x);
Chris@82 230 }
Chris@82 231 {
Chris@82 232 V T10, T11, T2t, T2u;
Chris@82 233 T10 = VSUB(TX, TZ);
Chris@82 234 T11 = VSUB(TO, TT);
Chris@82 235 T12 = VFMA(LDK(KP707106781), T11, T10);
Chris@82 236 T1Y = VFNMS(LDK(KP707106781), T11, T10);
Chris@82 237 T2t = VADD(TG, TI);
Chris@82 238 T2u = VADD(TX, TZ);
Chris@82 239 T2v = VADD(T2t, T2u);
Chris@82 240 T37 = VSUB(T2t, T2u);
Chris@82 241 }
Chris@82 242 }
/* Outputs 0, 8, 16, 24. */
Chris@82 243 {
Chris@82 244 V T2W, T30, T2Z, T31;
Chris@82 245 {
Chris@82 246 V T2U, T2V, T2X, T2Y;
Chris@82 247 T2U = VADD(T2o, T2r);
Chris@82 248 T2V = VADD(T2L, T2O);
Chris@82 249 T2W = VSUB(T2U, T2V);
Chris@82 250 T30 = VADD(T2U, T2V);
Chris@82 251 T2X = VADD(T2v, T2y);
Chris@82 252 T2Y = VADD(T2C, T2F);
Chris@82 253 T2Z = VSUB(T2X, T2Y);
Chris@82 254 T31 = VADD(T2X, T2Y);
Chris@82 255 }
Chris@82 256 ST(&(x[WS(rs, 24)]), VFNMSI(T2Z, T2W), ms, &(x[0]));
Chris@82 257 ST(&(x[0]), VADD(T30, T31), ms, &(x[0]));
Chris@82 258 ST(&(x[WS(rs, 8)]), VFMAI(T2Z, T2W), ms, &(x[0]));
Chris@82 259 ST(&(x[WS(rs, 16)]), VSUB(T30, T31), ms, &(x[0]));
Chris@82 260 }
/* Outputs 4, 12, 20, 28. */
Chris@82 261 {
Chris@82 262 V T2s, T2P, T2H, T2Q, T2z, T2G;
Chris@82 263 T2s = VSUB(T2o, T2r);
Chris@82 264 T2P = VSUB(T2L, T2O);
Chris@82 265 T2z = VSUB(T2v, T2y);
Chris@82 266 T2G = VSUB(T2C, T2F);
Chris@82 267 T2H = VADD(T2z, T2G);
Chris@82 268 T2Q = VSUB(T2z, T2G);
Chris@82 269 {
Chris@82 270 V T2I, T2R, T2S, T2T;
Chris@82 271 T2I = VFNMS(LDK(KP707106781), T2H, T2s);
Chris@82 272 T2R = VFNMS(LDK(KP707106781), T2Q, T2P);
Chris@82 273 ST(&(x[WS(rs, 12)]), VFNMSI(T2R, T2I), ms, &(x[0]));
Chris@82 274 ST(&(x[WS(rs, 20)]), VFMAI(T2R, T2I), ms, &(x[0]));
Chris@82 275 T2S = VFMA(LDK(KP707106781), T2H, T2s);
Chris@82 276 T2T = VFMA(LDK(KP707106781), T2Q, T2P);
Chris@82 277 ST(&(x[WS(rs, 4)]), VFMAI(T2T, T2S), ms, &(x[0]));
Chris@82 278 ST(&(x[WS(rs, 28)]), VFNMSI(T2T, T2S), ms, &(x[0]));
Chris@82 279 }
Chris@82 280 }
/* Outputs 2, 6, 10, 14, 18, 22, 26, 30. */
Chris@82 281 {
Chris@82 282 V T36, T3o, T3h, T3r, T3d, T3s, T3k, T3p, T35, T3g;
Chris@82 283 T35 = VADD(T33, T34);
Chris@82 284 T36 = VFMA(LDK(KP707106781), T35, T32);
Chris@82 285 T3o = VFNMS(LDK(KP707106781), T35, T32);
Chris@82 286 T3g = VSUB(T33, T34);
Chris@82 287 T3h = VFMA(LDK(KP707106781), T3g, T3f);
Chris@82 288 T3r = VFNMS(LDK(KP707106781), T3g, T3f);
Chris@82 289 {
Chris@82 290 V T39, T3c, T3i, T3j;
Chris@82 291 T39 = VFNMS(LDK(KP414213562), T38, T37);
Chris@82 292 T3c = VFNMS(LDK(KP414213562), T3b, T3a);
Chris@82 293 T3d = VADD(T39, T3c);
Chris@82 294 T3s = VSUB(T39, T3c);
Chris@82 295 T3i = VFMA(LDK(KP414213562), T37, T38);
Chris@82 296 T3j = VFMA(LDK(KP414213562), T3a, T3b);
Chris@82 297 T3k = VSUB(T3i, T3j);
Chris@82 298 T3p = VADD(T3i, T3j);
Chris@82 299 }
Chris@82 300 {
Chris@82 301 V T3e, T3l, T3u, T3v;
Chris@82 302 T3e = VFNMS(LDK(KP923879532), T3d, T36);
Chris@82 303 T3l = VFNMS(LDK(KP923879532), T3k, T3h);
Chris@82 304 ST(&(x[WS(rs, 14)]), VFNMSI(T3l, T3e), ms, &(x[0]));
Chris@82 305 ST(&(x[WS(rs, 18)]), VFMAI(T3l, T3e), ms, &(x[0]));
Chris@82 306 T3u = VFMA(LDK(KP923879532), T3p, T3o);
Chris@82 307 T3v = VFNMS(LDK(KP923879532), T3s, T3r);
Chris@82 308 ST(&(x[WS(rs, 6)]), VFNMSI(T3v, T3u), ms, &(x[0]));
Chris@82 309 ST(&(x[WS(rs, 26)]), VFMAI(T3v, T3u), ms, &(x[0]));
Chris@82 310 }
Chris@82 311 {
Chris@82 312 V T3m, T3n, T3q, T3t;
Chris@82 313 T3m = VFMA(LDK(KP923879532), T3d, T36);
Chris@82 314 T3n = VFMA(LDK(KP923879532), T3k, T3h);
Chris@82 315 ST(&(x[WS(rs, 30)]), VFNMSI(T3n, T3m), ms, &(x[0]));
Chris@82 316 ST(&(x[WS(rs, 2)]), VFMAI(T3n, T3m), ms, &(x[0]));
Chris@82 317 T3q = VFNMS(LDK(KP923879532), T3p, T3o);
Chris@82 318 T3t = VFMA(LDK(KP923879532), T3s, T3r);
Chris@82 319 ST(&(x[WS(rs, 10)]), VFMAI(T3t, T3q), ms, &(x[0]));
Chris@82 320 ST(&(x[WS(rs, 22)]), VFNMSI(T3t, T3q), ms, &(x[0]));
Chris@82 321 }
Chris@82 322 }
/* Outputs 1, 7, 9, 15, 17, 23, 25, 31. */
Chris@82 323 {
Chris@82 324 V TE, T1M, T1I, T1N, T1t, T1Q, T1F, T1P;
Chris@82 325 {
Chris@82 326 V Tg, TD, T1G, T1H;
Chris@82 327 Tg = VFMA(LDK(KP707106781), Tf, T4);
Chris@82 328 TD = VADD(Tr, TC);
Chris@82 329 TE = VFMA(LDK(KP923879532), TD, Tg);
Chris@82 330 T1M = VFNMS(LDK(KP923879532), TD, Tg);
Chris@82 331 T1G = VFMA(LDK(KP198912367), TV, T12);
Chris@82 332 T1H = VFMA(LDK(KP198912367), T1k, T1r);
Chris@82 333 T1I = VSUB(T1G, T1H);
Chris@82 334 T1N = VADD(T1G, T1H);
Chris@82 335 }
Chris@82 336 {
Chris@82 337 V T13, T1s, T1B, T1E;
Chris@82 338 T13 = VFNMS(LDK(KP198912367), T12, TV);
Chris@82 339 T1s = VFNMS(LDK(KP198912367), T1r, T1k);
Chris@82 340 T1t = VADD(T13, T1s);
Chris@82 341 T1Q = VSUB(T13, T1s);
Chris@82 342 T1B = VFMA(LDK(KP707106781), T1A, T1z);
Chris@82 343 T1E = VSUB(T1C, T1D);
Chris@82 344 T1F = VFMA(LDK(KP923879532), T1E, T1B);
Chris@82 345 T1P = VFNMS(LDK(KP923879532), T1E, T1B);
Chris@82 346 }
Chris@82 347 {
Chris@82 348 V T1u, T1J, T1S, T1T;
Chris@82 349 T1u = VFNMS(LDK(KP980785280), T1t, TE);
Chris@82 350 T1J = VFNMS(LDK(KP980785280), T1I, T1F);
Chris@82 351 ST(&(x[WS(rs, 15)]), VFNMSI(T1J, T1u), ms, &(x[WS(rs, 1)]));
Chris@82 352 ST(&(x[WS(rs, 17)]), VFMAI(T1J, T1u), ms, &(x[WS(rs, 1)]));
Chris@82 353 T1S = VFMA(LDK(KP980785280), T1N, T1M);
Chris@82 354 T1T = VFNMS(LDK(KP980785280), T1Q, T1P);
Chris@82 355 ST(&(x[WS(rs, 7)]), VFNMSI(T1T, T1S), ms, &(x[WS(rs, 1)]));
Chris@82 356 ST(&(x[WS(rs, 25)]), VFMAI(T1T, T1S), ms, &(x[WS(rs, 1)]));
Chris@82 357 }
Chris@82 358 {
Chris@82 359 V T1K, T1L, T1O, T1R;
Chris@82 360 T1K = VFMA(LDK(KP980785280), T1t, TE);
Chris@82 361 T1L = VFMA(LDK(KP980785280), T1I, T1F);
Chris@82 362 ST(&(x[WS(rs, 31)]), VFNMSI(T1L, T1K), ms, &(x[WS(rs, 1)]));
Chris@82 363 ST(&(x[WS(rs, 1)]), VFMAI(T1L, T1K), ms, &(x[WS(rs, 1)]));
Chris@82 364 T1O = VFNMS(LDK(KP980785280), T1N, T1M);
Chris@82 365 T1R = VFMA(LDK(KP980785280), T1Q, T1P);
Chris@82 366 ST(&(x[WS(rs, 9)]), VFMAI(T1R, T1O), ms, &(x[WS(rs, 1)]));
Chris@82 367 ST(&(x[WS(rs, 23)]), VFNMSI(T1R, T1O), ms, &(x[WS(rs, 1)]));
Chris@82 368 }
Chris@82 369 }
/* Outputs 3, 5, 11, 13, 19, 21, 27, 29. */
Chris@82 370 {
Chris@82 371 V T1W, T2e, T2a, T2f, T23, T2i, T27, T2h;
Chris@82 372 {
Chris@82 373 V T1U, T1V, T28, T29;
Chris@82 374 T1U = VFNMS(LDK(KP707106781), Tf, T4);
Chris@82 375 T1V = VADD(T1C, T1D);
Chris@82 376 T1W = VFMA(LDK(KP923879532), T1V, T1U);
Chris@82 377 T2e = VFNMS(LDK(KP923879532), T1V, T1U);
Chris@82 378 T28 = VFNMS(LDK(KP668178637), T1X, T1Y);
Chris@82 379 T29 = VFNMS(LDK(KP668178637), T20, T21);
Chris@82 380 T2a = VSUB(T28, T29);
Chris@82 381 T2f = VADD(T28, T29);
Chris@82 382 }
Chris@82 383 {
Chris@82 384 V T1Z, T22, T25, T26;
Chris@82 385 T1Z = VFMA(LDK(KP668178637), T1Y, T1X);
Chris@82 386 T22 = VFMA(LDK(KP668178637), T21, T20);
Chris@82 387 T23 = VADD(T1Z, T22);
Chris@82 388 T2i = VSUB(T1Z, T22);
Chris@82 389 T25 = VFNMS(LDK(KP707106781), T1A, T1z);
Chris@82 390 T26 = VSUB(Tr, TC);
Chris@82 391 T27 = VFNMS(LDK(KP923879532), T26, T25);
Chris@82 392 T2h = VFMA(LDK(KP923879532), T26, T25);
Chris@82 393 }
Chris@82 394 {
Chris@82 395 V T24, T2b, T2k, T2l;
Chris@82 396 T24 = VFNMS(LDK(KP831469612), T23, T1W);
Chris@82 397 T2b = VFNMS(LDK(KP831469612), T2a, T27);
Chris@82 398 ST(&(x[WS(rs, 19)]), VFNMSI(T2b, T24), ms, &(x[WS(rs, 1)]));
Chris@82 399 ST(&(x[WS(rs, 13)]), VFMAI(T2b, T24), ms, &(x[WS(rs, 1)]));
Chris@82 400 T2k = VFNMS(LDK(KP831469612), T2f, T2e);
Chris@82 401 T2l = VFMA(LDK(KP831469612), T2i, T2h);
Chris@82 402 ST(&(x[WS(rs, 5)]), VFMAI(T2l, T2k), ms, &(x[WS(rs, 1)]));
Chris@82 403 ST(&(x[WS(rs, 27)]), VFNMSI(T2l, T2k), ms, &(x[WS(rs, 1)]));
Chris@82 404 }
Chris@82 405 {
Chris@82 406 V T2c, T2d, T2g, T2j;
Chris@82 407 T2c = VFMA(LDK(KP831469612), T23, T1W);
Chris@82 408 T2d = VFMA(LDK(KP831469612), T2a, T27);
Chris@82 409 ST(&(x[WS(rs, 3)]), VFNMSI(T2d, T2c), ms, &(x[WS(rs, 1)]));
Chris@82 410 ST(&(x[WS(rs, 29)]), VFMAI(T2d, T2c), ms, &(x[WS(rs, 1)]));
Chris@82 411 T2g = VFMA(LDK(KP831469612), T2f, T2e);
Chris@82 412 T2j = VFNMS(LDK(KP831469612), T2i, T2h);
Chris@82 413 ST(&(x[WS(rs, 11)]), VFNMSI(T2j, T2g), ms, &(x[WS(rs, 1)]));
Chris@82 414 ST(&(x[WS(rs, 21)]), VFMAI(T2j, T2g), ms, &(x[WS(rs, 1)]));
Chris@82 415 }
Chris@82 416 }
Chris@82 417 }
Chris@82 418 }
Chris@82 419 VLEAVE();
Chris@82 420 }
Chris@82 421
/*
 * Twiddle instruction table referenced by the codelet descriptor:
 * one VTW(0, k) entry for each k = 1..31, closed by a {TW_NEXT, VL, 0} entry.
 */
Chris@82 422 static const tw_instr twinstr[] = {
Chris@82 423 VTW(0, 1),
Chris@82 424 VTW(0, 2),
Chris@82 425 VTW(0, 3),
Chris@82 426 VTW(0, 4),
Chris@82 427 VTW(0, 5),
Chris@82 428 VTW(0, 6),
Chris@82 429 VTW(0, 7),
Chris@82 430 VTW(0, 8),
Chris@82 431 VTW(0, 9),
Chris@82 432 VTW(0, 10),
Chris@82 433 VTW(0, 11),
Chris@82 434 VTW(0, 12),
Chris@82 435 VTW(0, 13),
Chris@82 436 VTW(0, 14),
Chris@82 437 VTW(0, 15),
Chris@82 438 VTW(0, 16),
Chris@82 439 VTW(0, 17),
Chris@82 440 VTW(0, 18),
Chris@82 441 VTW(0, 19),
Chris@82 442 VTW(0, 20),
Chris@82 443 VTW(0, 21),
Chris@82 444 VTW(0, 22),
Chris@82 445 VTW(0, 23),
Chris@82 446 VTW(0, 24),
Chris@82 447 VTW(0, 25),
Chris@82 448 VTW(0, 26),
Chris@82 449 VTW(0, 27),
Chris@82 450 VTW(0, 28),
Chris@82 451 VTW(0, 29),
Chris@82 452 VTW(0, 30),
Chris@82 453 VTW(0, 31),
Chris@82 454 {TW_NEXT, VL, 0}
Chris@82 455 };
Chris@82 456
/*
 * Codelet descriptor handed to the planner at registration: the size 32
 * (matching -n 32 on the generator command line), the codelet name string,
 * the twinstr table, &GENUS, and the counts {119, 62, 98, 0}, whose first
 * three entries match the additions / multiplications / fused multiply-add
 * figures quoted in this variant's header comment. The trailing three zeros
 * are not interpreted here.
 */
Chris@82 457 static const ct_desc desc = { 32, XSIMD_STRING("t2bv_32"), twinstr, &GENUS, {119, 62, 98, 0}, 0, 0, 0 };
Chris@82 458
/*
 * Registration entry point: hands t2bv_32 and its descriptor to the planner p
 * through X(kdft_dit_register).
 */
Chris@82 459 void XSIMD(codelet_t2bv_32) (planner *p) {
Chris@82 460 X(kdft_dit_register) (p, t2bv_32, &desc);
Chris@82 461 }
Chris@82 462 #else
Chris@82 463
Chris@82 464 /* Generated by: ../../../genfft/gen_twiddle_c.native -simd -compact -variables 4 -pipeline-latency 8 -n 32 -name t2bv_32 -include dft/simd/t2b.h -sign 1 */
Chris@82 465
Chris@82 466 /*
Chris@82 467 * This function contains 217 FP additions, 104 FP multiplications,
Chris@82 468 * (or, 201 additions, 88 multiplications, 16 fused multiply/add),
Chris@82 469 * 59 stack variables, 7 constants, and 64 memory accesses
Chris@82 470 */
Chris@82 471 #include "dft/simd/t2b.h"
Chris@82 472
Chris@82 473 static void t2bv_32(R *ri, R *ii, const R *W, stride rs, INT mb, INT me, INT ms)
Chris@82 474 {
Chris@82 475 DVK(KP195090322, +0.195090322016128267848284868477022240927691618);
Chris@82 476 DVK(KP980785280, +0.980785280403230449126182236134239036973933731);
Chris@82 477 DVK(KP555570233, +0.555570233019602224742830813948532874374937191);
Chris@82 478 DVK(KP831469612, +0.831469612302545237078788377617905756738560812);
Chris@82 479 DVK(KP382683432, +0.382683432365089771728459984030398866761344562);
Chris@82 480 DVK(KP923879532, +0.923879532511286756128183189396788286822416626);
Chris@82 481 DVK(KP707106781, +0.707106781186547524400844362104849039284835938);
Chris@82 482 {
Chris@82 483 INT m;
Chris@82 484 R *x;
Chris@82 485 x = ii;
Chris@82 486 for (m = mb, W = W + (mb * ((TWVL / VL) * 62)); m < me; m = m + VL, x = x + (VL * ms), W = W + (TWVL * 62), MAKE_VOLATILE_STRIDE(32, rs)) {
Chris@82 487 V T4, T1D, T2P, T3h, Tf, T1y, T2K, T3i, TC, T1w, T2G, T3e, Tr, T1v, T2D;
Chris@82 488 V T3d, T1k, T20, T2y, T3a, T1r, T21, T2v, T39, TV, T1X, T2r, T37, T12, T1Y;
Chris@82 489 V T2o, T36;
Chris@82 490 {
Chris@82 491 V T1, T1C, T3, T1A, T1B, T2, T1z, T2N, T2O;
Chris@82 492 T1 = LD(&(x[0]), ms, &(x[0]));
Chris@82 493 T1B = LD(&(x[WS(rs, 24)]), ms, &(x[0]));
Chris@82 494 T1C = BYTW(&(W[TWVL * 46]), T1B);
Chris@82 495 T2 = LD(&(x[WS(rs, 16)]), ms, &(x[0]));
Chris@82 496 T3 = BYTW(&(W[TWVL * 30]), T2);
Chris@82 497 T1z = LD(&(x[WS(rs, 8)]), ms, &(x[0]));
Chris@82 498 T1A = BYTW(&(W[TWVL * 14]), T1z);
Chris@82 499 T4 = VSUB(T1, T3);
Chris@82 500 T1D = VSUB(T1A, T1C);
Chris@82 501 T2N = VADD(T1, T3);
Chris@82 502 T2O = VADD(T1A, T1C);
Chris@82 503 T2P = VSUB(T2N, T2O);
Chris@82 504 T3h = VADD(T2N, T2O);
Chris@82 505 }
Chris@82 506 {
Chris@82 507 V T6, Td, T8, Tb;
Chris@82 508 {
Chris@82 509 V T5, Tc, T7, Ta;
Chris@82 510 T5 = LD(&(x[WS(rs, 4)]), ms, &(x[0]));
Chris@82 511 T6 = BYTW(&(W[TWVL * 6]), T5);
Chris@82 512 Tc = LD(&(x[WS(rs, 12)]), ms, &(x[0]));
Chris@82 513 Td = BYTW(&(W[TWVL * 22]), Tc);
Chris@82 514 T7 = LD(&(x[WS(rs, 20)]), ms, &(x[0]));
Chris@82 515 T8 = BYTW(&(W[TWVL * 38]), T7);
Chris@82 516 Ta = LD(&(x[WS(rs, 28)]), ms, &(x[0]));
Chris@82 517 Tb = BYTW(&(W[TWVL * 54]), Ta);
Chris@82 518 }
Chris@82 519 {
Chris@82 520 V T9, Te, T2I, T2J;
Chris@82 521 T9 = VSUB(T6, T8);
Chris@82 522 Te = VSUB(Tb, Td);
Chris@82 523 Tf = VMUL(LDK(KP707106781), VADD(T9, Te));
Chris@82 524 T1y = VMUL(LDK(KP707106781), VSUB(T9, Te));
Chris@82 525 T2I = VADD(T6, T8);
Chris@82 526 T2J = VADD(Tb, Td);
Chris@82 527 T2K = VSUB(T2I, T2J);
Chris@82 528 T3i = VADD(T2I, T2J);
Chris@82 529 }
Chris@82 530 }
Chris@82 531 {
Chris@82 532 V Tt, TA, Tv, Ty;
Chris@82 533 {
Chris@82 534 V Ts, Tz, Tu, Tx;
Chris@82 535 Ts = LD(&(x[WS(rs, 6)]), ms, &(x[0]));
Chris@82 536 Tt = BYTW(&(W[TWVL * 10]), Ts);
Chris@82 537 Tz = LD(&(x[WS(rs, 14)]), ms, &(x[0]));
Chris@82 538 TA = BYTW(&(W[TWVL * 26]), Tz);
Chris@82 539 Tu = LD(&(x[WS(rs, 22)]), ms, &(x[0]));
Chris@82 540 Tv = BYTW(&(W[TWVL * 42]), Tu);
Chris@82 541 Tx = LD(&(x[WS(rs, 30)]), ms, &(x[0]));
Chris@82 542 Ty = BYTW(&(W[TWVL * 58]), Tx);
Chris@82 543 }
Chris@82 544 {
Chris@82 545 V Tw, TB, T2E, T2F;
Chris@82 546 Tw = VSUB(Tt, Tv);
Chris@82 547 TB = VSUB(Ty, TA);
Chris@82 548 TC = VFNMS(LDK(KP382683432), TB, VMUL(LDK(KP923879532), Tw));
Chris@82 549 T1w = VFMA(LDK(KP923879532), TB, VMUL(LDK(KP382683432), Tw));
Chris@82 550 T2E = VADD(Ty, TA);
Chris@82 551 T2F = VADD(Tt, Tv);
Chris@82 552 T2G = VSUB(T2E, T2F);
Chris@82 553 T3e = VADD(T2E, T2F);
Chris@82 554 }
Chris@82 555 }
Chris@82 556 {
Chris@82 557 V Ti, Tp, Tk, Tn;
Chris@82 558 {
Chris@82 559 V Th, To, Tj, Tm;
Chris@82 560 Th = LD(&(x[WS(rs, 2)]), ms, &(x[0]));
Chris@82 561 Ti = BYTW(&(W[TWVL * 2]), Th);
Chris@82 562 To = LD(&(x[WS(rs, 26)]), ms, &(x[0]));
Chris@82 563 Tp = BYTW(&(W[TWVL * 50]), To);
Chris@82 564 Tj = LD(&(x[WS(rs, 18)]), ms, &(x[0]));
Chris@82 565 Tk = BYTW(&(W[TWVL * 34]), Tj);
Chris@82 566 Tm = LD(&(x[WS(rs, 10)]), ms, &(x[0]));
Chris@82 567 Tn = BYTW(&(W[TWVL * 18]), Tm);
Chris@82 568 }
Chris@82 569 {
Chris@82 570 V Tl, Tq, T2B, T2C;
Chris@82 571 Tl = VSUB(Ti, Tk);
Chris@82 572 Tq = VSUB(Tn, Tp);
Chris@82 573 Tr = VFMA(LDK(KP382683432), Tl, VMUL(LDK(KP923879532), Tq));
Chris@82 574 T1v = VFNMS(LDK(KP382683432), Tq, VMUL(LDK(KP923879532), Tl));
Chris@82 575 T2B = VADD(Ti, Tk);
Chris@82 576 T2C = VADD(Tn, Tp);
Chris@82 577 T2D = VSUB(T2B, T2C);
Chris@82 578 T3d = VADD(T2B, T2C);
Chris@82 579 }
Chris@82 580 }
Chris@82 581 {
Chris@82 582 V T1g, T1i, T1o, T1m, T1a, T1c, T1d, T15, T17, T18;
Chris@82 583 {
Chris@82 584 V T1f, T1h, T1n, T1l;
Chris@82 585 T1f = LD(&(x[WS(rs, 7)]), ms, &(x[WS(rs, 1)]));
Chris@82 586 T1g = BYTW(&(W[TWVL * 12]), T1f);
Chris@82 587 T1h = LD(&(x[WS(rs, 23)]), ms, &(x[WS(rs, 1)]));
Chris@82 588 T1i = BYTW(&(W[TWVL * 44]), T1h);
Chris@82 589 T1n = LD(&(x[WS(rs, 15)]), ms, &(x[WS(rs, 1)]));
Chris@82 590 T1o = BYTW(&(W[TWVL * 28]), T1n);
Chris@82 591 T1l = LD(&(x[WS(rs, 31)]), ms, &(x[WS(rs, 1)]));
Chris@82 592 T1m = BYTW(&(W[TWVL * 60]), T1l);
Chris@82 593 {
Chris@82 594 V T19, T1b, T14, T16;
Chris@82 595 T19 = LD(&(x[WS(rs, 27)]), ms, &(x[WS(rs, 1)]));
Chris@82 596 T1a = BYTW(&(W[TWVL * 52]), T19);
Chris@82 597 T1b = LD(&(x[WS(rs, 11)]), ms, &(x[WS(rs, 1)]));
Chris@82 598 T1c = BYTW(&(W[TWVL * 20]), T1b);
Chris@82 599 T1d = VSUB(T1a, T1c);
Chris@82 600 T14 = LD(&(x[WS(rs, 3)]), ms, &(x[WS(rs, 1)]));
Chris@82 601 T15 = BYTW(&(W[TWVL * 4]), T14);
Chris@82 602 T16 = LD(&(x[WS(rs, 19)]), ms, &(x[WS(rs, 1)]));
Chris@82 603 T17 = BYTW(&(W[TWVL * 36]), T16);
Chris@82 604 T18 = VSUB(T15, T17);
Chris@82 605 }
Chris@82 606 }
Chris@82 607 {
Chris@82 608 V T1e, T1j, T2w, T2x;
Chris@82 609 T1e = VMUL(LDK(KP707106781), VSUB(T18, T1d));
Chris@82 610 T1j = VSUB(T1g, T1i);
Chris@82 611 T1k = VSUB(T1e, T1j);
Chris@82 612 T20 = VADD(T1j, T1e);
Chris@82 613 T2w = VADD(T15, T17);
Chris@82 614 T2x = VADD(T1a, T1c);
Chris@82 615 T2y = VSUB(T2w, T2x);
Chris@82 616 T3a = VADD(T2w, T2x);
Chris@82 617 }
Chris@82 618 {
Chris@82 619 V T1p, T1q, T2t, T2u;
Chris@82 620 T1p = VSUB(T1m, T1o);
Chris@82 621 T1q = VMUL(LDK(KP707106781), VADD(T18, T1d));
Chris@82 622 T1r = VSUB(T1p, T1q);
Chris@82 623 T21 = VADD(T1p, T1q);
Chris@82 624 T2t = VADD(T1m, T1o);
Chris@82 625 T2u = VADD(T1g, T1i);
Chris@82 626 T2v = VSUB(T2t, T2u);
Chris@82 627 T39 = VADD(T2t, T2u);
Chris@82 628 }
Chris@82 629 }
Chris@82 630 {
Chris@82 631 V TR, TT, TZ, TX, TL, TN, TO, TG, TI, TJ;
Chris@82 632 {
Chris@82 633 V TQ, TS, TY, TW;
Chris@82 634 TQ = LD(&(x[WS(rs, 9)]), ms, &(x[WS(rs, 1)]));
Chris@82 635 TR = BYTW(&(W[TWVL * 16]), TQ);
Chris@82 636 TS = LD(&(x[WS(rs, 25)]), ms, &(x[WS(rs, 1)]));
Chris@82 637 TT = BYTW(&(W[TWVL * 48]), TS);
Chris@82 638 TY = LD(&(x[WS(rs, 17)]), ms, &(x[WS(rs, 1)]));
Chris@82 639 TZ = BYTW(&(W[TWVL * 32]), TY);
Chris@82 640 TW = LD(&(x[WS(rs, 1)]), ms, &(x[WS(rs, 1)]));
Chris@82 641 TX = BYTW(&(W[0]), TW);
Chris@82 642 {
Chris@82 643 V TK, TM, TF, TH;
Chris@82 644 TK = LD(&(x[WS(rs, 29)]), ms, &(x[WS(rs, 1)]));
Chris@82 645 TL = BYTW(&(W[TWVL * 56]), TK);
Chris@82 646 TM = LD(&(x[WS(rs, 13)]), ms, &(x[WS(rs, 1)]));
Chris@82 647 TN = BYTW(&(W[TWVL * 24]), TM);
Chris@82 648 TO = VSUB(TL, TN);
Chris@82 649 TF = LD(&(x[WS(rs, 5)]), ms, &(x[WS(rs, 1)]));
Chris@82 650 TG = BYTW(&(W[TWVL * 8]), TF);
Chris@82 651 TH = LD(&(x[WS(rs, 21)]), ms, &(x[WS(rs, 1)]));
Chris@82 652 TI = BYTW(&(W[TWVL * 40]), TH);
Chris@82 653 TJ = VSUB(TG, TI);
Chris@82 654 }
Chris@82 655 }
Chris@82 656 {
Chris@82 657 V TP, TU, T2p, T2q;
Chris@82 658 TP = VMUL(LDK(KP707106781), VSUB(TJ, TO));
Chris@82 659 TU = VSUB(TR, TT);
Chris@82 660 TV = VSUB(TP, TU);
Chris@82 661 T1X = VADD(TU, TP);
Chris@82 662 T2p = VADD(TG, TI);
Chris@82 663 T2q = VADD(TL, TN);
Chris@82 664 T2r = VSUB(T2p, T2q);
Chris@82 665 T37 = VADD(T2p, T2q);
Chris@82 666 }
Chris@82 667 {
Chris@82 668 V T10, T11, T2m, T2n;
Chris@82 669 T10 = VSUB(TX, TZ);
Chris@82 670 T11 = VMUL(LDK(KP707106781), VADD(TJ, TO));
Chris@82 671 T12 = VSUB(T10, T11);
Chris@82 672 T1Y = VADD(T10, T11);
Chris@82 673 T2m = VADD(TX, TZ);
Chris@82 674 T2n = VADD(TR, TT);
Chris@82 675 T2o = VSUB(T2m, T2n);
Chris@82 676 T36 = VADD(T2m, T2n);
Chris@82 677 }
Chris@82 678 }
Chris@82 679 {
Chris@82 680 V T3q, T3u, T3t, T3v;
Chris@82 681 {
Chris@82 682 V T3o, T3p, T3r, T3s;
Chris@82 683 T3o = VADD(T3h, T3i);
Chris@82 684 T3p = VADD(T3d, T3e);
Chris@82 685 T3q = VSUB(T3o, T3p);
Chris@82 686 T3u = VADD(T3o, T3p);
Chris@82 687 T3r = VADD(T36, T37);
Chris@82 688 T3s = VADD(T39, T3a);
Chris@82 689 T3t = VBYI(VSUB(T3r, T3s));
Chris@82 690 T3v = VADD(T3r, T3s);
Chris@82 691 }
Chris@82 692 ST(&(x[WS(rs, 24)]), VSUB(T3q, T3t), ms, &(x[0]));
Chris@82 693 ST(&(x[0]), VADD(T3u, T3v), ms, &(x[0]));
Chris@82 694 ST(&(x[WS(rs, 8)]), VADD(T3q, T3t), ms, &(x[0]));
Chris@82 695 ST(&(x[WS(rs, 16)]), VSUB(T3u, T3v), ms, &(x[0]));
Chris@82 696 }
Chris@82 697 {
Chris@82 698 V T3f, T3j, T3c, T3k, T38, T3b;
Chris@82 699 T3f = VSUB(T3d, T3e);
Chris@82 700 T3j = VSUB(T3h, T3i);
Chris@82 701 T38 = VSUB(T36, T37);
Chris@82 702 T3b = VSUB(T39, T3a);
Chris@82 703 T3c = VMUL(LDK(KP707106781), VSUB(T38, T3b));
Chris@82 704 T3k = VMUL(LDK(KP707106781), VADD(T38, T3b));
Chris@82 705 {
Chris@82 706 V T3g, T3l, T3m, T3n;
Chris@82 707 T3g = VBYI(VSUB(T3c, T3f));
Chris@82 708 T3l = VSUB(T3j, T3k);
Chris@82 709 ST(&(x[WS(rs, 12)]), VADD(T3g, T3l), ms, &(x[0]));
Chris@82 710 ST(&(x[WS(rs, 20)]), VSUB(T3l, T3g), ms, &(x[0]));
Chris@82 711 T3m = VBYI(VADD(T3f, T3c));
Chris@82 712 T3n = VADD(T3j, T3k);
Chris@82 713 ST(&(x[WS(rs, 4)]), VADD(T3m, T3n), ms, &(x[0]));
Chris@82 714 ST(&(x[WS(rs, 28)]), VSUB(T3n, T3m), ms, &(x[0]));
Chris@82 715 }
Chris@82 716 }
Chris@82 717 {
Chris@82 718 V T2L, T31, T2R, T2Y, T2A, T2Z, T2U, T32, T2H, T2Q;
Chris@82 719 T2H = VMUL(LDK(KP707106781), VSUB(T2D, T2G));
Chris@82 720 T2L = VSUB(T2H, T2K);
Chris@82 721 T31 = VADD(T2K, T2H);
Chris@82 722 T2Q = VMUL(LDK(KP707106781), VADD(T2D, T2G));
Chris@82 723 T2R = VSUB(T2P, T2Q);
Chris@82 724 T2Y = VADD(T2P, T2Q);
Chris@82 725 {
Chris@82 726 V T2s, T2z, T2S, T2T;
Chris@82 727 T2s = VFNMS(LDK(KP382683432), T2r, VMUL(LDK(KP923879532), T2o));
Chris@82 728 T2z = VFMA(LDK(KP923879532), T2v, VMUL(LDK(KP382683432), T2y));
Chris@82 729 T2A = VSUB(T2s, T2z);
Chris@82 730 T2Z = VADD(T2s, T2z);
Chris@82 731 T2S = VFMA(LDK(KP382683432), T2o, VMUL(LDK(KP923879532), T2r));
Chris@82 732 T2T = VFNMS(LDK(KP382683432), T2v, VMUL(LDK(KP923879532), T2y));
Chris@82 733 T2U = VSUB(T2S, T2T);
Chris@82 734 T32 = VADD(T2S, T2T);
Chris@82 735 }
Chris@82 736 {
Chris@82 737 V T2M, T2V, T34, T35;
Chris@82 738 T2M = VBYI(VSUB(T2A, T2L));
Chris@82 739 T2V = VSUB(T2R, T2U);
Chris@82 740 ST(&(x[WS(rs, 10)]), VADD(T2M, T2V), ms, &(x[0]));
Chris@82 741 ST(&(x[WS(rs, 22)]), VSUB(T2V, T2M), ms, &(x[0]));
Chris@82 742 T34 = VSUB(T2Y, T2Z);
Chris@82 743 T35 = VBYI(VSUB(T32, T31));
Chris@82 744 ST(&(x[WS(rs, 18)]), VSUB(T34, T35), ms, &(x[0]));
Chris@82 745 ST(&(x[WS(rs, 14)]), VADD(T34, T35), ms, &(x[0]));
Chris@82 746 }
Chris@82 747 {
Chris@82 748 V T2W, T2X, T30, T33;
Chris@82 749 T2W = VBYI(VADD(T2L, T2A));
Chris@82 750 T2X = VADD(T2R, T2U);
Chris@82 751 ST(&(x[WS(rs, 6)]), VADD(T2W, T2X), ms, &(x[0]));
Chris@82 752 ST(&(x[WS(rs, 26)]), VSUB(T2X, T2W), ms, &(x[0]));
Chris@82 753 T30 = VADD(T2Y, T2Z);
Chris@82 754 T33 = VBYI(VADD(T31, T32));
Chris@82 755 ST(&(x[WS(rs, 30)]), VSUB(T30, T33), ms, &(x[0]));
Chris@82 756 ST(&(x[WS(rs, 2)]), VADD(T30, T33), ms, &(x[0]));
Chris@82 757 }
Chris@82 758 }
Chris@82 759 {
Chris@82 760 V TE, T1P, T1I, T1Q, T1t, T1M, T1F, T1N;
Chris@82 761 {
Chris@82 762 V Tg, TD, T1G, T1H;
Chris@82 763 Tg = VSUB(T4, Tf);
Chris@82 764 TD = VSUB(Tr, TC);
Chris@82 765 TE = VSUB(Tg, TD);
Chris@82 766 T1P = VADD(Tg, TD);
Chris@82 767 T1G = VFNMS(LDK(KP555570233), TV, VMUL(LDK(KP831469612), T12));
Chris@82 768 T1H = VFMA(LDK(KP555570233), T1k, VMUL(LDK(KP831469612), T1r));
Chris@82 769 T1I = VSUB(T1G, T1H);
Chris@82 770 T1Q = VADD(T1G, T1H);
Chris@82 771 }
Chris@82 772 {
Chris@82 773 V T13, T1s, T1x, T1E;
Chris@82 774 T13 = VFMA(LDK(KP831469612), TV, VMUL(LDK(KP555570233), T12));
Chris@82 775 T1s = VFNMS(LDK(KP555570233), T1r, VMUL(LDK(KP831469612), T1k));
Chris@82 776 T1t = VSUB(T13, T1s);
Chris@82 777 T1M = VADD(T13, T1s);
Chris@82 778 T1x = VSUB(T1v, T1w);
Chris@82 779 T1E = VSUB(T1y, T1D);
Chris@82 780 T1F = VSUB(T1x, T1E);
Chris@82 781 T1N = VADD(T1E, T1x);
Chris@82 782 }
Chris@82 783 {
Chris@82 784 V T1u, T1J, T1S, T1T;
Chris@82 785 T1u = VADD(TE, T1t);
Chris@82 786 T1J = VBYI(VADD(T1F, T1I));
Chris@82 787 ST(&(x[WS(rs, 27)]), VSUB(T1u, T1J), ms, &(x[WS(rs, 1)]));
Chris@82 788 ST(&(x[WS(rs, 5)]), VADD(T1u, T1J), ms, &(x[WS(rs, 1)]));
Chris@82 789 T1S = VBYI(VADD(T1N, T1M));
Chris@82 790 T1T = VADD(T1P, T1Q);
Chris@82 791 ST(&(x[WS(rs, 3)]), VADD(T1S, T1T), ms, &(x[WS(rs, 1)]));
Chris@82 792 ST(&(x[WS(rs, 29)]), VSUB(T1T, T1S), ms, &(x[WS(rs, 1)]));
Chris@82 793 }
Chris@82 794 {
Chris@82 795 V T1K, T1L, T1O, T1R;
Chris@82 796 T1K = VSUB(TE, T1t);
Chris@82 797 T1L = VBYI(VSUB(T1I, T1F));
Chris@82 798 ST(&(x[WS(rs, 21)]), VSUB(T1K, T1L), ms, &(x[WS(rs, 1)]));
Chris@82 799 ST(&(x[WS(rs, 11)]), VADD(T1K, T1L), ms, &(x[WS(rs, 1)]));
Chris@82 800 T1O = VBYI(VSUB(T1M, T1N));
Chris@82 801 T1R = VSUB(T1P, T1Q);
Chris@82 802 ST(&(x[WS(rs, 13)]), VADD(T1O, T1R), ms, &(x[WS(rs, 1)]));
Chris@82 803 ST(&(x[WS(rs, 19)]), VSUB(T1R, T1O), ms, &(x[WS(rs, 1)]));
Chris@82 804 }
Chris@82 805 }
Chris@82 806 {
Chris@82 807 V T1W, T2h, T2a, T2i, T23, T2e, T27, T2f;
Chris@82 808 {
Chris@82 809 V T1U, T1V, T28, T29;
Chris@82 810 T1U = VADD(T4, Tf);
Chris@82 811 T1V = VADD(T1v, T1w);
Chris@82 812 T1W = VSUB(T1U, T1V);
Chris@82 813 T2h = VADD(T1U, T1V);
Chris@82 814 T28 = VFNMS(LDK(KP195090322), T1X, VMUL(LDK(KP980785280), T1Y));
Chris@82 815 T29 = VFMA(LDK(KP195090322), T20, VMUL(LDK(KP980785280), T21));
Chris@82 816 T2a = VSUB(T28, T29);
Chris@82 817 T2i = VADD(T28, T29);
Chris@82 818 }
Chris@82 819 {
Chris@82 820 V T1Z, T22, T25, T26;
Chris@82 821 T1Z = VFMA(LDK(KP980785280), T1X, VMUL(LDK(KP195090322), T1Y));
Chris@82 822 T22 = VFNMS(LDK(KP195090322), T21, VMUL(LDK(KP980785280), T20));
Chris@82 823 T23 = VSUB(T1Z, T22);
Chris@82 824 T2e = VADD(T1Z, T22);
Chris@82 825 T25 = VADD(Tr, TC);
Chris@82 826 T26 = VADD(T1D, T1y);
Chris@82 827 T27 = VSUB(T25, T26);
Chris@82 828 T2f = VADD(T26, T25);
Chris@82 829 }
Chris@82 830 {
Chris@82 831 V T24, T2b, T2k, T2l;
Chris@82 832 T24 = VADD(T1W, T23);
Chris@82 833 T2b = VBYI(VADD(T27, T2a));
Chris@82 834 ST(&(x[WS(rs, 25)]), VSUB(T24, T2b), ms, &(x[WS(rs, 1)]));
Chris@82 835 ST(&(x[WS(rs, 7)]), VADD(T24, T2b), ms, &(x[WS(rs, 1)]));
Chris@82 836 T2k = VBYI(VADD(T2f, T2e));
Chris@82 837 T2l = VADD(T2h, T2i);
Chris@82 838 ST(&(x[WS(rs, 1)]), VADD(T2k, T2l), ms, &(x[WS(rs, 1)]));
Chris@82 839 ST(&(x[WS(rs, 31)]), VSUB(T2l, T2k), ms, &(x[WS(rs, 1)]));
Chris@82 840 }
Chris@82 841 {
Chris@82 842 V T2c, T2d, T2g, T2j;
Chris@82 843 T2c = VSUB(T1W, T23);
Chris@82 844 T2d = VBYI(VSUB(T2a, T27));
Chris@82 845 ST(&(x[WS(rs, 23)]), VSUB(T2c, T2d), ms, &(x[WS(rs, 1)]));
Chris@82 846 ST(&(x[WS(rs, 9)]), VADD(T2c, T2d), ms, &(x[WS(rs, 1)]));
Chris@82 847 T2g = VBYI(VSUB(T2e, T2f));
Chris@82 848 T2j = VSUB(T2h, T2i);
Chris@82 849 ST(&(x[WS(rs, 15)]), VADD(T2g, T2j), ms, &(x[WS(rs, 1)]));
Chris@82 850 ST(&(x[WS(rs, 17)]), VSUB(T2j, T2g), ms, &(x[WS(rs, 1)]));
Chris@82 851 }
Chris@82 852 }
Chris@82 853 }
Chris@82 854 }
Chris@82 855 VLEAVE();
Chris@82 856 }
Chris@82 857
Chris@82 858 static const tw_instr twinstr[] = { /* Twiddle-instruction table for this codelet; it is referenced by the desc initialiser further down. */
Chris@82 859 VTW(0, 1), /* The rows are VTW(0, k) for k = 1..31 in ascending order; VTW is a macro defined outside this file. */
Chris@82 860 VTW(0, 2),
Chris@82 861 VTW(0, 3),
Chris@82 862 VTW(0, 4),
Chris@82 863 VTW(0, 5),
Chris@82 864 VTW(0, 6),
Chris@82 865 VTW(0, 7),
Chris@82 866 VTW(0, 8),
Chris@82 867 VTW(0, 9),
Chris@82 868 VTW(0, 10),
Chris@82 869 VTW(0, 11),
Chris@82 870 VTW(0, 12),
Chris@82 871 VTW(0, 13),
Chris@82 872 VTW(0, 14),
Chris@82 873 VTW(0, 15),
Chris@82 874 VTW(0, 16),
Chris@82 875 VTW(0, 17),
Chris@82 876 VTW(0, 18),
Chris@82 877 VTW(0, 19),
Chris@82 878 VTW(0, 20),
Chris@82 879 VTW(0, 21),
Chris@82 880 VTW(0, 22),
Chris@82 881 VTW(0, 23),
Chris@82 882 VTW(0, 24),
Chris@82 883 VTW(0, 25),
Chris@82 884 VTW(0, 26),
Chris@82 885 VTW(0, 27),
Chris@82 886 VTW(0, 28),
Chris@82 887 VTW(0, 29),
Chris@82 888 VTW(0, 30),
Chris@82 889 VTW(0, 31),
Chris@82 890 {TW_NEXT, VL, 0} /* Last entry, written as a plain initialiser rather than via VTW. NOTE(review): presumably the end-of-list marker; confirm against the tw_instr definition. */
Chris@82 891 };
Chris@82 892
Chris@82 893 static const ct_desc desc = { 32, XSIMD_STRING("t2bv_32"), twinstr, &GENUS, {201, 88, 16, 0}, 0, 0, 0 }; /* Descriptor passed to the registration call: 32 matches the generator's -n 32 option, the name string is built by XSIMD_STRING, and twinstr is the table defined above. NOTE(review): {201, 88, 16, 0} looks like an operation-count record and the three trailing zeros are unnamed here; confirm field order against the ct_desc declaration. */
Chris@82 894
Chris@82 895 void XSIMD(codelet_t2bv_32) (planner *p) { /* Registration entry point: takes the planner p and hands this codelet to it. The exported symbol name is produced by the XSIMD macro. */
Chris@82 896 X(kdft_dit_register) (p, t2bv_32, &desc); /* Passes the planner, the t2bv_32 kernel defined earlier in this file, and its descriptor; nothing is returned to the caller. */
Chris@82 897 }
Chris@82 898 #endif