annotate src/fftw-3.3.5/dft/simd/common/t2bv_32.c @ 83:ae30d91d2ffe

Replace these with versions built using an older toolset (so as to avoid ABI incompatibilities when linking on Ubuntu 14.04 for packaging purposes)
author Chris Cannam
date Fri, 07 Feb 2020 11:51:13 +0000
parents 2cd0e3b3e1fd
children
rev   line source
Chris@42 1 /*
Chris@42 2 * Copyright (c) 2003, 2007-14 Matteo Frigo
Chris@42 3 * Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
Chris@42 4 *
Chris@42 5 * This program is free software; you can redistribute it and/or modify
Chris@42 6 * it under the terms of the GNU General Public License as published by
Chris@42 7 * the Free Software Foundation; either version 2 of the License, or
Chris@42 8 * (at your option) any later version.
Chris@42 9 *
Chris@42 10 * This program is distributed in the hope that it will be useful,
Chris@42 11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
Chris@42 12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
Chris@42 13 * GNU General Public License for more details.
Chris@42 14 *
Chris@42 15 * You should have received a copy of the GNU General Public License
Chris@42 16 * along with this program; if not, write to the Free Software
Chris@42 17 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
Chris@42 18 *
Chris@42 19 */
Chris@42 20
Chris@42 21 /* This file was automatically generated --- DO NOT EDIT */
Chris@42 22 /* Generated on Sat Jul 30 16:44:30 EDT 2016 */
Chris@42 23
Chris@42 24 #include "codelet-dft.h"
Chris@42 25
Chris@42 26 #ifdef HAVE_FMA
Chris@42 27
Chris@42 28 /* Generated by: ../../../genfft/gen_twiddle_c.native -fma -reorder-insns -schedule-for-pipeline -simd -compact -variables 4 -pipeline-latency 8 -n 32 -name t2bv_32 -include t2b.h -sign 1 */
Chris@42 29
Chris@42 30 /*
Chris@42 31 * This function contains 217 FP additions, 160 FP multiplications,
Chris@42 32 * (or, 119 additions, 62 multiplications, 98 fused multiply/add),
Chris@42 33 * 104 stack variables, 7 constants, and 64 memory accesses
Chris@42 34 */
Chris@42 35 #include "t2b.h"
Chris@42 36
Chris@42 37 static void t2bv_32(R *ri, R *ii, const R *W, stride rs, INT mb, INT me, INT ms)
Chris@42 38 {
Chris@42 39 DVK(KP831469612, +0.831469612302545237078788377617905756738560812);
Chris@42 40 DVK(KP980785280, +0.980785280403230449126182236134239036973933731);
Chris@42 41 DVK(KP668178637, +0.668178637919298919997757686523080761552472251);
Chris@42 42 DVK(KP198912367, +0.198912367379658006911597622644676228597850501);
Chris@42 43 DVK(KP923879532, +0.923879532511286756128183189396788286822416626);
Chris@42 44 DVK(KP707106781, +0.707106781186547524400844362104849039284835938);
Chris@42 45 DVK(KP414213562, +0.414213562373095048801688724209698078569671875);
Chris@42 46 {
Chris@42 47 INT m;
Chris@42 48 R *x;
Chris@42 49 x = ii;
Chris@42 50 for (m = mb, W = W + (mb * ((TWVL / VL) * 62)); m < me; m = m + VL, x = x + (VL * ms), W = W + (TWVL * 62), MAKE_VOLATILE_STRIDE(32, rs)) {
Chris@42 51 V T26, T25, T2a, T2i, T24, T2c, T2g, T2k, T2h, T27;
Chris@42 52 {
Chris@42 53 V T4, T1z, T2o, T32, T2r, T3f, Tf, T1A, T34, T2O, T1D, TC, T33, T2L, T1C;
Chris@42 54 V Tr, T2C, T3a, T2F, T3b, T1r, T21, T1k, T20, TQ, TM, TS, TL, T2t, TJ;
Chris@42 55 V T10, T2u;
Chris@42 56 {
Chris@42 57 V Tt, T9, T2p, Te, T2q, TA, Tu, Tx;
Chris@42 58 {
Chris@42 59 V T1, T1x, T2, T1v;
Chris@42 60 T1 = LD(&(x[0]), ms, &(x[0]));
Chris@42 61 T1x = LD(&(x[WS(rs, 24)]), ms, &(x[0]));
Chris@42 62 T2 = LD(&(x[WS(rs, 16)]), ms, &(x[0]));
Chris@42 63 T1v = LD(&(x[WS(rs, 8)]), ms, &(x[0]));
Chris@42 64 {
Chris@42 65 V T5, Tc, T7, Ta, T2m, T2n;
Chris@42 66 T5 = LD(&(x[WS(rs, 4)]), ms, &(x[0]));
Chris@42 67 Tc = LD(&(x[WS(rs, 12)]), ms, &(x[0]));
Chris@42 68 T7 = LD(&(x[WS(rs, 20)]), ms, &(x[0]));
Chris@42 69 Ta = LD(&(x[WS(rs, 28)]), ms, &(x[0]));
Chris@42 70 {
Chris@42 71 V T1y, T3, T1w, T6, Td, T8, Tb, Ts, Tz;
Chris@42 72 Ts = LD(&(x[WS(rs, 30)]), ms, &(x[0]));
Chris@42 73 T1y = BYTW(&(W[TWVL * 46]), T1x);
Chris@42 74 T3 = BYTW(&(W[TWVL * 30]), T2);
Chris@42 75 T1w = BYTW(&(W[TWVL * 14]), T1v);
Chris@42 76 T6 = BYTW(&(W[TWVL * 6]), T5);
Chris@42 77 Td = BYTW(&(W[TWVL * 22]), Tc);
Chris@42 78 T8 = BYTW(&(W[TWVL * 38]), T7);
Chris@42 79 Tb = BYTW(&(W[TWVL * 54]), Ta);
Chris@42 80 Tt = BYTW(&(W[TWVL * 58]), Ts);
Chris@42 81 Tz = LD(&(x[WS(rs, 6)]), ms, &(x[0]));
Chris@42 82 T4 = VSUB(T1, T3);
Chris@42 83 T2m = VADD(T1, T3);
Chris@42 84 T1z = VSUB(T1w, T1y);
Chris@42 85 T2n = VADD(T1w, T1y);
Chris@42 86 T9 = VSUB(T6, T8);
Chris@42 87 T2p = VADD(T6, T8);
Chris@42 88 Te = VSUB(Tb, Td);
Chris@42 89 T2q = VADD(Tb, Td);
Chris@42 90 TA = BYTW(&(W[TWVL * 10]), Tz);
Chris@42 91 }
Chris@42 92 Tu = LD(&(x[WS(rs, 14)]), ms, &(x[0]));
Chris@42 93 T2o = VADD(T2m, T2n);
Chris@42 94 T32 = VSUB(T2m, T2n);
Chris@42 95 Tx = LD(&(x[WS(rs, 22)]), ms, &(x[0]));
Chris@42 96 }
Chris@42 97 }
Chris@42 98 {
Chris@42 99 V Tv, To, Ty, Ti, Tj, Tm, Th;
Chris@42 100 Th = LD(&(x[WS(rs, 2)]), ms, &(x[0]));
Chris@42 101 T2r = VADD(T2p, T2q);
Chris@42 102 T3f = VSUB(T2p, T2q);
Chris@42 103 Tf = VADD(T9, Te);
Chris@42 104 T1A = VSUB(T9, Te);
Chris@42 105 Tv = BYTW(&(W[TWVL * 26]), Tu);
Chris@42 106 To = LD(&(x[WS(rs, 26)]), ms, &(x[0]));
Chris@42 107 Ty = BYTW(&(W[TWVL * 42]), Tx);
Chris@42 108 Ti = BYTW(&(W[TWVL * 2]), Th);
Chris@42 109 Tj = LD(&(x[WS(rs, 18)]), ms, &(x[0]));
Chris@42 110 Tm = LD(&(x[WS(rs, 10)]), ms, &(x[0]));
Chris@42 111 {
Chris@42 112 V T1f, T1h, T1a, T1c, T18, T2A, T2B, T1p;
Chris@42 113 {
Chris@42 114 V T15, T17, T1o, T1m;
Chris@42 115 {
Chris@42 116 V Tw, T2M, Tp, T2N, TB, Tk, Tn, T1n, T14, T16;
Chris@42 117 T14 = LD(&(x[WS(rs, 31)]), ms, &(x[WS(rs, 1)]));
Chris@42 118 T16 = LD(&(x[WS(rs, 15)]), ms, &(x[WS(rs, 1)]));
Chris@42 119 Tw = VSUB(Tt, Tv);
Chris@42 120 T2M = VADD(Tt, Tv);
Chris@42 121 Tp = BYTW(&(W[TWVL * 50]), To);
Chris@42 122 T2N = VADD(TA, Ty);
Chris@42 123 TB = VSUB(Ty, TA);
Chris@42 124 Tk = BYTW(&(W[TWVL * 34]), Tj);
Chris@42 125 Tn = BYTW(&(W[TWVL * 18]), Tm);
Chris@42 126 T15 = BYTW(&(W[TWVL * 60]), T14);
Chris@42 127 T17 = BYTW(&(W[TWVL * 28]), T16);
Chris@42 128 T1n = LD(&(x[WS(rs, 7)]), ms, &(x[WS(rs, 1)]));
Chris@42 129 {
Chris@42 130 V T2J, Tl, T2K, Tq, T1l;
Chris@42 131 T1l = LD(&(x[WS(rs, 23)]), ms, &(x[WS(rs, 1)]));
Chris@42 132 T34 = VSUB(T2M, T2N);
Chris@42 133 T2O = VADD(T2M, T2N);
Chris@42 134 T1D = VFMA(LDK(KP414213562), Tw, TB);
Chris@42 135 TC = VFNMS(LDK(KP414213562), TB, Tw);
Chris@42 136 T2J = VADD(Ti, Tk);
Chris@42 137 Tl = VSUB(Ti, Tk);
Chris@42 138 T2K = VADD(Tn, Tp);
Chris@42 139 Tq = VSUB(Tn, Tp);
Chris@42 140 T1o = BYTW(&(W[TWVL * 12]), T1n);
Chris@42 141 T1m = BYTW(&(W[TWVL * 44]), T1l);
Chris@42 142 {
Chris@42 143 V T1e, T1g, T19, T1b;
Chris@42 144 T1e = LD(&(x[WS(rs, 27)]), ms, &(x[WS(rs, 1)]));
Chris@42 145 T1g = LD(&(x[WS(rs, 11)]), ms, &(x[WS(rs, 1)]));
Chris@42 146 T19 = LD(&(x[WS(rs, 3)]), ms, &(x[WS(rs, 1)]));
Chris@42 147 T1b = LD(&(x[WS(rs, 19)]), ms, &(x[WS(rs, 1)]));
Chris@42 148 T33 = VSUB(T2J, T2K);
Chris@42 149 T2L = VADD(T2J, T2K);
Chris@42 150 T1C = VFMA(LDK(KP414213562), Tl, Tq);
Chris@42 151 Tr = VFNMS(LDK(KP414213562), Tq, Tl);
Chris@42 152 T1f = BYTW(&(W[TWVL * 52]), T1e);
Chris@42 153 T1h = BYTW(&(W[TWVL * 20]), T1g);
Chris@42 154 T1a = BYTW(&(W[TWVL * 4]), T19);
Chris@42 155 T1c = BYTW(&(W[TWVL * 36]), T1b);
Chris@42 156 }
Chris@42 157 }
Chris@42 158 }
Chris@42 159 T18 = VSUB(T15, T17);
Chris@42 160 T2A = VADD(T15, T17);
Chris@42 161 T2B = VADD(T1o, T1m);
Chris@42 162 T1p = VSUB(T1m, T1o);
Chris@42 163 }
Chris@42 164 {
Chris@42 165 V TG, TI, TZ, TX;
Chris@42 166 {
Chris@42 167 V T1i, T2E, T1d, T2D, TH, TY, TF;
Chris@42 168 TF = LD(&(x[WS(rs, 1)]), ms, &(x[WS(rs, 1)]));
Chris@42 169 T1i = VSUB(T1f, T1h);
Chris@42 170 T2E = VADD(T1f, T1h);
Chris@42 171 T1d = VSUB(T1a, T1c);
Chris@42 172 T2D = VADD(T1a, T1c);
Chris@42 173 TH = LD(&(x[WS(rs, 17)]), ms, &(x[WS(rs, 1)]));
Chris@42 174 TY = LD(&(x[WS(rs, 25)]), ms, &(x[WS(rs, 1)]));
Chris@42 175 T2C = VADD(T2A, T2B);
Chris@42 176 T3a = VSUB(T2A, T2B);
Chris@42 177 TG = BYTW(&(W[0]), TF);
Chris@42 178 {
Chris@42 179 V TW, T1j, T1q, TP, TR, TK;
Chris@42 180 TW = LD(&(x[WS(rs, 9)]), ms, &(x[WS(rs, 1)]));
Chris@42 181 T2F = VADD(T2D, T2E);
Chris@42 182 T3b = VSUB(T2E, T2D);
Chris@42 183 T1j = VADD(T1d, T1i);
Chris@42 184 T1q = VSUB(T1i, T1d);
Chris@42 185 TI = BYTW(&(W[TWVL * 32]), TH);
Chris@42 186 TZ = BYTW(&(W[TWVL * 48]), TY);
Chris@42 187 TP = LD(&(x[WS(rs, 29)]), ms, &(x[WS(rs, 1)]));
Chris@42 188 TX = BYTW(&(W[TWVL * 16]), TW);
Chris@42 189 TR = LD(&(x[WS(rs, 13)]), ms, &(x[WS(rs, 1)]));
Chris@42 190 TK = LD(&(x[WS(rs, 5)]), ms, &(x[WS(rs, 1)]));
Chris@42 191 T1r = VFMA(LDK(KP707106781), T1q, T1p);
Chris@42 192 T21 = VFNMS(LDK(KP707106781), T1q, T1p);
Chris@42 193 T1k = VFMA(LDK(KP707106781), T1j, T18);
Chris@42 194 T20 = VFNMS(LDK(KP707106781), T1j, T18);
Chris@42 195 TQ = BYTW(&(W[TWVL * 56]), TP);
Chris@42 196 TM = LD(&(x[WS(rs, 21)]), ms, &(x[WS(rs, 1)]));
Chris@42 197 TS = BYTW(&(W[TWVL * 24]), TR);
Chris@42 198 TL = BYTW(&(W[TWVL * 8]), TK);
Chris@42 199 }
Chris@42 200 }
Chris@42 201 T2t = VADD(TG, TI);
Chris@42 202 TJ = VSUB(TG, TI);
Chris@42 203 T10 = VSUB(TX, TZ);
Chris@42 204 T2u = VADD(TX, TZ);
Chris@42 205 }
Chris@42 206 }
Chris@42 207 }
Chris@42 208 }
Chris@42 209 {
Chris@42 210 V T2s, TT, T2x, T2P, T2Y, T2G, T37, T2v, T2w, TO, T2W, T30, T2U, TN, T2V;
Chris@42 211 T2s = VSUB(T2o, T2r);
Chris@42 212 T2U = VADD(T2o, T2r);
Chris@42 213 TN = BYTW(&(W[TWVL * 40]), TM);
Chris@42 214 TT = VSUB(TQ, TS);
Chris@42 215 T2x = VADD(TQ, TS);
Chris@42 216 T2P = VSUB(T2L, T2O);
Chris@42 217 T2V = VADD(T2L, T2O);
Chris@42 218 T2Y = VADD(T2C, T2F);
Chris@42 219 T2G = VSUB(T2C, T2F);
Chris@42 220 T37 = VSUB(T2t, T2u);
Chris@42 221 T2v = VADD(T2t, T2u);
Chris@42 222 T2w = VADD(TL, TN);
Chris@42 223 TO = VSUB(TL, TN);
Chris@42 224 T2W = VSUB(T2U, T2V);
Chris@42 225 T30 = VADD(T2U, T2V);
Chris@42 226 {
Chris@42 227 V T1Y, T12, T1X, TV, T3n, T3t, T3m, T3q;
Chris@42 228 {
Chris@42 229 V T3o, T36, T3r, T3h, T3k, T3p, T3d, T3s, T2H, T2Q, T2Z, T31;
Chris@42 230 {
Chris@42 231 V T35, T3g, T38, T2y, T11, TU, T3c, T3j;
Chris@42 232 T35 = VADD(T33, T34);
Chris@42 233 T3g = VSUB(T33, T34);
Chris@42 234 T38 = VSUB(T2w, T2x);
Chris@42 235 T2y = VADD(T2w, T2x);
Chris@42 236 T11 = VSUB(TO, TT);
Chris@42 237 TU = VADD(TO, TT);
Chris@42 238 T3c = VFNMS(LDK(KP414213562), T3b, T3a);
Chris@42 239 T3j = VFMA(LDK(KP414213562), T3a, T3b);
Chris@42 240 T3o = VFNMS(LDK(KP707106781), T35, T32);
Chris@42 241 T36 = VFMA(LDK(KP707106781), T35, T32);
Chris@42 242 T3r = VFNMS(LDK(KP707106781), T3g, T3f);
Chris@42 243 T3h = VFMA(LDK(KP707106781), T3g, T3f);
Chris@42 244 {
Chris@42 245 V T3i, T39, T2z, T2X;
Chris@42 246 T3i = VFMA(LDK(KP414213562), T37, T38);
Chris@42 247 T39 = VFNMS(LDK(KP414213562), T38, T37);
Chris@42 248 T2z = VSUB(T2v, T2y);
Chris@42 249 T2X = VADD(T2v, T2y);
Chris@42 250 T1Y = VFNMS(LDK(KP707106781), T11, T10);
Chris@42 251 T12 = VFMA(LDK(KP707106781), T11, T10);
Chris@42 252 T1X = VFNMS(LDK(KP707106781), TU, TJ);
Chris@42 253 TV = VFMA(LDK(KP707106781), TU, TJ);
Chris@42 254 T3k = VSUB(T3i, T3j);
Chris@42 255 T3p = VADD(T3i, T3j);
Chris@42 256 T3d = VADD(T39, T3c);
Chris@42 257 T3s = VSUB(T39, T3c);
Chris@42 258 T2H = VADD(T2z, T2G);
Chris@42 259 T2Q = VSUB(T2z, T2G);
Chris@42 260 T2Z = VSUB(T2X, T2Y);
Chris@42 261 T31 = VADD(T2X, T2Y);
Chris@42 262 }
Chris@42 263 }
Chris@42 264 {
Chris@42 265 V T3v, T3u, T3l, T3e;
Chris@42 266 T3l = VFNMS(LDK(KP923879532), T3k, T3h);
Chris@42 267 T3n = VFMA(LDK(KP923879532), T3k, T3h);
Chris@42 268 T3t = VFMA(LDK(KP923879532), T3s, T3r);
Chris@42 269 T3v = VFNMS(LDK(KP923879532), T3s, T3r);
Chris@42 270 T3e = VFNMS(LDK(KP923879532), T3d, T36);
Chris@42 271 T3m = VFMA(LDK(KP923879532), T3d, T36);
Chris@42 272 {
Chris@42 273 V T2R, T2T, T2I, T2S;
Chris@42 274 T2R = VFNMS(LDK(KP707106781), T2Q, T2P);
Chris@42 275 T2T = VFMA(LDK(KP707106781), T2Q, T2P);
Chris@42 276 T2I = VFNMS(LDK(KP707106781), T2H, T2s);
Chris@42 277 T2S = VFMA(LDK(KP707106781), T2H, T2s);
Chris@42 278 ST(&(x[WS(rs, 16)]), VSUB(T30, T31), ms, &(x[0]));
Chris@42 279 ST(&(x[0]), VADD(T30, T31), ms, &(x[0]));
Chris@42 280 ST(&(x[WS(rs, 8)]), VFMAI(T2Z, T2W), ms, &(x[0]));
Chris@42 281 ST(&(x[WS(rs, 24)]), VFNMSI(T2Z, T2W), ms, &(x[0]));
Chris@42 282 T3q = VFNMS(LDK(KP923879532), T3p, T3o);
Chris@42 283 T3u = VFMA(LDK(KP923879532), T3p, T3o);
Chris@42 284 ST(&(x[WS(rs, 18)]), VFMAI(T3l, T3e), ms, &(x[0]));
Chris@42 285 ST(&(x[WS(rs, 14)]), VFNMSI(T3l, T3e), ms, &(x[0]));
Chris@42 286 ST(&(x[WS(rs, 28)]), VFNMSI(T2T, T2S), ms, &(x[0]));
Chris@42 287 ST(&(x[WS(rs, 4)]), VFMAI(T2T, T2S), ms, &(x[0]));
Chris@42 288 ST(&(x[WS(rs, 20)]), VFMAI(T2R, T2I), ms, &(x[0]));
Chris@42 289 ST(&(x[WS(rs, 12)]), VFNMSI(T2R, T2I), ms, &(x[0]));
Chris@42 290 }
Chris@42 291 ST(&(x[WS(rs, 26)]), VFMAI(T3v, T3u), ms, &(x[0]));
Chris@42 292 ST(&(x[WS(rs, 6)]), VFNMSI(T3v, T3u), ms, &(x[0]));
Chris@42 293 }
Chris@42 294 }
Chris@42 295 {
Chris@42 296 V T1U, T13, T1s, TE, T1M, T1I, T1N, T1B, T1V, T1E;
Chris@42 297 {
Chris@42 298 V Tg, TD, T1G, T1H;
Chris@42 299 Tg = VFMA(LDK(KP707106781), Tf, T4);
Chris@42 300 T1U = VFNMS(LDK(KP707106781), Tf, T4);
Chris@42 301 T26 = VSUB(Tr, TC);
Chris@42 302 TD = VADD(Tr, TC);
Chris@42 303 T1G = VFMA(LDK(KP198912367), TV, T12);
Chris@42 304 T13 = VFNMS(LDK(KP198912367), T12, TV);
Chris@42 305 T1s = VFNMS(LDK(KP198912367), T1r, T1k);
Chris@42 306 T1H = VFMA(LDK(KP198912367), T1k, T1r);
Chris@42 307 ST(&(x[WS(rs, 2)]), VFMAI(T3n, T3m), ms, &(x[0]));
Chris@42 308 ST(&(x[WS(rs, 30)]), VFNMSI(T3n, T3m), ms, &(x[0]));
Chris@42 309 ST(&(x[WS(rs, 22)]), VFNMSI(T3t, T3q), ms, &(x[0]));
Chris@42 310 ST(&(x[WS(rs, 10)]), VFMAI(T3t, T3q), ms, &(x[0]));
Chris@42 311 TE = VFMA(LDK(KP923879532), TD, Tg);
Chris@42 312 T1M = VFNMS(LDK(KP923879532), TD, Tg);
Chris@42 313 T1I = VSUB(T1G, T1H);
Chris@42 314 T1N = VADD(T1G, T1H);
Chris@42 315 T1B = VFMA(LDK(KP707106781), T1A, T1z);
Chris@42 316 T25 = VFNMS(LDK(KP707106781), T1A, T1z);
Chris@42 317 T1V = VADD(T1C, T1D);
Chris@42 318 T1E = VSUB(T1C, T1D);
Chris@42 319 }
Chris@42 320 {
Chris@42 321 V T1W, T2e, T2f, T23;
Chris@42 322 {
Chris@42 323 V T28, T1Z, T1S, T1O, T1t, T1Q, T1F, T1P, T22, T29;
Chris@42 324 T28 = VFNMS(LDK(KP668178637), T1X, T1Y);
Chris@42 325 T1Z = VFMA(LDK(KP668178637), T1Y, T1X);
Chris@42 326 T1S = VFMA(LDK(KP980785280), T1N, T1M);
Chris@42 327 T1O = VFNMS(LDK(KP980785280), T1N, T1M);
Chris@42 328 T1t = VADD(T13, T1s);
Chris@42 329 T1Q = VSUB(T13, T1s);
Chris@42 330 T1F = VFMA(LDK(KP923879532), T1E, T1B);
Chris@42 331 T1P = VFNMS(LDK(KP923879532), T1E, T1B);
Chris@42 332 T1W = VFMA(LDK(KP923879532), T1V, T1U);
Chris@42 333 T2e = VFNMS(LDK(KP923879532), T1V, T1U);
Chris@42 334 T22 = VFMA(LDK(KP668178637), T21, T20);
Chris@42 335 T29 = VFNMS(LDK(KP668178637), T20, T21);
Chris@42 336 {
Chris@42 337 V T1K, T1u, T1R, T1T, T1L, T1J;
Chris@42 338 T1K = VFMA(LDK(KP980785280), T1t, TE);
Chris@42 339 T1u = VFNMS(LDK(KP980785280), T1t, TE);
Chris@42 340 T1R = VFMA(LDK(KP980785280), T1Q, T1P);
Chris@42 341 T1T = VFNMS(LDK(KP980785280), T1Q, T1P);
Chris@42 342 T1L = VFMA(LDK(KP980785280), T1I, T1F);
Chris@42 343 T1J = VFNMS(LDK(KP980785280), T1I, T1F);
Chris@42 344 T2f = VADD(T28, T29);
Chris@42 345 T2a = VSUB(T28, T29);
Chris@42 346 T23 = VADD(T1Z, T22);
Chris@42 347 T2i = VSUB(T1Z, T22);
Chris@42 348 ST(&(x[WS(rs, 23)]), VFNMSI(T1R, T1O), ms, &(x[WS(rs, 1)]));
Chris@42 349 ST(&(x[WS(rs, 9)]), VFMAI(T1R, T1O), ms, &(x[WS(rs, 1)]));
Chris@42 350 ST(&(x[WS(rs, 25)]), VFMAI(T1T, T1S), ms, &(x[WS(rs, 1)]));
Chris@42 351 ST(&(x[WS(rs, 7)]), VFNMSI(T1T, T1S), ms, &(x[WS(rs, 1)]));
Chris@42 352 ST(&(x[WS(rs, 1)]), VFMAI(T1L, T1K), ms, &(x[WS(rs, 1)]));
Chris@42 353 ST(&(x[WS(rs, 31)]), VFNMSI(T1L, T1K), ms, &(x[WS(rs, 1)]));
Chris@42 354 ST(&(x[WS(rs, 17)]), VFMAI(T1J, T1u), ms, &(x[WS(rs, 1)]));
Chris@42 355 ST(&(x[WS(rs, 15)]), VFNMSI(T1J, T1u), ms, &(x[WS(rs, 1)]));
Chris@42 356 }
Chris@42 357 }
Chris@42 358 T24 = VFNMS(LDK(KP831469612), T23, T1W);
Chris@42 359 T2c = VFMA(LDK(KP831469612), T23, T1W);
Chris@42 360 T2g = VFMA(LDK(KP831469612), T2f, T2e);
Chris@42 361 T2k = VFNMS(LDK(KP831469612), T2f, T2e);
Chris@42 362 }
Chris@42 363 }
Chris@42 364 }
Chris@42 365 }
Chris@42 366 }
Chris@42 367 T2h = VFMA(LDK(KP923879532), T26, T25);
Chris@42 368 T27 = VFNMS(LDK(KP923879532), T26, T25);
Chris@42 369 {
Chris@42 370 V T2j, T2l, T2d, T2b;
Chris@42 371 T2j = VFNMS(LDK(KP831469612), T2i, T2h);
Chris@42 372 T2l = VFMA(LDK(KP831469612), T2i, T2h);
Chris@42 373 T2d = VFMA(LDK(KP831469612), T2a, T27);
Chris@42 374 T2b = VFNMS(LDK(KP831469612), T2a, T27);
Chris@42 375 ST(&(x[WS(rs, 21)]), VFMAI(T2j, T2g), ms, &(x[WS(rs, 1)]));
Chris@42 376 ST(&(x[WS(rs, 11)]), VFNMSI(T2j, T2g), ms, &(x[WS(rs, 1)]));
Chris@42 377 ST(&(x[WS(rs, 27)]), VFNMSI(T2l, T2k), ms, &(x[WS(rs, 1)]));
Chris@42 378 ST(&(x[WS(rs, 5)]), VFMAI(T2l, T2k), ms, &(x[WS(rs, 1)]));
Chris@42 379 ST(&(x[WS(rs, 29)]), VFMAI(T2d, T2c), ms, &(x[WS(rs, 1)]));
Chris@42 380 ST(&(x[WS(rs, 3)]), VFNMSI(T2d, T2c), ms, &(x[WS(rs, 1)]));
Chris@42 381 ST(&(x[WS(rs, 13)]), VFMAI(T2b, T24), ms, &(x[WS(rs, 1)]));
Chris@42 382 ST(&(x[WS(rs, 19)]), VFNMSI(T2b, T24), ms, &(x[WS(rs, 1)]));
Chris@42 383 }
Chris@42 384 }
Chris@42 385 }
Chris@42 386 VLEAVE();
Chris@42 387 }
Chris@42 388
/* Twiddle-instruction table (FMA variant): one VTW(0, k) entry per
 * twiddle factor w^k for k = 1..31, terminated by TW_NEXT.  Entry order
 * matches the W[] offsets used by t2bv_32 above -- do not reorder. */
Chris@42 389 static const tw_instr twinstr[] = {
Chris@42 390 VTW(0, 1),
Chris@42 391 VTW(0, 2),
Chris@42 392 VTW(0, 3),
Chris@42 393 VTW(0, 4),
Chris@42 394 VTW(0, 5),
Chris@42 395 VTW(0, 6),
Chris@42 396 VTW(0, 7),
Chris@42 397 VTW(0, 8),
Chris@42 398 VTW(0, 9),
Chris@42 399 VTW(0, 10),
Chris@42 400 VTW(0, 11),
Chris@42 401 VTW(0, 12),
Chris@42 402 VTW(0, 13),
Chris@42 403 VTW(0, 14),
Chris@42 404 VTW(0, 15),
Chris@42 405 VTW(0, 16),
Chris@42 406 VTW(0, 17),
Chris@42 407 VTW(0, 18),
Chris@42 408 VTW(0, 19),
Chris@42 409 VTW(0, 20),
Chris@42 410 VTW(0, 21),
Chris@42 411 VTW(0, 22),
Chris@42 412 VTW(0, 23),
Chris@42 413 VTW(0, 24),
Chris@42 414 VTW(0, 25),
Chris@42 415 VTW(0, 26),
Chris@42 416 VTW(0, 27),
Chris@42 417 VTW(0, 28),
Chris@42 418 VTW(0, 29),
Chris@42 419 VTW(0, 30),
Chris@42 420 VTW(0, 31),
Chris@42 421 {TW_NEXT, VL, 0}
Chris@42 422 };
Chris@42 423
/* Codelet descriptor: n=32, name, twiddle table, genus, op counts
 * {adds, muls, fmas, other} matching the generator banner above. */
Chris@42 424 static const ct_desc desc = { 32, XSIMD_STRING("t2bv_32"), twinstr, &GENUS, {119, 62, 98, 0}, 0, 0, 0 };
Chris@42 425
/* Register this codelet (FMA variant) with the FFTW planner as a
 * DIT (decimation-in-time) twiddle kernel. */
Chris@42 426 void XSIMD(codelet_t2bv_32) (planner *p) {
Chris@42 427 X(kdft_dit_register) (p, t2bv_32, &desc);
Chris@42 428 }
Chris@42 429 #else /* HAVE_FMA */
Chris@42 430
Chris@42 431 /* Generated by: ../../../genfft/gen_twiddle_c.native -simd -compact -variables 4 -pipeline-latency 8 -n 32 -name t2bv_32 -include t2b.h -sign 1 */
Chris@42 432
Chris@42 433 /*
Chris@42 434 * This function contains 217 FP additions, 104 FP multiplications,
Chris@42 435 * (or, 201 additions, 88 multiplications, 16 fused multiply/add),
Chris@42 436 * 59 stack variables, 7 constants, and 64 memory accesses
Chris@42 437 */
Chris@42 438 #include "t2b.h"
Chris@42 439
/*
 * 32-point complex DIT twiddle codelet, backward sign, SIMD, non-FMA
 * variant (plain add/mul scheduling).  Machine-generated by genfft --
 * do not hand-edit the arithmetic.  Same calling contract as the FMA
 * variant: processes transforms [mb, me) in steps of VL, working in
 * place through x = ii with twiddle table W.
 */
Chris@42 440 static void t2bv_32(R *ri, R *ii, const R *W, stride rs, INT mb, INT me, INT ms)
Chris@42 441 {
/* cos/sin twiddle constants for the radix decomposition. */
Chris@42 442 DVK(KP195090322, +0.195090322016128267848284868477022240927691618);
Chris@42 443 DVK(KP980785280, +0.980785280403230449126182236134239036973933731);
Chris@42 444 DVK(KP555570233, +0.555570233019602224742830813948532874374937191);
Chris@42 445 DVK(KP831469612, +0.831469612302545237078788377617905756738560812);
Chris@42 446 DVK(KP382683432, +0.382683432365089771728459984030398866761344562);
Chris@42 447 DVK(KP923879532, +0.923879532511286756128183189396788286822416626);
Chris@42 448 DVK(KP707106781, +0.707106781186547524400844362104849039284835938);
Chris@42 449 {
Chris@42 450 INT m;
Chris@42 451 R *x;
Chris@42 452 x = ii;
/* One iteration transforms VL interleaved 32-point FFTs in place. */
Chris@42 453 for (m = mb, W = W + (mb * ((TWVL / VL) * 62)); m < me; m = m + VL, x = x + (VL * ms), W = W + (TWVL * 62), MAKE_VOLATILE_STRIDE(32, rs)) {
Chris@42 454 V T4, T1D, T2P, T3h, Tf, T1y, T2K, T3i, TC, T1w, T2G, T3e, Tr, T1v, T2D;
Chris@42 455 V T3d, T1k, T20, T2y, T3a, T1r, T21, T2v, T39, TV, T1X, T2r, T37, T12, T1Y;
Chris@42 456 V T2o, T36;
/* Load + twiddle the inputs in groups of four, forming the radix-4
 * sub-butterflies (sums feed the even outputs, differences the odd). */
Chris@42 457 {
Chris@42 458 V T1, T1C, T3, T1A, T1B, T2, T1z, T2N, T2O;
Chris@42 459 T1 = LD(&(x[0]), ms, &(x[0]));
Chris@42 460 T1B = LD(&(x[WS(rs, 24)]), ms, &(x[0]));
Chris@42 461 T1C = BYTW(&(W[TWVL * 46]), T1B);
Chris@42 462 T2 = LD(&(x[WS(rs, 16)]), ms, &(x[0]));
Chris@42 463 T3 = BYTW(&(W[TWVL * 30]), T2);
Chris@42 464 T1z = LD(&(x[WS(rs, 8)]), ms, &(x[0]));
Chris@42 465 T1A = BYTW(&(W[TWVL * 14]), T1z);
Chris@42 466 T4 = VSUB(T1, T3);
Chris@42 467 T1D = VSUB(T1A, T1C);
Chris@42 468 T2N = VADD(T1, T3);
Chris@42 469 T2O = VADD(T1A, T1C);
Chris@42 470 T2P = VSUB(T2N, T2O);
Chris@42 471 T3h = VADD(T2N, T2O);
Chris@42 472 }
Chris@42 473 {
Chris@42 474 V T6, Td, T8, Tb;
Chris@42 475 {
Chris@42 476 V T5, Tc, T7, Ta;
Chris@42 477 T5 = LD(&(x[WS(rs, 4)]), ms, &(x[0]));
Chris@42 478 T6 = BYTW(&(W[TWVL * 6]), T5);
Chris@42 479 Tc = LD(&(x[WS(rs, 12)]), ms, &(x[0]));
Chris@42 480 Td = BYTW(&(W[TWVL * 22]), Tc);
Chris@42 481 T7 = LD(&(x[WS(rs, 20)]), ms, &(x[0]));
Chris@42 482 T8 = BYTW(&(W[TWVL * 38]), T7);
Chris@42 483 Ta = LD(&(x[WS(rs, 28)]), ms, &(x[0]));
Chris@42 484 Tb = BYTW(&(W[TWVL * 54]), Ta);
Chris@42 485 }
Chris@42 486 {
Chris@42 487 V T9, Te, T2I, T2J;
Chris@42 488 T9 = VSUB(T6, T8);
Chris@42 489 Te = VSUB(Tb, Td);
Chris@42 490 Tf = VMUL(LDK(KP707106781), VADD(T9, Te));
Chris@42 491 T1y = VMUL(LDK(KP707106781), VSUB(T9, Te));
Chris@42 492 T2I = VADD(T6, T8);
Chris@42 493 T2J = VADD(Tb, Td);
Chris@42 494 T2K = VSUB(T2I, T2J);
Chris@42 495 T3i = VADD(T2I, T2J);
Chris@42 496 }
Chris@42 497 }
Chris@42 498 {
Chris@42 499 V Tt, TA, Tv, Ty;
Chris@42 500 {
Chris@42 501 V Ts, Tz, Tu, Tx;
Chris@42 502 Ts = LD(&(x[WS(rs, 6)]), ms, &(x[0]));
Chris@42 503 Tt = BYTW(&(W[TWVL * 10]), Ts);
Chris@42 504 Tz = LD(&(x[WS(rs, 14)]), ms, &(x[0]));
Chris@42 505 TA = BYTW(&(W[TWVL * 26]), Tz);
Chris@42 506 Tu = LD(&(x[WS(rs, 22)]), ms, &(x[0]));
Chris@42 507 Tv = BYTW(&(W[TWVL * 42]), Tu);
Chris@42 508 Tx = LD(&(x[WS(rs, 30)]), ms, &(x[0]));
Chris@42 509 Ty = BYTW(&(W[TWVL * 58]), Tx);
Chris@42 510 }
Chris@42 511 {
Chris@42 512 V Tw, TB, T2E, T2F;
Chris@42 513 Tw = VSUB(Tt, Tv);
Chris@42 514 TB = VSUB(Ty, TA);
Chris@42 515 TC = VFNMS(LDK(KP382683432), TB, VMUL(LDK(KP923879532), Tw));
Chris@42 516 T1w = VFMA(LDK(KP923879532), TB, VMUL(LDK(KP382683432), Tw));
Chris@42 517 T2E = VADD(Ty, TA);
Chris@42 518 T2F = VADD(Tt, Tv);
Chris@42 519 T2G = VSUB(T2E, T2F);
Chris@42 520 T3e = VADD(T2E, T2F);
Chris@42 521 }
Chris@42 522 }
Chris@42 523 {
Chris@42 524 V Ti, Tp, Tk, Tn;
Chris@42 525 {
Chris@42 526 V Th, To, Tj, Tm;
Chris@42 527 Th = LD(&(x[WS(rs, 2)]), ms, &(x[0]));
Chris@42 528 Ti = BYTW(&(W[TWVL * 2]), Th);
Chris@42 529 To = LD(&(x[WS(rs, 26)]), ms, &(x[0]));
Chris@42 530 Tp = BYTW(&(W[TWVL * 50]), To);
Chris@42 531 Tj = LD(&(x[WS(rs, 18)]), ms, &(x[0]));
Chris@42 532 Tk = BYTW(&(W[TWVL * 34]), Tj);
Chris@42 533 Tm = LD(&(x[WS(rs, 10)]), ms, &(x[0]));
Chris@42 534 Tn = BYTW(&(W[TWVL * 18]), Tm);
Chris@42 535 }
Chris@42 536 {
Chris@42 537 V Tl, Tq, T2B, T2C;
Chris@42 538 Tl = VSUB(Ti, Tk);
Chris@42 539 Tq = VSUB(Tn, Tp);
Chris@42 540 Tr = VFMA(LDK(KP382683432), Tl, VMUL(LDK(KP923879532), Tq));
Chris@42 541 T1v = VFNMS(LDK(KP382683432), Tq, VMUL(LDK(KP923879532), Tl));
Chris@42 542 T2B = VADD(Ti, Tk);
Chris@42 543 T2C = VADD(Tn, Tp);
Chris@42 544 T2D = VSUB(T2B, T2C);
Chris@42 545 T3d = VADD(T2B, T2C);
Chris@42 546 }
Chris@42 547 }
Chris@42 548 {
Chris@42 549 V T1g, T1i, T1o, T1m, T1a, T1c, T1d, T15, T17, T18;
Chris@42 550 {
Chris@42 551 V T1f, T1h, T1n, T1l;
Chris@42 552 T1f = LD(&(x[WS(rs, 7)]), ms, &(x[WS(rs, 1)]));
Chris@42 553 T1g = BYTW(&(W[TWVL * 12]), T1f);
Chris@42 554 T1h = LD(&(x[WS(rs, 23)]), ms, &(x[WS(rs, 1)]));
Chris@42 555 T1i = BYTW(&(W[TWVL * 44]), T1h);
Chris@42 556 T1n = LD(&(x[WS(rs, 15)]), ms, &(x[WS(rs, 1)]));
Chris@42 557 T1o = BYTW(&(W[TWVL * 28]), T1n);
Chris@42 558 T1l = LD(&(x[WS(rs, 31)]), ms, &(x[WS(rs, 1)]));
Chris@42 559 T1m = BYTW(&(W[TWVL * 60]), T1l);
Chris@42 560 {
Chris@42 561 V T19, T1b, T14, T16;
Chris@42 562 T19 = LD(&(x[WS(rs, 27)]), ms, &(x[WS(rs, 1)]));
Chris@42 563 T1a = BYTW(&(W[TWVL * 52]), T19);
Chris@42 564 T1b = LD(&(x[WS(rs, 11)]), ms, &(x[WS(rs, 1)]));
Chris@42 565 T1c = BYTW(&(W[TWVL * 20]), T1b);
Chris@42 566 T1d = VSUB(T1a, T1c);
Chris@42 567 T14 = LD(&(x[WS(rs, 3)]), ms, &(x[WS(rs, 1)]));
Chris@42 568 T15 = BYTW(&(W[TWVL * 4]), T14);
Chris@42 569 T16 = LD(&(x[WS(rs, 19)]), ms, &(x[WS(rs, 1)]));
Chris@42 570 T17 = BYTW(&(W[TWVL * 36]), T16);
Chris@42 571 T18 = VSUB(T15, T17);
Chris@42 572 }
Chris@42 573 }
Chris@42 574 {
Chris@42 575 V T1e, T1j, T2w, T2x;
Chris@42 576 T1e = VMUL(LDK(KP707106781), VSUB(T18, T1d));
Chris@42 577 T1j = VSUB(T1g, T1i);
Chris@42 578 T1k = VSUB(T1e, T1j);
Chris@42 579 T20 = VADD(T1j, T1e);
Chris@42 580 T2w = VADD(T15, T17);
Chris@42 581 T2x = VADD(T1a, T1c);
Chris@42 582 T2y = VSUB(T2w, T2x);
Chris@42 583 T3a = VADD(T2w, T2x);
Chris@42 584 }
Chris@42 585 {
Chris@42 586 V T1p, T1q, T2t, T2u;
Chris@42 587 T1p = VSUB(T1m, T1o);
Chris@42 588 T1q = VMUL(LDK(KP707106781), VADD(T18, T1d));
Chris@42 589 T1r = VSUB(T1p, T1q);
Chris@42 590 T21 = VADD(T1p, T1q);
Chris@42 591 T2t = VADD(T1m, T1o);
Chris@42 592 T2u = VADD(T1g, T1i);
Chris@42 593 T2v = VSUB(T2t, T2u);
Chris@42 594 T39 = VADD(T2t, T2u);
Chris@42 595 }
Chris@42 596 }
Chris@42 597 {
Chris@42 598 V TR, TT, TZ, TX, TL, TN, TO, TG, TI, TJ;
Chris@42 599 {
Chris@42 600 V TQ, TS, TY, TW;
Chris@42 601 TQ = LD(&(x[WS(rs, 9)]), ms, &(x[WS(rs, 1)]));
Chris@42 602 TR = BYTW(&(W[TWVL * 16]), TQ);
Chris@42 603 TS = LD(&(x[WS(rs, 25)]), ms, &(x[WS(rs, 1)]));
Chris@42 604 TT = BYTW(&(W[TWVL * 48]), TS);
Chris@42 605 TY = LD(&(x[WS(rs, 17)]), ms, &(x[WS(rs, 1)]));
Chris@42 606 TZ = BYTW(&(W[TWVL * 32]), TY);
Chris@42 607 TW = LD(&(x[WS(rs, 1)]), ms, &(x[WS(rs, 1)]));
Chris@42 608 TX = BYTW(&(W[0]), TW);
Chris@42 609 {
Chris@42 610 V TK, TM, TF, TH;
Chris@42 611 TK = LD(&(x[WS(rs, 29)]), ms, &(x[WS(rs, 1)]));
Chris@42 612 TL = BYTW(&(W[TWVL * 56]), TK);
Chris@42 613 TM = LD(&(x[WS(rs, 13)]), ms, &(x[WS(rs, 1)]));
Chris@42 614 TN = BYTW(&(W[TWVL * 24]), TM);
Chris@42 615 TO = VSUB(TL, TN);
Chris@42 616 TF = LD(&(x[WS(rs, 5)]), ms, &(x[WS(rs, 1)]));
Chris@42 617 TG = BYTW(&(W[TWVL * 8]), TF);
Chris@42 618 TH = LD(&(x[WS(rs, 21)]), ms, &(x[WS(rs, 1)]));
Chris@42 619 TI = BYTW(&(W[TWVL * 40]), TH);
Chris@42 620 TJ = VSUB(TG, TI);
Chris@42 621 }
Chris@42 622 }
Chris@42 623 {
Chris@42 624 V TP, TU, T2p, T2q;
Chris@42 625 TP = VMUL(LDK(KP707106781), VSUB(TJ, TO));
Chris@42 626 TU = VSUB(TR, TT);
Chris@42 627 TV = VSUB(TP, TU);
Chris@42 628 T1X = VADD(TU, TP);
Chris@42 629 T2p = VADD(TG, TI);
Chris@42 630 T2q = VADD(TL, TN);
Chris@42 631 T2r = VSUB(T2p, T2q);
Chris@42 632 T37 = VADD(T2p, T2q);
Chris@42 633 }
Chris@42 634 {
Chris@42 635 V T10, T11, T2m, T2n;
Chris@42 636 T10 = VSUB(TX, TZ);
Chris@42 637 T11 = VMUL(LDK(KP707106781), VADD(TJ, TO));
Chris@42 638 T12 = VSUB(T10, T11);
Chris@42 639 T1Y = VADD(T10, T11);
Chris@42 640 T2m = VADD(TX, TZ);
Chris@42 641 T2n = VADD(TR, TT);
Chris@42 642 T2o = VSUB(T2m, T2n);
Chris@42 643 T36 = VADD(T2m, T2n);
Chris@42 644 }
Chris@42 645 }
/* Combine the sub-transform sums/differences and store the 32 outputs;
 * VBYI multiplies by i, realizing the +/- i rotations of the butterflies. */
Chris@42 646 {
Chris@42 647 V T3q, T3u, T3t, T3v;
Chris@42 648 {
Chris@42 649 V T3o, T3p, T3r, T3s;
Chris@42 650 T3o = VADD(T3h, T3i);
Chris@42 651 T3p = VADD(T3d, T3e);
Chris@42 652 T3q = VSUB(T3o, T3p);
Chris@42 653 T3u = VADD(T3o, T3p);
Chris@42 654 T3r = VADD(T36, T37);
Chris@42 655 T3s = VADD(T39, T3a);
Chris@42 656 T3t = VBYI(VSUB(T3r, T3s));
Chris@42 657 T3v = VADD(T3r, T3s);
Chris@42 658 }
Chris@42 659 ST(&(x[WS(rs, 24)]), VSUB(T3q, T3t), ms, &(x[0]));
Chris@42 660 ST(&(x[0]), VADD(T3u, T3v), ms, &(x[0]));
Chris@42 661 ST(&(x[WS(rs, 8)]), VADD(T3q, T3t), ms, &(x[0]));
Chris@42 662 ST(&(x[WS(rs, 16)]), VSUB(T3u, T3v), ms, &(x[0]));
Chris@42 663 }
Chris@42 664 {
Chris@42 665 V T3f, T3j, T3c, T3k, T38, T3b;
Chris@42 666 T3f = VSUB(T3d, T3e);
Chris@42 667 T3j = VSUB(T3h, T3i);
Chris@42 668 T38 = VSUB(T36, T37);
Chris@42 669 T3b = VSUB(T39, T3a);
Chris@42 670 T3c = VMUL(LDK(KP707106781), VSUB(T38, T3b));
Chris@42 671 T3k = VMUL(LDK(KP707106781), VADD(T38, T3b));
Chris@42 672 {
Chris@42 673 V T3g, T3l, T3m, T3n;
Chris@42 674 T3g = VBYI(VSUB(T3c, T3f));
Chris@42 675 T3l = VSUB(T3j, T3k);
Chris@42 676 ST(&(x[WS(rs, 12)]), VADD(T3g, T3l), ms, &(x[0]));
Chris@42 677 ST(&(x[WS(rs, 20)]), VSUB(T3l, T3g), ms, &(x[0]));
Chris@42 678 T3m = VBYI(VADD(T3f, T3c));
Chris@42 679 T3n = VADD(T3j, T3k);
Chris@42 680 ST(&(x[WS(rs, 4)]), VADD(T3m, T3n), ms, &(x[0]));
Chris@42 681 ST(&(x[WS(rs, 28)]), VSUB(T3n, T3m), ms, &(x[0]));
Chris@42 682 }
Chris@42 683 }
Chris@42 684 {
Chris@42 685 V T2L, T31, T2R, T2Y, T2A, T2Z, T2U, T32, T2H, T2Q;
Chris@42 686 T2H = VMUL(LDK(KP707106781), VSUB(T2D, T2G));
Chris@42 687 T2L = VSUB(T2H, T2K);
Chris@42 688 T31 = VADD(T2K, T2H);
Chris@42 689 T2Q = VMUL(LDK(KP707106781), VADD(T2D, T2G));
Chris@42 690 T2R = VSUB(T2P, T2Q);
Chris@42 691 T2Y = VADD(T2P, T2Q);
Chris@42 692 {
Chris@42 693 V T2s, T2z, T2S, T2T;
Chris@42 694 T2s = VFNMS(LDK(KP382683432), T2r, VMUL(LDK(KP923879532), T2o));
Chris@42 695 T2z = VFMA(LDK(KP923879532), T2v, VMUL(LDK(KP382683432), T2y));
Chris@42 696 T2A = VSUB(T2s, T2z);
Chris@42 697 T2Z = VADD(T2s, T2z);
Chris@42 698 T2S = VFMA(LDK(KP382683432), T2o, VMUL(LDK(KP923879532), T2r));
Chris@42 699 T2T = VFNMS(LDK(KP382683432), T2v, VMUL(LDK(KP923879532), T2y));
Chris@42 700 T2U = VSUB(T2S, T2T);
Chris@42 701 T32 = VADD(T2S, T2T);
Chris@42 702 }
Chris@42 703 {
Chris@42 704 V T2M, T2V, T34, T35;
Chris@42 705 T2M = VBYI(VSUB(T2A, T2L));
Chris@42 706 T2V = VSUB(T2R, T2U);
Chris@42 707 ST(&(x[WS(rs, 10)]), VADD(T2M, T2V), ms, &(x[0]));
Chris@42 708 ST(&(x[WS(rs, 22)]), VSUB(T2V, T2M), ms, &(x[0]));
Chris@42 709 T34 = VSUB(T2Y, T2Z);
Chris@42 710 T35 = VBYI(VSUB(T32, T31));
Chris@42 711 ST(&(x[WS(rs, 18)]), VSUB(T34, T35), ms, &(x[0]));
Chris@42 712 ST(&(x[WS(rs, 14)]), VADD(T34, T35), ms, &(x[0]));
Chris@42 713 }
Chris@42 714 {
Chris@42 715 V T2W, T2X, T30, T33;
Chris@42 716 T2W = VBYI(VADD(T2L, T2A));
Chris@42 717 T2X = VADD(T2R, T2U);
Chris@42 718 ST(&(x[WS(rs, 6)]), VADD(T2W, T2X), ms, &(x[0]));
Chris@42 719 ST(&(x[WS(rs, 26)]), VSUB(T2X, T2W), ms, &(x[0]));
Chris@42 720 T30 = VADD(T2Y, T2Z);
Chris@42 721 T33 = VBYI(VADD(T31, T32));
Chris@42 722 ST(&(x[WS(rs, 30)]), VSUB(T30, T33), ms, &(x[0]));
Chris@42 723 ST(&(x[WS(rs, 2)]), VADD(T30, T33), ms, &(x[0]));
Chris@42 724 }
Chris@42 725 }
Chris@42 726 {
Chris@42 727 V TE, T1P, T1I, T1Q, T1t, T1M, T1F, T1N;
Chris@42 728 {
Chris@42 729 V Tg, TD, T1G, T1H;
Chris@42 730 Tg = VSUB(T4, Tf);
Chris@42 731 TD = VSUB(Tr, TC);
Chris@42 732 TE = VSUB(Tg, TD);
Chris@42 733 T1P = VADD(Tg, TD);
Chris@42 734 T1G = VFNMS(LDK(KP555570233), TV, VMUL(LDK(KP831469612), T12));
Chris@42 735 T1H = VFMA(LDK(KP555570233), T1k, VMUL(LDK(KP831469612), T1r));
Chris@42 736 T1I = VSUB(T1G, T1H);
Chris@42 737 T1Q = VADD(T1G, T1H);
Chris@42 738 }
Chris@42 739 {
Chris@42 740 V T13, T1s, T1x, T1E;
Chris@42 741 T13 = VFMA(LDK(KP831469612), TV, VMUL(LDK(KP555570233), T12));
Chris@42 742 T1s = VFNMS(LDK(KP555570233), T1r, VMUL(LDK(KP831469612), T1k));
Chris@42 743 T1t = VSUB(T13, T1s);
Chris@42 744 T1M = VADD(T13, T1s);
Chris@42 745 T1x = VSUB(T1v, T1w);
Chris@42 746 T1E = VSUB(T1y, T1D);
Chris@42 747 T1F = VSUB(T1x, T1E);
Chris@42 748 T1N = VADD(T1E, T1x);
Chris@42 749 }
Chris@42 750 {
Chris@42 751 V T1u, T1J, T1S, T1T;
Chris@42 752 T1u = VADD(TE, T1t);
Chris@42 753 T1J = VBYI(VADD(T1F, T1I));
Chris@42 754 ST(&(x[WS(rs, 27)]), VSUB(T1u, T1J), ms, &(x[WS(rs, 1)]));
Chris@42 755 ST(&(x[WS(rs, 5)]), VADD(T1u, T1J), ms, &(x[WS(rs, 1)]));
Chris@42 756 T1S = VBYI(VADD(T1N, T1M));
Chris@42 757 T1T = VADD(T1P, T1Q);
Chris@42 758 ST(&(x[WS(rs, 3)]), VADD(T1S, T1T), ms, &(x[WS(rs, 1)]));
Chris@42 759 ST(&(x[WS(rs, 29)]), VSUB(T1T, T1S), ms, &(x[WS(rs, 1)]));
Chris@42 760 }
Chris@42 761 {
Chris@42 762 V T1K, T1L, T1O, T1R;
Chris@42 763 T1K = VSUB(TE, T1t);
Chris@42 764 T1L = VBYI(VSUB(T1I, T1F));
Chris@42 765 ST(&(x[WS(rs, 21)]), VSUB(T1K, T1L), ms, &(x[WS(rs, 1)]));
Chris@42 766 ST(&(x[WS(rs, 11)]), VADD(T1K, T1L), ms, &(x[WS(rs, 1)]));
Chris@42 767 T1O = VBYI(VSUB(T1M, T1N));
Chris@42 768 T1R = VSUB(T1P, T1Q);
Chris@42 769 ST(&(x[WS(rs, 13)]), VADD(T1O, T1R), ms, &(x[WS(rs, 1)]));
Chris@42 770 ST(&(x[WS(rs, 19)]), VSUB(T1R, T1O), ms, &(x[WS(rs, 1)]));
Chris@42 771 }
Chris@42 772 }
Chris@42 773 {
Chris@42 774 V T1W, T2h, T2a, T2i, T23, T2e, T27, T2f;
Chris@42 775 {
Chris@42 776 V T1U, T1V, T28, T29;
Chris@42 777 T1U = VADD(T4, Tf);
Chris@42 778 T1V = VADD(T1v, T1w);
Chris@42 779 T1W = VSUB(T1U, T1V);
Chris@42 780 T2h = VADD(T1U, T1V);
Chris@42 781 T28 = VFNMS(LDK(KP195090322), T1X, VMUL(LDK(KP980785280), T1Y));
Chris@42 782 T29 = VFMA(LDK(KP195090322), T20, VMUL(LDK(KP980785280), T21));
Chris@42 783 T2a = VSUB(T28, T29);
Chris@42 784 T2i = VADD(T28, T29);
Chris@42 785 }
Chris@42 786 {
Chris@42 787 V T1Z, T22, T25, T26;
Chris@42 788 T1Z = VFMA(LDK(KP980785280), T1X, VMUL(LDK(KP195090322), T1Y));
Chris@42 789 T22 = VFNMS(LDK(KP195090322), T21, VMUL(LDK(KP980785280), T20));
Chris@42 790 T23 = VSUB(T1Z, T22);
Chris@42 791 T2e = VADD(T1Z, T22);
Chris@42 792 T25 = VADD(Tr, TC);
Chris@42 793 T26 = VADD(T1D, T1y);
Chris@42 794 T27 = VSUB(T25, T26);
Chris@42 795 T2f = VADD(T26, T25);
Chris@42 796 }
Chris@42 797 {
Chris@42 798 V T24, T2b, T2k, T2l;
Chris@42 799 T24 = VADD(T1W, T23);
Chris@42 800 T2b = VBYI(VADD(T27, T2a));
Chris@42 801 ST(&(x[WS(rs, 25)]), VSUB(T24, T2b), ms, &(x[WS(rs, 1)]));
Chris@42 802 ST(&(x[WS(rs, 7)]), VADD(T24, T2b), ms, &(x[WS(rs, 1)]));
Chris@42 803 T2k = VBYI(VADD(T2f, T2e));
Chris@42 804 T2l = VADD(T2h, T2i);
Chris@42 805 ST(&(x[WS(rs, 1)]), VADD(T2k, T2l), ms, &(x[WS(rs, 1)]));
Chris@42 806 ST(&(x[WS(rs, 31)]), VSUB(T2l, T2k), ms, &(x[WS(rs, 1)]));
Chris@42 807 }
Chris@42 808 {
Chris@42 809 V T2c, T2d, T2g, T2j;
Chris@42 810 T2c = VSUB(T1W, T23);
Chris@42 811 T2d = VBYI(VSUB(T2a, T27));
Chris@42 812 ST(&(x[WS(rs, 23)]), VSUB(T2c, T2d), ms, &(x[WS(rs, 1)]));
Chris@42 813 ST(&(x[WS(rs, 9)]), VADD(T2c, T2d), ms, &(x[WS(rs, 1)]));
Chris@42 814 T2g = VBYI(VSUB(T2e, T2f));
Chris@42 815 T2j = VSUB(T2h, T2i);
Chris@42 816 ST(&(x[WS(rs, 15)]), VADD(T2g, T2j), ms, &(x[WS(rs, 1)]));
Chris@42 817 ST(&(x[WS(rs, 17)]), VSUB(T2j, T2g), ms, &(x[WS(rs, 1)]));
Chris@42 818 }
Chris@42 819 }
Chris@42 820 }
Chris@42 821 }
Chris@42 822 VLEAVE();
Chris@42 823 }
Chris@42 824
/* Twiddle-instruction table (non-FMA variant): identical content to the
 * FMA variant's table -- one VTW(0, k) per twiddle factor w^k for
 * k = 1..31, terminated by TW_NEXT.  Order is load-bearing. */
Chris@42 825 static const tw_instr twinstr[] = {
Chris@42 826 VTW(0, 1),
Chris@42 827 VTW(0, 2),
Chris@42 828 VTW(0, 3),
Chris@42 829 VTW(0, 4),
Chris@42 830 VTW(0, 5),
Chris@42 831 VTW(0, 6),
Chris@42 832 VTW(0, 7),
Chris@42 833 VTW(0, 8),
Chris@42 834 VTW(0, 9),
Chris@42 835 VTW(0, 10),
Chris@42 836 VTW(0, 11),
Chris@42 837 VTW(0, 12),
Chris@42 838 VTW(0, 13),
Chris@42 839 VTW(0, 14),
Chris@42 840 VTW(0, 15),
Chris@42 841 VTW(0, 16),
Chris@42 842 VTW(0, 17),
Chris@42 843 VTW(0, 18),
Chris@42 844 VTW(0, 19),
Chris@42 845 VTW(0, 20),
Chris@42 846 VTW(0, 21),
Chris@42 847 VTW(0, 22),
Chris@42 848 VTW(0, 23),
Chris@42 849 VTW(0, 24),
Chris@42 850 VTW(0, 25),
Chris@42 851 VTW(0, 26),
Chris@42 852 VTW(0, 27),
Chris@42 853 VTW(0, 28),
Chris@42 854 VTW(0, 29),
Chris@42 855 VTW(0, 30),
Chris@42 856 VTW(0, 31),
Chris@42 857 {TW_NEXT, VL, 0}
Chris@42 858 };
Chris@42 859
/* Codelet descriptor (non-FMA): n=32, op counts {201 adds, 88 muls,
 * 16 fmas, 0 other} matching the generator banner above. */
Chris@42 860 static const ct_desc desc = { 32, XSIMD_STRING("t2bv_32"), twinstr, &GENUS, {201, 88, 16, 0}, 0, 0, 0 };
Chris@42 861
/* Register this codelet (non-FMA variant) with the FFTW planner as a
 * DIT (decimation-in-time) twiddle kernel. */
Chris@42 862 void XSIMD(codelet_t2bv_32) (planner *p) {
Chris@42 863 X(kdft_dit_register) (p, t2bv_32, &desc);
Chris@42 864 }
Chris@42 865 #endif /* HAVE_FMA */