src/fftw-3.3.3/dft/simd/common/t3fv_32.c @ 83:ae30d91d2ffe

Replace these with versions built using an older toolset (so as to avoid ABI
incompatibilities when linking on Ubuntu 14.04 for packaging purposes).

author  Chris Cannam
date    Fri, 07 Feb 2020 11:51:13 +0000
parents 37bf6b4a2645
/*
 * Copyright (c) 2003, 2007-11 Matteo Frigo
 * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 *
 */

/* This file was automatically generated --- DO NOT EDIT */
/* Generated on Sun Nov 25 07:38:50 EST 2012 */

#include "codelet-dft.h"

#ifdef HAVE_FMA

/* Generated by: ../../../genfft/gen_twiddle_c.native -fma -reorder-insns -schedule-for-pipeline -simd -compact -variables 4 -pipeline-latency 8 -twiddle-log3 -precompute-twiddles -no-generate-bytw -n 32 -name t3fv_32 -include t3f.h */

/*
 * This function contains 244 FP additions, 214 FP multiplications,
 * (or, 146 additions, 116 multiplications, 98 fused multiply/add),
 * 118 stack variables, 7 constants, and 64 memory accesses
 */
#include "t3f.h"

static void t3fv_32(R *ri, R *ii, const R *W, stride rs, INT mb, INT me, INT ms)
{
    DVK(KP831469612, +0.831469612302545237078788377617905756738560812);
    DVK(KP980785280, +0.980785280403230449126182236134239036973933731);
    DVK(KP668178637, +0.668178637919298919997757686523080761552472251);
    DVK(KP198912367, +0.198912367379658006911597622644676228597850501);
    DVK(KP923879532, +0.923879532511286756128183189396788286822416626);
    DVK(KP707106781, +0.707106781186547524400844362104849039284835938);
    DVK(KP414213562, +0.414213562373095048801688724209698078569671875);
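    /* Editorial annotation (not part of the generated file): each KP constant
       is a twiddle-related trig value, e.g. KP707106781 = cos(pi/4),
       KP923879532 = cos(pi/8), KP980785280 = cos(pi/16),
       KP831469612 = cos(3*pi/16), plus the tangent forms
       KP414213562 = tan(pi/8), KP198912367 = tan(pi/16) and
       KP668178637 = tan(3*pi/16), which let this FMA variant fold pairs of
       multiplications into fused multiply-adds. */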
    {
        INT m;
        R *x;
        x = ri;
        for (m = mb, W = W + (mb * ((TWVL / VL) * 8)); m < me; m = m + VL, x = x + (VL * ms), W = W + (TWVL * 8), MAKE_VOLATILE_STRIDE(32, rs)) {
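            /* Editorial annotation: each iteration transforms VL interleaved
               columns at once, so m advances by VL, x advances by VL * ms
               reals, and W advances by the 8 stored twiddle values per
               transform (w^1, w^3, w^9, w^27 as four complex numbers). */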
            V T2B, T2A, T2u, T2x, T2r, T2F, T2L, T2P;
            {
                V T2, T5, T3, T7;
                T2 = LDW(&(W[0]));
                T5 = LDW(&(W[TWVL * 4]));
                T3 = LDW(&(W[TWVL * 2]));
                T7 = LDW(&(W[TWVL * 6]));
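                /* Editorial annotation: under -twiddle-log3 only w^1 (T2),
                   w^3 (T3), w^9 (T5) and w^27 (T7) are loaded from W[]; the
                   other odd twiddle powers needed by the radix-32 step are
                   derived below with VZMUL (complex product) and VZMULJ
                   (the same product with one operand conjugated). */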
                {
                    V T24, Tb, T3x, T2T, T3K, T2W, T25, Tr, T3z, T3g, T28, TX, T3y, T3j, T27;
                    V TG, T37, T3F, T3G, T3a, T2Y, T15, T1p, T2Z, T2w, T1V, T2v, T1N, T32, T1h;
                    V T17, T1a;
                    {
                        V T1, Tz, TT, T4, TC, Tv, T12, T1D, T1w, T18, T1t, T1O, TK, TP, T1c;
                        V T1m, Tf, T6, Te, TL, TQ, T2S, Tp, TU, Ti, Ta, TM, TR, Tm, TJ;
                        V T22, T9, T1Z;
                        T1 = LD(&(x[0]), ms, &(x[0]));
                        T22 = LD(&(x[WS(rs, 24)]), ms, &(x[0]));
                        T9 = LD(&(x[WS(rs, 16)]), ms, &(x[0]));
                        T1Z = LD(&(x[WS(rs, 8)]), ms, &(x[0]));
                        {
                            V Tn, TH, Tk, To, Th, Tg, T8, Tl, T20, T23, TI;
                            {
                                V Td, T1C, Tc, T21;
                                Td = LD(&(x[WS(rs, 4)]), ms, &(x[0]));
                                Tz = VZMUL(T2, T5);
                                T1C = VZMULJ(T2, T5);
                                Tn = VZMUL(T3, T5);
                                TT = VZMULJ(T3, T5);
                                Tc = VZMUL(T2, T3);
                                T4 = VZMULJ(T2, T3);
                                TH = VZMUL(T3, T7);
                                T21 = VZMULJ(T3, T7);
                                Tk = VZMUL(T2, T7);
                                TC = VZMULJ(T2, T7);
                                Tv = VZMULJ(T5, T7);
                                T12 = VZMULJ(Tz, T7);
                                T20 = VZMULJ(T1C, T1Z);
                                T1D = VZMULJ(T1C, T7);
                                T1w = VZMULJ(Tn, T7);
                                T18 = VZMULJ(TT, T7);
                                T1t = VZMUL(Tc, T7);
                                T1O = VZMULJ(Tc, T7);
                                TK = VZMUL(Tc, T5);
                                TP = VZMULJ(Tc, T5);
                                T1c = VZMUL(T4, T7);
                                T1m = VZMULJ(T4, T7);
                                Tf = VZMULJ(T4, T5);
                                T6 = VZMUL(T4, T5);
                                T23 = VZMULJ(T21, T22);
                                Te = VZMULJ(Tc, Td);
                            }
                            TL = VZMULJ(TK, T7);
                            TQ = VZMULJ(TP, T7);
                            To = LD(&(x[WS(rs, 12)]), ms, &(x[0]));
                            Th = LD(&(x[WS(rs, 20)]), ms, &(x[0]));
                            Tg = VZMULJ(Tf, T7);
                            T8 = VZMULJ(T6, T7);
                            T2S = VADD(T20, T23);
                            T24 = VSUB(T20, T23);
                            Tl = LD(&(x[WS(rs, 28)]), ms, &(x[0]));
                            TI = LD(&(x[WS(rs, 30)]), ms, &(x[0]));
                            Tp = VZMULJ(Tn, To);
                            TU = LD(&(x[WS(rs, 6)]), ms, &(x[0]));
                            Ti = VZMULJ(Tg, Th);
                            Ta = VZMULJ(T8, T9);
                            TM = LD(&(x[WS(rs, 14)]), ms, &(x[0]));
                            TR = LD(&(x[WS(rs, 22)]), ms, &(x[0]));
                            Tm = VZMULJ(Tk, Tl);
                            TJ = VZMULJ(TH, TI);
                        }
                        {
                            V Tu, TE, Tw, TA;
                            {
                                V T3e, TO, T3f, TW;
                                {
                                    V TV, T2U, Tj, T2R, TN, TS, T2V, Tq, Tt, TD;
                                    Tt = LD(&(x[WS(rs, 2)]), ms, &(x[0]));
                                    TV = VZMULJ(TT, TU);
                                    T2U = VADD(Te, Ti);
                                    Tj = VSUB(Te, Ti);
                                    T2R = VADD(T1, Ta);
                                    Tb = VSUB(T1, Ta);
                                    TN = VZMULJ(TL, TM);
                                    TS = VZMULJ(TQ, TR);
                                    T2V = VADD(Tm, Tp);
                                    Tq = VSUB(Tm, Tp);
                                    Tu = VZMULJ(T4, Tt);
                                    TD = LD(&(x[WS(rs, 26)]), ms, &(x[0]));
                                    T3x = VSUB(T2R, T2S);
                                    T2T = VADD(T2R, T2S);
                                    T3e = VADD(TJ, TN);
                                    TO = VSUB(TJ, TN);
                                    T3f = VADD(TV, TS);
                                    TW = VSUB(TS, TV);
                                    T3K = VSUB(T2V, T2U);
                                    T2W = VADD(T2U, T2V);
                                    T25 = VSUB(Tq, Tj);
                                    Tr = VADD(Tj, Tq);
                                    TE = VZMULJ(TC, TD);
                                }
                                Tw = LD(&(x[WS(rs, 18)]), ms, &(x[0]));
                                T3z = VSUB(T3e, T3f);
                                T3g = VADD(T3e, T3f);
                                T28 = VFMA(LDK(KP414213562), TO, TW);
                                TX = VFNMS(LDK(KP414213562), TW, TO);
                                TA = LD(&(x[WS(rs, 10)]), ms, &(x[0]));
                            }
                            {
                                V T35, T1z, T1T, T36, T39, T1L, T1B, T1F;
                                {
                                    V T1v, T1y, Ty, T3h, T1S, T1Q, T1I, T3i, TF, T1K, T1A, T1E;
                                    {
                                        V T1u, T1x, Tx, T1R;
                                        T1u = LD(&(x[WS(rs, 31)]), ms, &(x[WS(rs, 1)]));
                                        T1x = LD(&(x[WS(rs, 15)]), ms, &(x[WS(rs, 1)]));
                                        Tx = VZMULJ(Tv, Tw);
                                        T1R = LD(&(x[WS(rs, 7)]), ms, &(x[WS(rs, 1)]));
                                        {
                                            V T1P, T1H, T1J, TB;
                                            T1P = LD(&(x[WS(rs, 23)]), ms, &(x[WS(rs, 1)]));
                                            T1H = LD(&(x[WS(rs, 27)]), ms, &(x[WS(rs, 1)]));
                                            T1J = LD(&(x[WS(rs, 11)]), ms, &(x[WS(rs, 1)]));
                                            TB = VZMULJ(Tz, TA);
                                            T1v = VZMULJ(T1t, T1u);
                                            T1y = VZMULJ(T1w, T1x);
                                            Ty = VSUB(Tu, Tx);
                                            T3h = VADD(Tu, Tx);
                                            T1S = VZMULJ(Tf, T1R);
                                            T1Q = VZMULJ(T1O, T1P);
                                            T1I = VZMULJ(T7, T1H);
                                            T3i = VADD(TB, TE);
                                            TF = VSUB(TB, TE);
                                            T1K = VZMULJ(T6, T1J);
                                            T1A = LD(&(x[WS(rs, 3)]), ms, &(x[WS(rs, 1)]));
                                            T1E = LD(&(x[WS(rs, 19)]), ms, &(x[WS(rs, 1)]));
                                        }
                                    }
                                    T35 = VADD(T1v, T1y);
                                    T1z = VSUB(T1v, T1y);
                                    T1T = VSUB(T1Q, T1S);
                                    T36 = VADD(T1S, T1Q);
                                    T3y = VSUB(T3h, T3i);
                                    T3j = VADD(T3h, T3i);
                                    T27 = VFMA(LDK(KP414213562), Ty, TF);
                                    TG = VFNMS(LDK(KP414213562), TF, Ty);
                                    T39 = VADD(T1I, T1K);
                                    T1L = VSUB(T1I, T1K);
                                    T1B = VZMULJ(T3, T1A);
                                    T1F = VZMULJ(T1D, T1E);
                                }
                                {
                                    V T11, T14, T1o, T1l, T1e, T1U, T1M, T1g, T16, T19;
                                    {
                                        V T10, T13, T1n, T1k;
                                        T10 = LD(&(x[WS(rs, 1)]), ms, &(x[WS(rs, 1)]));
                                        T13 = LD(&(x[WS(rs, 17)]), ms, &(x[WS(rs, 1)]));
                                        T1n = LD(&(x[WS(rs, 25)]), ms, &(x[WS(rs, 1)]));
                                        T1k = LD(&(x[WS(rs, 9)]), ms, &(x[WS(rs, 1)]));
                                        {
                                            V T1d, T1f, T1G, T38;
                                            T1d = LD(&(x[WS(rs, 29)]), ms, &(x[WS(rs, 1)]));
                                            T1f = LD(&(x[WS(rs, 13)]), ms, &(x[WS(rs, 1)]));
                                            T1G = VSUB(T1B, T1F);
                                            T38 = VADD(T1B, T1F);
                                            T37 = VADD(T35, T36);
                                            T3F = VSUB(T35, T36);
                                            T11 = VZMULJ(T2, T10);
                                            T14 = VZMULJ(T12, T13);
                                            T1o = VZMULJ(T1m, T1n);
                                            T1l = VZMULJ(T5, T1k);
                                            T1e = VZMULJ(T1c, T1d);
                                            T3G = VSUB(T39, T38);
                                            T3a = VADD(T38, T39);
                                            T1U = VSUB(T1L, T1G);
                                            T1M = VADD(T1G, T1L);
                                            T1g = VZMULJ(TK, T1f);
                                        }
                                        T16 = LD(&(x[WS(rs, 5)]), ms, &(x[WS(rs, 1)]));
                                        T19 = LD(&(x[WS(rs, 21)]), ms, &(x[WS(rs, 1)]));
                                    }
                                    T2Y = VADD(T11, T14);
                                    T15 = VSUB(T11, T14);
                                    T1p = VSUB(T1l, T1o);
                                    T2Z = VADD(T1l, T1o);
                                    T2w = VFNMS(LDK(KP707106781), T1U, T1T);
                                    T1V = VFMA(LDK(KP707106781), T1U, T1T);
                                    T2v = VFNMS(LDK(KP707106781), T1M, T1z);
                                    T1N = VFMA(LDK(KP707106781), T1M, T1z);
                                    T32 = VADD(T1e, T1g);
                                    T1h = VSUB(T1e, T1g);
                                    T17 = VZMULJ(TP, T16);
                                    T1a = VZMULJ(T18, T19);
                                }
                            }
                        }
                    }
                    {
                        V T2X, T3k, T3b, T3t, T1b, T31, T30, T3C, T3r, T3v, T3p, T3q;
                        T2X = VSUB(T2T, T2W);
                        T3p = VADD(T2T, T2W);
                        T3q = VADD(T3j, T3g);
                        T3k = VSUB(T3g, T3j);
                        T3b = VSUB(T37, T3a);
                        T3t = VADD(T37, T3a);
                        T1b = VSUB(T17, T1a);
                        T31 = VADD(T17, T1a);
                        T30 = VADD(T2Y, T2Z);
                        T3C = VSUB(T2Y, T2Z);
                        T3r = VADD(T3p, T3q);
                        T3v = VSUB(T3p, T3q);
                        {
                            V T3N, T3B, T3T, T3M, T3W, T3O, T2t, T1r, T2s, T1j, T3I, T3X, T3c, T3l, T3u;
                            V T3w;
                            {
                                V T3L, T3A, T33, T3D, T1i, T1q;
                                T3L = VSUB(T3z, T3y);
                                T3A = VADD(T3y, T3z);
                                T33 = VADD(T31, T32);
                                T3D = VSUB(T31, T32);
                                T1i = VADD(T1b, T1h);
                                T1q = VSUB(T1b, T1h);
                                {
                                    V T3H, T3E, T34, T3s;
                                    T3N = VFMA(LDK(KP414213562), T3F, T3G);
                                    T3H = VFNMS(LDK(KP414213562), T3G, T3F);
                                    T3B = VFMA(LDK(KP707106781), T3A, T3x);
                                    T3T = VFNMS(LDK(KP707106781), T3A, T3x);
                                    T3M = VFMA(LDK(KP707106781), T3L, T3K);
                                    T3W = VFNMS(LDK(KP707106781), T3L, T3K);
                                    T3O = VFMA(LDK(KP414213562), T3C, T3D);
                                    T3E = VFNMS(LDK(KP414213562), T3D, T3C);
                                    T34 = VSUB(T30, T33);
                                    T3s = VADD(T30, T33);
                                    T2t = VFNMS(LDK(KP707106781), T1q, T1p);
                                    T1r = VFMA(LDK(KP707106781), T1q, T1p);
                                    T2s = VFNMS(LDK(KP707106781), T1i, T15);
                                    T1j = VFMA(LDK(KP707106781), T1i, T15);
                                    T3I = VADD(T3E, T3H);
                                    T3X = VSUB(T3H, T3E);
                                    T3c = VADD(T34, T3b);
                                    T3l = VSUB(T3b, T34);
                                    T3u = VADD(T3s, T3t);
                                    T3w = VSUB(T3t, T3s);
                                }
                            }
                            {
                                V T2p, Ts, TY, T1s, T2b, T2c, T1W, T26, T29, T2q, T3U, T3P, T2J, T2K;
                                T2p = VFNMS(LDK(KP707106781), Tr, Tb);
                                Ts = VFMA(LDK(KP707106781), Tr, Tb);
                                T3U = VADD(T3O, T3N);
                                T3P = VSUB(T3N, T3O);
                                {
                                    V T3Y, T40, T3R, T3J;
                                    T3Y = VFMA(LDK(KP923879532), T3X, T3W);
                                    T40 = VFNMS(LDK(KP923879532), T3X, T3W);
                                    T3R = VFMA(LDK(KP923879532), T3I, T3B);
                                    T3J = VFNMS(LDK(KP923879532), T3I, T3B);
                                    {
                                        V T3o, T3m, T3n, T3d;
                                        T3o = VFMA(LDK(KP707106781), T3l, T3k);
                                        T3m = VFNMS(LDK(KP707106781), T3l, T3k);
                                        T3n = VFMA(LDK(KP707106781), T3c, T2X);
                                        T3d = VFNMS(LDK(KP707106781), T3c, T2X);
                                        ST(&(x[WS(rs, 24)]), VFNMSI(T3w, T3v), ms, &(x[0]));
                                        ST(&(x[WS(rs, 8)]), VFMAI(T3w, T3v), ms, &(x[0]));
                                        ST(&(x[0]), VADD(T3r, T3u), ms, &(x[0]));
                                        ST(&(x[WS(rs, 16)]), VSUB(T3r, T3u), ms, &(x[0]));
                                        {
                                            V T3V, T3Z, T3S, T3Q;
                                            T3V = VFNMS(LDK(KP923879532), T3U, T3T);
                                            T3Z = VFMA(LDK(KP923879532), T3U, T3T);
                                            T3S = VFMA(LDK(KP923879532), T3P, T3M);
                                            T3Q = VFNMS(LDK(KP923879532), T3P, T3M);
                                            ST(&(x[WS(rs, 4)]), VFMAI(T3o, T3n), ms, &(x[0]));
                                            ST(&(x[WS(rs, 28)]), VFNMSI(T3o, T3n), ms, &(x[0]));
                                            ST(&(x[WS(rs, 20)]), VFMAI(T3m, T3d), ms, &(x[0]));
                                            ST(&(x[WS(rs, 12)]), VFNMSI(T3m, T3d), ms, &(x[0]));
                                            ST(&(x[WS(rs, 22)]), VFNMSI(T3Y, T3V), ms, &(x[0]));
                                            ST(&(x[WS(rs, 10)]), VFMAI(T3Y, T3V), ms, &(x[0]));
                                            ST(&(x[WS(rs, 26)]), VFMAI(T40, T3Z), ms, &(x[0]));
                                            ST(&(x[WS(rs, 6)]), VFNMSI(T40, T3Z), ms, &(x[0]));
                                            ST(&(x[WS(rs, 2)]), VFMAI(T3S, T3R), ms, &(x[0]));
                                            ST(&(x[WS(rs, 30)]), VFNMSI(T3S, T3R), ms, &(x[0]));
                                            ST(&(x[WS(rs, 18)]), VFMAI(T3Q, T3J), ms, &(x[0]));
                                            ST(&(x[WS(rs, 14)]), VFNMSI(T3Q, T3J), ms, &(x[0]));
                                            TY = VADD(TG, TX);
                                            T2B = VSUB(TX, TG);
                                        }
                                    }
                                }
                                T1s = VFNMS(LDK(KP198912367), T1r, T1j);
                                T2b = VFMA(LDK(KP198912367), T1j, T1r);
                                T2c = VFMA(LDK(KP198912367), T1N, T1V);
                                T1W = VFNMS(LDK(KP198912367), T1V, T1N);
                                T2A = VFMA(LDK(KP707106781), T25, T24);
                                T26 = VFNMS(LDK(KP707106781), T25, T24);
                                T29 = VSUB(T27, T28);
                                T2q = VADD(T27, T28);
                                {
                                    V T2j, T2n, T1Y, T2f, T2o, T2m, T2e, T2g;
                                    {
                                        V T2h, TZ, T2i, T2d, T2l, T1X, T2k, T2a, T2D, T2E;
                                        T2h = VFNMS(LDK(KP923879532), TY, Ts);
                                        TZ = VFMA(LDK(KP923879532), TY, Ts);
                                        T2i = VADD(T2b, T2c);
                                        T2d = VSUB(T2b, T2c);
                                        T2l = VSUB(T1W, T1s);
                                        T1X = VADD(T1s, T1W);
                                        T2k = VFNMS(LDK(KP923879532), T29, T26);
                                        T2a = VFMA(LDK(KP923879532), T29, T26);
                                        T2u = VFMA(LDK(KP668178637), T2t, T2s);
                                        T2D = VFNMS(LDK(KP668178637), T2s, T2t);
                                        T2j = VFNMS(LDK(KP980785280), T2i, T2h);
                                        T2n = VFMA(LDK(KP980785280), T2i, T2h);
                                        T2E = VFNMS(LDK(KP668178637), T2v, T2w);
                                        T2x = VFMA(LDK(KP668178637), T2w, T2v);
                                        T1Y = VFNMS(LDK(KP980785280), T1X, TZ);
                                        T2f = VFMA(LDK(KP980785280), T1X, TZ);
                                        T2o = VFMA(LDK(KP980785280), T2l, T2k);
                                        T2m = VFNMS(LDK(KP980785280), T2l, T2k);
                                        T2e = VFNMS(LDK(KP980785280), T2d, T2a);
                                        T2g = VFMA(LDK(KP980785280), T2d, T2a);
                                        T2r = VFMA(LDK(KP923879532), T2q, T2p);
                                        T2J = VFNMS(LDK(KP923879532), T2q, T2p);
                                        T2K = VADD(T2D, T2E);
                                        T2F = VSUB(T2D, T2E);
                                    }
                                    ST(&(x[WS(rs, 23)]), VFMAI(T2m, T2j), ms, &(x[WS(rs, 1)]));
                                    ST(&(x[WS(rs, 9)]), VFNMSI(T2m, T2j), ms, &(x[WS(rs, 1)]));
                                    ST(&(x[WS(rs, 25)]), VFNMSI(T2o, T2n), ms, &(x[WS(rs, 1)]));
                                    ST(&(x[WS(rs, 7)]), VFMAI(T2o, T2n), ms, &(x[WS(rs, 1)]));
                                    ST(&(x[WS(rs, 31)]), VFMAI(T2g, T2f), ms, &(x[WS(rs, 1)]));
                                    ST(&(x[WS(rs, 1)]), VFNMSI(T2g, T2f), ms, &(x[WS(rs, 1)]));
                                    ST(&(x[WS(rs, 15)]), VFMAI(T2e, T1Y), ms, &(x[WS(rs, 1)]));
                                    ST(&(x[WS(rs, 17)]), VFNMSI(T2e, T1Y), ms, &(x[WS(rs, 1)]));
                                }
                                T2L = VFMA(LDK(KP831469612), T2K, T2J);
                                T2P = VFNMS(LDK(KP831469612), T2K, T2J);
                            }
                        }
                    }
                }
            }
            {
                V T2y, T2N, T2C, T2M;
                T2y = VADD(T2u, T2x);
                T2N = VSUB(T2x, T2u);
                T2C = VFMA(LDK(KP923879532), T2B, T2A);
                T2M = VFNMS(LDK(KP923879532), T2B, T2A);
                {
                    V T2z, T2H, T2Q, T2O, T2G, T2I;
                    T2z = VFNMS(LDK(KP831469612), T2y, T2r);
                    T2H = VFMA(LDK(KP831469612), T2y, T2r);
                    T2Q = VFNMS(LDK(KP831469612), T2N, T2M);
                    T2O = VFMA(LDK(KP831469612), T2N, T2M);
                    T2G = VFNMS(LDK(KP831469612), T2F, T2C);
                    T2I = VFMA(LDK(KP831469612), T2F, T2C);
                    ST(&(x[WS(rs, 21)]), VFNMSI(T2O, T2L), ms, &(x[WS(rs, 1)]));
                    ST(&(x[WS(rs, 11)]), VFMAI(T2O, T2L), ms, &(x[WS(rs, 1)]));
                    ST(&(x[WS(rs, 27)]), VFMAI(T2Q, T2P), ms, &(x[WS(rs, 1)]));
                    ST(&(x[WS(rs, 5)]), VFNMSI(T2Q, T2P), ms, &(x[WS(rs, 1)]));
                    ST(&(x[WS(rs, 3)]), VFMAI(T2I, T2H), ms, &(x[WS(rs, 1)]));
                    ST(&(x[WS(rs, 29)]), VFNMSI(T2I, T2H), ms, &(x[WS(rs, 1)]));
                    ST(&(x[WS(rs, 19)]), VFMAI(T2G, T2z), ms, &(x[WS(rs, 1)]));
                    ST(&(x[WS(rs, 13)]), VFNMSI(T2G, T2z), ms, &(x[WS(rs, 1)]));
                }
            }
        }
    }
    VLEAVE();
}

static const tw_instr twinstr[] = {
    VTW(0, 1),
    VTW(0, 3),
    VTW(0, 9),
    VTW(0, 27),
    {TW_NEXT, VL, 0}
};
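
/* Editorial annotation: the VTW entries request only the twiddle powers w^1,
   w^3, w^9 and w^27 per transform, matching the -twiddle-log3 option above;
   the codelet reconstructs the remaining powers on the fly. */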

static const ct_desc desc = { 32, XSIMD_STRING("t3fv_32"), twinstr, &GENUS, {146, 116, 98, 0}, 0, 0, 0 };
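
/* Editorial annotation: the {146, 116, 98, 0} initializer appears to be the
   operation count (additions, multiplications, fused multiply-adds, other)
   advertised to the planner; it matches the figures in this variant's header
   comment. */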

void XSIMD(codelet_t3fv_32) (planner *p) {
    X(kdft_dit_register) (p, t3fv_32, &desc);
}
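
/* Editorial annotation: X(kdft_dit_register) publishes this radix-32
   decimation-in-time twiddle codelet to the planner under the name
   "t3fv_32". */
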
#else /* HAVE_FMA */

/* Generated by: ../../../genfft/gen_twiddle_c.native -simd -compact -variables 4 -pipeline-latency 8 -twiddle-log3 -precompute-twiddles -no-generate-bytw -n 32 -name t3fv_32 -include t3f.h */

/*
 * This function contains 244 FP additions, 158 FP multiplications,
 * (or, 228 additions, 142 multiplications, 16 fused multiply/add),
 * 90 stack variables, 7 constants, and 64 memory accesses
 */
#include "t3f.h"

static void t3fv_32(R *ri, R *ii, const R *W, stride rs, INT mb, INT me, INT ms)
{
    DVK(KP555570233, +0.555570233019602224742830813948532874374937191);
    DVK(KP831469612, +0.831469612302545237078788377617905756738560812);
    DVK(KP195090322, +0.195090322016128267848284868477022240927691618);
    DVK(KP980785280, +0.980785280403230449126182236134239036973933731);
    DVK(KP382683432, +0.382683432365089771728459984030398866761344562);
    DVK(KP923879532, +0.923879532511286756128183189396788286822416626);
    DVK(KP707106781, +0.707106781186547524400844362104849039284835938);
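    /* Editorial annotation: as in the FMA variant these are twiddle trig
       constants, here in plain sine/cosine form: KP195090322 = sin(pi/16),
       KP382683432 = sin(pi/8), KP555570233 = sin(3*pi/16), alongside the
       cosines listed above. */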
    {
        INT m;
        R *x;
        x = ri;
        for (m = mb, W = W + (mb * ((TWVL / VL) * 8)); m < me; m = m + VL, x = x + (VL * ms), W = W + (TWVL * 8), MAKE_VOLATILE_STRIDE(32, rs)) {
            V T2, T5, T3, T4, Tc, T1C, TP, Tz, Tn, T6, TS, Tf, TK, T7, T8;
            V Tv, T1w, T22, Tg, Tk, T1D, T1R, TC, T18, T12, T1t, TH, TL, TT, T1n;
            V T1c;
            T2 = LDW(&(W[0]));
            T5 = LDW(&(W[TWVL * 4]));
            T3 = LDW(&(W[TWVL * 2]));
            T4 = VZMULJ(T2, T3);
            Tc = VZMUL(T2, T3);
            T1C = VZMULJ(T2, T5);
            TP = VZMULJ(T3, T5);
            Tz = VZMUL(T2, T5);
            Tn = VZMUL(T3, T5);
            T6 = VZMUL(T4, T5);
            TS = VZMULJ(Tc, T5);
            Tf = VZMULJ(T4, T5);
            TK = VZMUL(Tc, T5);
            T7 = LDW(&(W[TWVL * 6]));
            T8 = VZMULJ(T6, T7);
            Tv = VZMULJ(T5, T7);
            T1w = VZMULJ(Tn, T7);
            T22 = VZMULJ(T3, T7);
            Tg = VZMULJ(Tf, T7);
            Tk = VZMUL(T2, T7);
            T1D = VZMULJ(T1C, T7);
            T1R = VZMULJ(Tc, T7);
            TC = VZMULJ(T2, T7);
            T18 = VZMULJ(TP, T7);
            T12 = VZMULJ(Tz, T7);
            T1t = VZMUL(Tc, T7);
            TH = VZMUL(T3, T7);
            TL = VZMULJ(TK, T7);
            TT = VZMULJ(TS, T7);
            T1n = VZMULJ(T4, T7);
            T1c = VZMUL(T4, T7);
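            /* Editorial annotation: the VZMUL/VZMULJ chain above appears to
               derive, from the four loaded twiddles, every remaining twiddle
               power used by the 32 strided loads below; -precompute-twiddles
               hoists these products so each is computed once per iteration. */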
            {
                V Tb, T25, T2T, T3x, Tr, T1Z, T2W, T3K, TX, T27, T3g, T3z, TG, T28, T3j;
                V T3y, T1N, T2v, T3a, T3G, T1V, T2w, T37, T3F, T1j, T2s, T33, T3D, T1r, T2t;
                V T30, T3C;
                {
                    V T1, T24, Ta, T21, T23, T9, T20, T2R, T2S;
                    T1 = LD(&(x[0]), ms, &(x[0]));
                    T23 = LD(&(x[WS(rs, 24)]), ms, &(x[0]));
                    T24 = VZMULJ(T22, T23);
                    T9 = LD(&(x[WS(rs, 16)]), ms, &(x[0]));
                    Ta = VZMULJ(T8, T9);
                    T20 = LD(&(x[WS(rs, 8)]), ms, &(x[0]));
                    T21 = VZMULJ(T1C, T20);
                    Tb = VSUB(T1, Ta);
                    T25 = VSUB(T21, T24);
                    T2R = VADD(T1, Ta);
                    T2S = VADD(T21, T24);
                    T2T = VADD(T2R, T2S);
                    T3x = VSUB(T2R, T2S);
                }
                {
                    V Te, Tp, Ti, Tm;
                    {
                        V Td, To, Th, Tl;
                        Td = LD(&(x[WS(rs, 4)]), ms, &(x[0]));
                        Te = VZMULJ(Tc, Td);
                        To = LD(&(x[WS(rs, 12)]), ms, &(x[0]));
                        Tp = VZMULJ(Tn, To);
                        Th = LD(&(x[WS(rs, 20)]), ms, &(x[0]));
                        Ti = VZMULJ(Tg, Th);
                        Tl = LD(&(x[WS(rs, 28)]), ms, &(x[0]));
                        Tm = VZMULJ(Tk, Tl);
                    }
                    {
                        V Tj, Tq, T2U, T2V;
                        Tj = VSUB(Te, Ti);
                        Tq = VSUB(Tm, Tp);
                        Tr = VMUL(LDK(KP707106781), VADD(Tj, Tq));
                        T1Z = VMUL(LDK(KP707106781), VSUB(Tq, Tj));
                        T2U = VADD(Te, Ti);
                        T2V = VADD(Tm, Tp);
                        T2W = VADD(T2U, T2V);
                        T3K = VSUB(T2V, T2U);
                    }
                }
                {
                    V TJ, TV, TN, TR;
                    {
                        V TI, TU, TM, TQ;
                        TI = LD(&(x[WS(rs, 30)]), ms, &(x[0]));
                        TJ = VZMULJ(TH, TI);
                        TU = LD(&(x[WS(rs, 22)]), ms, &(x[0]));
                        TV = VZMULJ(TT, TU);
                        TM = LD(&(x[WS(rs, 14)]), ms, &(x[0]));
                        TN = VZMULJ(TL, TM);
                        TQ = LD(&(x[WS(rs, 6)]), ms, &(x[0]));
                        TR = VZMULJ(TP, TQ);
                    }
                    {
                        V TO, TW, T3e, T3f;
                        TO = VSUB(TJ, TN);
                        TW = VSUB(TR, TV);
                        TX = VFMA(LDK(KP923879532), TO, VMUL(LDK(KP382683432), TW));
                        T27 = VFNMS(LDK(KP923879532), TW, VMUL(LDK(KP382683432), TO));
                        T3e = VADD(TJ, TN);
                        T3f = VADD(TR, TV);
                        T3g = VADD(T3e, T3f);
                        T3z = VSUB(T3e, T3f);
                    }
                }
                {
                    V Tu, TE, Tx, TB;
                    {
                        V Tt, TD, Tw, TA;
                        Tt = LD(&(x[WS(rs, 2)]), ms, &(x[0]));
                        Tu = VZMULJ(T4, Tt);
                        TD = LD(&(x[WS(rs, 26)]), ms, &(x[0]));
                        TE = VZMULJ(TC, TD);
                        Tw = LD(&(x[WS(rs, 18)]), ms, &(x[0]));
                        Tx = VZMULJ(Tv, Tw);
                        TA = LD(&(x[WS(rs, 10)]), ms, &(x[0]));
                        TB = VZMULJ(Tz, TA);
                    }
                    {
                        V Ty, TF, T3h, T3i;
                        Ty = VSUB(Tu, Tx);
                        TF = VSUB(TB, TE);
                        TG = VFNMS(LDK(KP382683432), TF, VMUL(LDK(KP923879532), Ty));
                        T28 = VFMA(LDK(KP382683432), Ty, VMUL(LDK(KP923879532), TF));
                        T3h = VADD(Tu, Tx);
                        T3i = VADD(TB, TE);
                        T3j = VADD(T3h, T3i);
                        T3y = VSUB(T3h, T3i);
                    }
                }
                {
                    V T1v, T1y, T1T, T1Q, T1I, T1K, T1L, T1B, T1F, T1G;
                    {
                        V T1u, T1x, T1S, T1P;
                        T1u = LD(&(x[WS(rs, 31)]), ms, &(x[WS(rs, 1)]));
                        T1v = VZMULJ(T1t, T1u);
                        T1x = LD(&(x[WS(rs, 15)]), ms, &(x[WS(rs, 1)]));
                        T1y = VZMULJ(T1w, T1x);
                        T1S = LD(&(x[WS(rs, 23)]), ms, &(x[WS(rs, 1)]));
                        T1T = VZMULJ(T1R, T1S);
                        T1P = LD(&(x[WS(rs, 7)]), ms, &(x[WS(rs, 1)]));
                        T1Q = VZMULJ(Tf, T1P);
                        {
                            V T1H, T1J, T1A, T1E;
                            T1H = LD(&(x[WS(rs, 27)]), ms, &(x[WS(rs, 1)]));
                            T1I = VZMULJ(T7, T1H);
                            T1J = LD(&(x[WS(rs, 11)]), ms, &(x[WS(rs, 1)]));
                            T1K = VZMULJ(T6, T1J);
                            T1L = VSUB(T1I, T1K);
                            T1A = LD(&(x[WS(rs, 3)]), ms, &(x[WS(rs, 1)]));
                            T1B = VZMULJ(T3, T1A);
                            T1E = LD(&(x[WS(rs, 19)]), ms, &(x[WS(rs, 1)]));
                            T1F = VZMULJ(T1D, T1E);
                            T1G = VSUB(T1B, T1F);
                        }
                    }
                    {
                        V T1z, T1M, T38, T39;
                        T1z = VSUB(T1v, T1y);
                        T1M = VMUL(LDK(KP707106781), VADD(T1G, T1L));
                        T1N = VADD(T1z, T1M);
                        T2v = VSUB(T1z, T1M);
                        T38 = VADD(T1B, T1F);
                        T39 = VADD(T1I, T1K);
                        T3a = VADD(T38, T39);
                        T3G = VSUB(T39, T38);
                    }
                    {
                        V T1O, T1U, T35, T36;
                        T1O = VMUL(LDK(KP707106781), VSUB(T1L, T1G));
                        T1U = VSUB(T1Q, T1T);
                        T1V = VSUB(T1O, T1U);
                        T2w = VADD(T1U, T1O);
                        T35 = VADD(T1v, T1y);
                        T36 = VADD(T1Q, T1T);
                        T37 = VADD(T35, T36);
                        T3F = VSUB(T35, T36);
                    }
                }
                {
                    V T11, T14, T1p, T1m, T1e, T1g, T1h, T17, T1a, T1b;
                    {
                        V T10, T13, T1o, T1l;
                        T10 = LD(&(x[WS(rs, 1)]), ms, &(x[WS(rs, 1)]));
                        T11 = VZMULJ(T2, T10);
                        T13 = LD(&(x[WS(rs, 17)]), ms, &(x[WS(rs, 1)]));
                        T14 = VZMULJ(T12, T13);
                        T1o = LD(&(x[WS(rs, 25)]), ms, &(x[WS(rs, 1)]));
                        T1p = VZMULJ(T1n, T1o);
                        T1l = LD(&(x[WS(rs, 9)]), ms, &(x[WS(rs, 1)]));
                        T1m = VZMULJ(T5, T1l);
                        {
                            V T1d, T1f, T16, T19;
                            T1d = LD(&(x[WS(rs, 29)]), ms, &(x[WS(rs, 1)]));
                            T1e = VZMULJ(T1c, T1d);
                            T1f = LD(&(x[WS(rs, 13)]), ms, &(x[WS(rs, 1)]));
                            T1g = VZMULJ(TK, T1f);
                            T1h = VSUB(T1e, T1g);
                            T16 = LD(&(x[WS(rs, 5)]), ms, &(x[WS(rs, 1)]));
                            T17 = VZMULJ(TS, T16);
                            T19 = LD(&(x[WS(rs, 21)]), ms, &(x[WS(rs, 1)]));
                            T1a = VZMULJ(T18, T19);
                            T1b = VSUB(T17, T1a);
                        }
                    }
                    {
                        V T15, T1i, T31, T32;
                        T15 = VSUB(T11, T14);
                        T1i = VMUL(LDK(KP707106781), VADD(T1b, T1h));
                        T1j = VADD(T15, T1i);
                        T2s = VSUB(T15, T1i);
                        T31 = VADD(T17, T1a);
                        T32 = VADD(T1e, T1g);
                        T33 = VADD(T31, T32);
                        T3D = VSUB(T32, T31);
                    }
                    {
                        V T1k, T1q, T2Y, T2Z;
                        T1k = VMUL(LDK(KP707106781), VSUB(T1h, T1b));
                        T1q = VSUB(T1m, T1p);
                        T1r = VSUB(T1k, T1q);
                        T2t = VADD(T1q, T1k);
                        T2Y = VADD(T11, T14);
                        T2Z = VADD(T1m, T1p);
                        T30 = VADD(T2Y, T2Z);
                        T3C = VSUB(T2Y, T2Z);
                    }
                }
                {
                    V T3r, T3v, T3u, T3w;
                    {
                        V T3p, T3q, T3s, T3t;
                        T3p = VADD(T2T, T2W);
                        T3q = VADD(T3j, T3g);
                        T3r = VADD(T3p, T3q);
                        T3v = VSUB(T3p, T3q);
                        T3s = VADD(T30, T33);
                        T3t = VADD(T37, T3a);
                        T3u = VADD(T3s, T3t);
                        T3w = VBYI(VSUB(T3t, T3s));
                    }
                    ST(&(x[WS(rs, 16)]), VSUB(T3r, T3u), ms, &(x[0]));
                    ST(&(x[WS(rs, 8)]), VADD(T3v, T3w), ms, &(x[0]));
                    ST(&(x[0]), VADD(T3r, T3u), ms, &(x[0]));
                    ST(&(x[WS(rs, 24)]), VSUB(T3v, T3w), ms, &(x[0]));
                }
                {
                    V T2X, T3k, T3c, T3l, T34, T3b;
                    T2X = VSUB(T2T, T2W);
                    T3k = VSUB(T3g, T3j);
                    T34 = VSUB(T30, T33);
                    T3b = VSUB(T37, T3a);
                    T3c = VMUL(LDK(KP707106781), VADD(T34, T3b));
                    T3l = VMUL(LDK(KP707106781), VSUB(T3b, T34));
                    {
                        V T3d, T3m, T3n, T3o;
                        T3d = VADD(T2X, T3c);
                        T3m = VBYI(VADD(T3k, T3l));
                        ST(&(x[WS(rs, 28)]), VSUB(T3d, T3m), ms, &(x[0]));
                        ST(&(x[WS(rs, 4)]), VADD(T3d, T3m), ms, &(x[0]));
                        T3n = VSUB(T2X, T3c);
                        T3o = VBYI(VSUB(T3l, T3k));
                        ST(&(x[WS(rs, 20)]), VSUB(T3n, T3o), ms, &(x[0]));
                        ST(&(x[WS(rs, 12)]), VADD(T3n, T3o), ms, &(x[0]));
                    }
                }
                {
                    V T3B, T3W, T3M, T3U, T3I, T3T, T3P, T3X, T3A, T3L;
                    T3A = VMUL(LDK(KP707106781), VADD(T3y, T3z));
                    T3B = VADD(T3x, T3A);
                    T3W = VSUB(T3x, T3A);
                    T3L = VMUL(LDK(KP707106781), VSUB(T3z, T3y));
                    T3M = VADD(T3K, T3L);
                    T3U = VSUB(T3L, T3K);
                    {
                        V T3E, T3H, T3N, T3O;
                        T3E = VFMA(LDK(KP923879532), T3C, VMUL(LDK(KP382683432), T3D));
                        T3H = VFNMS(LDK(KP382683432), T3G, VMUL(LDK(KP923879532), T3F));
                        T3I = VADD(T3E, T3H);
                        T3T = VSUB(T3H, T3E);
                        T3N = VFNMS(LDK(KP382683432), T3C, VMUL(LDK(KP923879532), T3D));
                        T3O = VFMA(LDK(KP382683432), T3F, VMUL(LDK(KP923879532), T3G));
                        T3P = VADD(T3N, T3O);
                        T3X = VSUB(T3O, T3N);
                    }
                    {
                        V T3J, T3Q, T3Z, T40;
                        T3J = VADD(T3B, T3I);
                        T3Q = VBYI(VADD(T3M, T3P));
                        ST(&(x[WS(rs, 30)]), VSUB(T3J, T3Q), ms, &(x[0]));
                        ST(&(x[WS(rs, 2)]), VADD(T3J, T3Q), ms, &(x[0]));
                        T3Z = VBYI(VADD(T3U, T3T));
                        T40 = VADD(T3W, T3X);
                        ST(&(x[WS(rs, 6)]), VADD(T3Z, T40), ms, &(x[0]));
                        ST(&(x[WS(rs, 26)]), VSUB(T40, T3Z), ms, &(x[0]));
                    }
                    {
                        V T3R, T3S, T3V, T3Y;
                        T3R = VSUB(T3B, T3I);
                        T3S = VBYI(VSUB(T3P, T3M));
                        ST(&(x[WS(rs, 18)]), VSUB(T3R, T3S), ms, &(x[0]));
                        ST(&(x[WS(rs, 14)]), VADD(T3R, T3S), ms, &(x[0]));
                        T3V = VBYI(VSUB(T3T, T3U));
                        T3Y = VSUB(T3W, T3X);
                        ST(&(x[WS(rs, 10)]), VADD(T3V, T3Y), ms, &(x[0]));
                        ST(&(x[WS(rs, 22)]), VSUB(T3Y, T3V), ms, &(x[0]));
                    }
                }
                {
                    V TZ, T2k, T2d, T2l, T1X, T2h, T2a, T2i;
                    {
                        V Ts, TY, T2b, T2c;
                        Ts = VADD(Tb, Tr);
                        TY = VADD(TG, TX);
                        TZ = VADD(Ts, TY);
                        T2k = VSUB(Ts, TY);
                        T2b = VFNMS(LDK(KP195090322), T1j, VMUL(LDK(KP980785280), T1r));
                        T2c = VFMA(LDK(KP195090322), T1N, VMUL(LDK(KP980785280), T1V));
                        T2d = VADD(T2b, T2c);
                        T2l = VSUB(T2c, T2b);
                    }
                    {
                        V T1s, T1W, T26, T29;
                        T1s = VFMA(LDK(KP980785280), T1j, VMUL(LDK(KP195090322), T1r));
                        T1W = VFNMS(LDK(KP195090322), T1V, VMUL(LDK(KP980785280), T1N));
                        T1X = VADD(T1s, T1W);
                        T2h = VSUB(T1W, T1s);
                        T26 = VSUB(T1Z, T25);
                        T29 = VSUB(T27, T28);
                        T2a = VADD(T26, T29);
                        T2i = VSUB(T29, T26);
                    }
                    {
                        V T1Y, T2e, T2n, T2o;
                        T1Y = VADD(TZ, T1X);
                        T2e = VBYI(VADD(T2a, T2d));
                        ST(&(x[WS(rs, 31)]), VSUB(T1Y, T2e), ms, &(x[WS(rs, 1)]));
                        ST(&(x[WS(rs, 1)]), VADD(T1Y, T2e), ms, &(x[WS(rs, 1)]));
                        T2n = VBYI(VADD(T2i, T2h));
                        T2o = VADD(T2k, T2l);
                        ST(&(x[WS(rs, 7)]), VADD(T2n, T2o), ms, &(x[WS(rs, 1)]));
                        ST(&(x[WS(rs, 25)]), VSUB(T2o, T2n), ms, &(x[WS(rs, 1)]));
                    }
                    {
                        V T2f, T2g, T2j, T2m;
                        T2f = VSUB(TZ, T1X);
                        T2g = VBYI(VSUB(T2d, T2a));
                        ST(&(x[WS(rs, 17)]), VSUB(T2f, T2g), ms, &(x[WS(rs, 1)]));
                        ST(&(x[WS(rs, 15)]), VADD(T2f, T2g), ms, &(x[WS(rs, 1)]));
                        T2j = VBYI(VSUB(T2h, T2i));
                        T2m = VSUB(T2k, T2l);
                        ST(&(x[WS(rs, 9)]), VADD(T2j, T2m), ms, &(x[WS(rs, 1)]));
                        ST(&(x[WS(rs, 23)]), VSUB(T2m, T2j), ms, &(x[WS(rs, 1)]));
                    }
                }
                {
                    V T2r, T2M, T2F, T2N, T2y, T2J, T2C, T2K;
                    {
                        V T2p, T2q, T2D, T2E;
                        T2p = VSUB(Tb, Tr);
                        T2q = VADD(T28, T27);
                        T2r = VADD(T2p, T2q);
                        T2M = VSUB(T2p, T2q);
                        T2D = VFNMS(LDK(KP555570233), T2s, VMUL(LDK(KP831469612), T2t));
                        T2E = VFMA(LDK(KP555570233), T2v, VMUL(LDK(KP831469612), T2w));
                        T2F = VADD(T2D, T2E);
                        T2N = VSUB(T2E, T2D);
                    }
                    {
                        V T2u, T2x, T2A, T2B;
                        T2u = VFMA(LDK(KP831469612), T2s, VMUL(LDK(KP555570233), T2t));
                        T2x = VFNMS(LDK(KP555570233), T2w, VMUL(LDK(KP831469612), T2v));
                        T2y = VADD(T2u, T2x);
                        T2J = VSUB(T2x, T2u);
                        T2A = VADD(T25, T1Z);
                        T2B = VSUB(TX, TG);
                        T2C = VADD(T2A, T2B);
                        T2K = VSUB(T2B, T2A);
                    }
                    {
                        V T2z, T2G, T2P, T2Q;
                        T2z = VADD(T2r, T2y);
                        T2G = VBYI(VADD(T2C, T2F));
                        ST(&(x[WS(rs, 29)]), VSUB(T2z, T2G), ms, &(x[WS(rs, 1)]));
                        ST(&(x[WS(rs, 3)]), VADD(T2z, T2G), ms, &(x[WS(rs, 1)]));
                        T2P = VBYI(VADD(T2K, T2J));
                        T2Q = VADD(T2M, T2N);
                        ST(&(x[WS(rs, 5)]), VADD(T2P, T2Q), ms, &(x[WS(rs, 1)]));
                        ST(&(x[WS(rs, 27)]), VSUB(T2Q, T2P), ms, &(x[WS(rs, 1)]));
                    }
                    {
                        V T2H, T2I, T2L, T2O;
                        T2H = VSUB(T2r, T2y);
                        T2I = VBYI(VSUB(T2F, T2C));
                        ST(&(x[WS(rs, 19)]), VSUB(T2H, T2I), ms, &(x[WS(rs, 1)]));
                        ST(&(x[WS(rs, 13)]), VADD(T2H, T2I), ms, &(x[WS(rs, 1)]));
                        T2L = VBYI(VSUB(T2J, T2K));
                        T2O = VSUB(T2M, T2N);
                        ST(&(x[WS(rs, 11)]), VADD(T2L, T2O), ms, &(x[WS(rs, 1)]));
                        ST(&(x[WS(rs, 21)]), VSUB(T2O, T2L), ms, &(x[WS(rs, 1)]));
                    }
                }
            }
        }
    }
    VLEAVE();
}

static const tw_instr twinstr[] = {
    VTW(0, 1),
    VTW(0, 3),
    VTW(0, 9),
    VTW(0, 27),
    {TW_NEXT, VL, 0}
};

static const ct_desc desc = { 32, XSIMD_STRING("t3fv_32"), twinstr, &GENUS, {228, 142, 16, 0}, 0, 0, 0 };

void XSIMD(codelet_t3fv_32) (planner *p) {
    X(kdft_dit_register) (p, t3fv_32, &desc);
}
#endif /* HAVE_FMA */