annotate src/fftw-3.3.5/dft/simd/common/t3bv_32.c @ 83:ae30d91d2ffe

Replace these with versions built using an older toolset (so as to avoid ABI incompatibilities when linking on Ubuntu 14.04 for packaging purposes)
author Chris Cannam
date Fri, 07 Feb 2020 11:51:13 +0000
parents 2cd0e3b3e1fd
children
rev   line source
Chris@42 1 /*
Chris@42 2 * Copyright (c) 2003, 2007-14 Matteo Frigo
Chris@42 3 * Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
Chris@42 4 *
Chris@42 5 * This program is free software; you can redistribute it and/or modify
Chris@42 6 * it under the terms of the GNU General Public License as published by
Chris@42 7 * the Free Software Foundation; either version 2 of the License, or
Chris@42 8 * (at your option) any later version.
Chris@42 9 *
Chris@42 10 * This program is distributed in the hope that it will be useful,
Chris@42 11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
Chris@42 12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
Chris@42 13 * GNU General Public License for more details.
Chris@42 14 *
Chris@42 15 * You should have received a copy of the GNU General Public License
Chris@42 16 * along with this program; if not, write to the Free Software
Chris@42 17 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
Chris@42 18 *
Chris@42 19 */
Chris@42 20
Chris@42 21 /* This file was automatically generated --- DO NOT EDIT */
Chris@42 22 /* Generated on Sat Jul 30 16:44:49 EDT 2016 */
Chris@42 23
Chris@42 24 #include "codelet-dft.h"
Chris@42 25
Chris@42 26 #ifdef HAVE_FMA
Chris@42 27
Chris@42 28 /* Generated by: ../../../genfft/gen_twiddle_c.native -fma -reorder-insns -schedule-for-pipeline -simd -compact -variables 4 -pipeline-latency 8 -twiddle-log3 -precompute-twiddles -no-generate-bytw -n 32 -name t3bv_32 -include t3b.h -sign 1 */
Chris@42 29
Chris@42 30 /*
Chris@42 31 * This function contains 244 FP additions, 214 FP multiplications,
Chris@42 32 * (or, 146 additions, 116 multiplications, 98 fused multiply/add),
Chris@42 33 * 120 stack variables, 7 constants, and 64 memory accesses
Chris@42 34 */
Chris@42 35 #include "t3b.h"
Chris@42 36
Chris@42 37 static void t3bv_32(R *ri, R *ii, const R *W, stride rs, INT mb, INT me, INT ms)
Chris@42 38 {
Chris@42 39 DVK(KP831469612, +0.831469612302545237078788377617905756738560812);
Chris@42 40 DVK(KP980785280, +0.980785280403230449126182236134239036973933731);
Chris@42 41 DVK(KP668178637, +0.668178637919298919997757686523080761552472251);
Chris@42 42 DVK(KP198912367, +0.198912367379658006911597622644676228597850501);
Chris@42 43 DVK(KP923879532, +0.923879532511286756128183189396788286822416626);
Chris@42 44 DVK(KP707106781, +0.707106781186547524400844362104849039284835938);
Chris@42 45 DVK(KP414213562, +0.414213562373095048801688724209698078569671875);
Chris@42 46 {
Chris@42 47 INT m;
Chris@42 48 R *x;
Chris@42 49 x = ii;
Chris@42 50 for (m = mb, W = W + (mb * ((TWVL / VL) * 8)); m < me; m = m + VL, x = x + (VL * ms), W = W + (TWVL * 8), MAKE_VOLATILE_STRIDE(32, rs)) {
Chris@42 51 V T2B, T2A, T2F, T2N, T2H, T2z, T2P, T2L, T2C, T2M;
Chris@42 52 {
Chris@42 53 V T2, T5, T3, T7;
Chris@42 54 T2 = LDW(&(W[0]));
Chris@42 55 T5 = LDW(&(W[TWVL * 4]));
Chris@42 56 T3 = LDW(&(W[TWVL * 2]));
Chris@42 57 T7 = LDW(&(W[TWVL * 6]));
Chris@42 58 {
Chris@42 59 V T24, Tb, T3x, T2T, T3K, T2W, T25, Tr, T3z, T3j, T28, TX, T3y, T3g, T27;
Chris@42 60 V TG, T37, T3F, T3G, T3a, T2Y, T15, T1p, T2Z, T2w, T1V, T2v, T1N, T32, T1h;
Chris@42 61 V T17, T1a;
Chris@42 62 {
Chris@42 63 V T1, Tz, TT, T4, TC, Tv, T12, T1D, T1w, T18, T1t, T1O, TK, TP, T1c;
Chris@42 64 V T1m, Tf, T6, Te, TL, TQ, T2S, Tp, TU, Ti, Ta, TM, TR, Tm, TJ;
Chris@42 65 V T22, T9, T1Z;
Chris@42 66 T1 = LD(&(x[0]), ms, &(x[0]));
Chris@42 67 T22 = LD(&(x[WS(rs, 24)]), ms, &(x[0]));
Chris@42 68 T9 = LD(&(x[WS(rs, 16)]), ms, &(x[0]));
Chris@42 69 T1Z = LD(&(x[WS(rs, 8)]), ms, &(x[0]));
Chris@42 70 {
Chris@42 71 V Tn, TH, Tk, To, Th, Tg, T8, Tl, T20, T23, TI;
Chris@42 72 {
Chris@42 73 V Td, T1C, Tc, T21;
Chris@42 74 Td = LD(&(x[WS(rs, 4)]), ms, &(x[0]));
Chris@42 75 Tz = VZMUL(T2, T5);
Chris@42 76 T1C = VZMULJ(T2, T5);
Chris@42 77 Tn = VZMUL(T3, T5);
Chris@42 78 TT = VZMULJ(T3, T5);
Chris@42 79 Tc = VZMUL(T2, T3);
Chris@42 80 T4 = VZMULJ(T2, T3);
Chris@42 81 TH = VZMUL(T3, T7);
Chris@42 82 T21 = VZMULJ(T3, T7);
Chris@42 83 Tk = VZMUL(T2, T7);
Chris@42 84 TC = VZMULJ(T2, T7);
Chris@42 85 Tv = VZMULJ(T5, T7);
Chris@42 86 T12 = VZMULJ(Tz, T7);
Chris@42 87 T20 = VZMUL(T1C, T1Z);
Chris@42 88 T1D = VZMULJ(T1C, T7);
Chris@42 89 T1w = VZMULJ(Tn, T7);
Chris@42 90 T18 = VZMULJ(TT, T7);
Chris@42 91 T1t = VZMUL(Tc, T7);
Chris@42 92 T1O = VZMULJ(Tc, T7);
Chris@42 93 TK = VZMUL(Tc, T5);
Chris@42 94 TP = VZMULJ(Tc, T5);
Chris@42 95 T1c = VZMUL(T4, T7);
Chris@42 96 T1m = VZMULJ(T4, T7);
Chris@42 97 Tf = VZMULJ(T4, T5);
Chris@42 98 T6 = VZMUL(T4, T5);
Chris@42 99 T23 = VZMUL(T21, T22);
Chris@42 100 Te = VZMUL(Tc, Td);
Chris@42 101 }
Chris@42 102 TL = VZMULJ(TK, T7);
Chris@42 103 TQ = VZMULJ(TP, T7);
Chris@42 104 To = LD(&(x[WS(rs, 12)]), ms, &(x[0]));
Chris@42 105 Th = LD(&(x[WS(rs, 20)]), ms, &(x[0]));
Chris@42 106 Tg = VZMULJ(Tf, T7);
Chris@42 107 T8 = VZMULJ(T6, T7);
Chris@42 108 T2S = VADD(T20, T23);
Chris@42 109 T24 = VSUB(T20, T23);
Chris@42 110 Tl = LD(&(x[WS(rs, 28)]), ms, &(x[0]));
Chris@42 111 TI = LD(&(x[WS(rs, 30)]), ms, &(x[0]));
Chris@42 112 Tp = VZMUL(Tn, To);
Chris@42 113 TU = LD(&(x[WS(rs, 6)]), ms, &(x[0]));
Chris@42 114 Ti = VZMUL(Tg, Th);
Chris@42 115 Ta = VZMUL(T8, T9);
Chris@42 116 TM = LD(&(x[WS(rs, 14)]), ms, &(x[0]));
Chris@42 117 TR = LD(&(x[WS(rs, 22)]), ms, &(x[0]));
Chris@42 118 Tm = VZMUL(Tk, Tl);
Chris@42 119 TJ = VZMUL(TH, TI);
Chris@42 120 }
Chris@42 121 {
Chris@42 122 V Tu, TE, Tw, TA;
Chris@42 123 {
Chris@42 124 V T3h, TO, T3i, TW;
Chris@42 125 {
Chris@42 126 V TV, T2U, Tj, T2R, TN, TS, T2V, Tq, Tt, TD;
Chris@42 127 Tt = LD(&(x[WS(rs, 2)]), ms, &(x[0]));
Chris@42 128 TV = VZMUL(TT, TU);
Chris@42 129 T2U = VADD(Te, Ti);
Chris@42 130 Tj = VSUB(Te, Ti);
Chris@42 131 T2R = VADD(T1, Ta);
Chris@42 132 Tb = VSUB(T1, Ta);
Chris@42 133 TN = VZMUL(TL, TM);
Chris@42 134 TS = VZMUL(TQ, TR);
Chris@42 135 T2V = VADD(Tm, Tp);
Chris@42 136 Tq = VSUB(Tm, Tp);
Chris@42 137 Tu = VZMUL(T4, Tt);
Chris@42 138 TD = LD(&(x[WS(rs, 26)]), ms, &(x[0]));
Chris@42 139 T3x = VSUB(T2R, T2S);
Chris@42 140 T2T = VADD(T2R, T2S);
Chris@42 141 T3h = VADD(TJ, TN);
Chris@42 142 TO = VSUB(TJ, TN);
Chris@42 143 T3i = VADD(TV, TS);
Chris@42 144 TW = VSUB(TS, TV);
Chris@42 145 T3K = VSUB(T2U, T2V);
Chris@42 146 T2W = VADD(T2U, T2V);
Chris@42 147 T25 = VSUB(Tj, Tq);
Chris@42 148 Tr = VADD(Tj, Tq);
Chris@42 149 TE = VZMUL(TC, TD);
Chris@42 150 }
Chris@42 151 Tw = LD(&(x[WS(rs, 18)]), ms, &(x[0]));
Chris@42 152 T3z = VSUB(T3h, T3i);
Chris@42 153 T3j = VADD(T3h, T3i);
Chris@42 154 T28 = VFMA(LDK(KP414213562), TO, TW);
Chris@42 155 TX = VFNMS(LDK(KP414213562), TW, TO);
Chris@42 156 TA = LD(&(x[WS(rs, 10)]), ms, &(x[0]));
Chris@42 157 }
Chris@42 158 {
Chris@42 159 V T35, T1z, T1T, T36, T39, T1L, T1B, T1F;
Chris@42 160 {
Chris@42 161 V T1v, T1y, Ty, T3e, T1S, T1Q, T1I, T3f, TF, T1K, T1A, T1E;
Chris@42 162 {
Chris@42 163 V T1u, T1x, Tx, T1R;
Chris@42 164 T1u = LD(&(x[WS(rs, 31)]), ms, &(x[WS(rs, 1)]));
Chris@42 165 T1x = LD(&(x[WS(rs, 15)]), ms, &(x[WS(rs, 1)]));
Chris@42 166 Tx = VZMUL(Tv, Tw);
Chris@42 167 T1R = LD(&(x[WS(rs, 7)]), ms, &(x[WS(rs, 1)]));
Chris@42 168 {
Chris@42 169 V T1P, T1H, T1J, TB;
Chris@42 170 T1P = LD(&(x[WS(rs, 23)]), ms, &(x[WS(rs, 1)]));
Chris@42 171 T1H = LD(&(x[WS(rs, 27)]), ms, &(x[WS(rs, 1)]));
Chris@42 172 T1J = LD(&(x[WS(rs, 11)]), ms, &(x[WS(rs, 1)]));
Chris@42 173 TB = VZMUL(Tz, TA);
Chris@42 174 T1v = VZMUL(T1t, T1u);
Chris@42 175 T1y = VZMUL(T1w, T1x);
Chris@42 176 Ty = VSUB(Tu, Tx);
Chris@42 177 T3e = VADD(Tu, Tx);
Chris@42 178 T1S = VZMUL(Tf, T1R);
Chris@42 179 T1Q = VZMUL(T1O, T1P);
Chris@42 180 T1I = VZMUL(T7, T1H);
Chris@42 181 T3f = VADD(TB, TE);
Chris@42 182 TF = VSUB(TB, TE);
Chris@42 183 T1K = VZMUL(T6, T1J);
Chris@42 184 T1A = LD(&(x[WS(rs, 3)]), ms, &(x[WS(rs, 1)]));
Chris@42 185 T1E = LD(&(x[WS(rs, 19)]), ms, &(x[WS(rs, 1)]));
Chris@42 186 }
Chris@42 187 }
Chris@42 188 T35 = VADD(T1v, T1y);
Chris@42 189 T1z = VSUB(T1v, T1y);
Chris@42 190 T1T = VSUB(T1Q, T1S);
Chris@42 191 T36 = VADD(T1S, T1Q);
Chris@42 192 T3y = VSUB(T3e, T3f);
Chris@42 193 T3g = VADD(T3e, T3f);
Chris@42 194 T27 = VFMA(LDK(KP414213562), Ty, TF);
Chris@42 195 TG = VFNMS(LDK(KP414213562), TF, Ty);
Chris@42 196 T39 = VADD(T1I, T1K);
Chris@42 197 T1L = VSUB(T1I, T1K);
Chris@42 198 T1B = VZMUL(T3, T1A);
Chris@42 199 T1F = VZMUL(T1D, T1E);
Chris@42 200 }
Chris@42 201 {
Chris@42 202 V T11, T14, T1o, T1l, T1e, T1U, T1M, T1g, T16, T19;
Chris@42 203 {
Chris@42 204 V T10, T13, T1n, T1k;
Chris@42 205 T10 = LD(&(x[WS(rs, 1)]), ms, &(x[WS(rs, 1)]));
Chris@42 206 T13 = LD(&(x[WS(rs, 17)]), ms, &(x[WS(rs, 1)]));
Chris@42 207 T1n = LD(&(x[WS(rs, 25)]), ms, &(x[WS(rs, 1)]));
Chris@42 208 T1k = LD(&(x[WS(rs, 9)]), ms, &(x[WS(rs, 1)]));
Chris@42 209 {
Chris@42 210 V T1d, T1f, T1G, T38;
Chris@42 211 T1d = LD(&(x[WS(rs, 29)]), ms, &(x[WS(rs, 1)]));
Chris@42 212 T1f = LD(&(x[WS(rs, 13)]), ms, &(x[WS(rs, 1)]));
Chris@42 213 T1G = VSUB(T1B, T1F);
Chris@42 214 T38 = VADD(T1B, T1F);
Chris@42 215 T37 = VADD(T35, T36);
Chris@42 216 T3F = VSUB(T35, T36);
Chris@42 217 T11 = VZMUL(T2, T10);
Chris@42 218 T14 = VZMUL(T12, T13);
Chris@42 219 T1o = VZMUL(T1m, T1n);
Chris@42 220 T1l = VZMUL(T5, T1k);
Chris@42 221 T1e = VZMUL(T1c, T1d);
Chris@42 222 T3G = VSUB(T39, T38);
Chris@42 223 T3a = VADD(T38, T39);
Chris@42 224 T1U = VSUB(T1L, T1G);
Chris@42 225 T1M = VADD(T1G, T1L);
Chris@42 226 T1g = VZMUL(TK, T1f);
Chris@42 227 }
Chris@42 228 T16 = LD(&(x[WS(rs, 5)]), ms, &(x[WS(rs, 1)]));
Chris@42 229 T19 = LD(&(x[WS(rs, 21)]), ms, &(x[WS(rs, 1)]));
Chris@42 230 }
Chris@42 231 T2Y = VADD(T11, T14);
Chris@42 232 T15 = VSUB(T11, T14);
Chris@42 233 T1p = VSUB(T1l, T1o);
Chris@42 234 T2Z = VADD(T1l, T1o);
Chris@42 235 T2w = VFNMS(LDK(KP707106781), T1U, T1T);
Chris@42 236 T1V = VFMA(LDK(KP707106781), T1U, T1T);
Chris@42 237 T2v = VFNMS(LDK(KP707106781), T1M, T1z);
Chris@42 238 T1N = VFMA(LDK(KP707106781), T1M, T1z);
Chris@42 239 T32 = VADD(T1e, T1g);
Chris@42 240 T1h = VSUB(T1e, T1g);
Chris@42 241 T17 = VZMUL(TP, T16);
Chris@42 242 T1a = VZMUL(T18, T19);
Chris@42 243 }
Chris@42 244 }
Chris@42 245 }
Chris@42 246 }
Chris@42 247 {
Chris@42 248 V T2X, T3k, T3b, T3t, T1b, T31, T30, T3C, T3r, T3v, T3p, T3q;
Chris@42 249 T2X = VSUB(T2T, T2W);
Chris@42 250 T3p = VADD(T2T, T2W);
Chris@42 251 T3q = VADD(T3g, T3j);
Chris@42 252 T3k = VSUB(T3g, T3j);
Chris@42 253 T3b = VSUB(T37, T3a);
Chris@42 254 T3t = VADD(T37, T3a);
Chris@42 255 T1b = VSUB(T17, T1a);
Chris@42 256 T31 = VADD(T17, T1a);
Chris@42 257 T30 = VADD(T2Y, T2Z);
Chris@42 258 T3C = VSUB(T2Y, T2Z);
Chris@42 259 T3r = VSUB(T3p, T3q);
Chris@42 260 T3v = VADD(T3p, T3q);
Chris@42 261 {
Chris@42 262 V T1r, T2t, T1j, T2s, T3S, T3Y, T3R, T3V;
Chris@42 263 {
Chris@42 264 V T3B, T3T, T3M, T3W, T3U, T3P, T3X, T3I, T3l, T3c, T3w, T3u;
Chris@42 265 {
Chris@42 266 V T3L, T3A, T33, T3D, T1i, T1q, T3O, T3H;
Chris@42 267 T3L = VSUB(T3y, T3z);
Chris@42 268 T3A = VADD(T3y, T3z);
Chris@42 269 T33 = VADD(T31, T32);
Chris@42 270 T3D = VSUB(T31, T32);
Chris@42 271 T1i = VADD(T1b, T1h);
Chris@42 272 T1q = VSUB(T1b, T1h);
Chris@42 273 T3O = VFMA(LDK(KP414213562), T3F, T3G);
Chris@42 274 T3H = VFNMS(LDK(KP414213562), T3G, T3F);
Chris@42 275 T3B = VFMA(LDK(KP707106781), T3A, T3x);
Chris@42 276 T3T = VFNMS(LDK(KP707106781), T3A, T3x);
Chris@42 277 T3M = VFMA(LDK(KP707106781), T3L, T3K);
Chris@42 278 T3W = VFNMS(LDK(KP707106781), T3L, T3K);
Chris@42 279 {
Chris@42 280 V T3E, T3N, T3s, T34;
Chris@42 281 T3E = VFNMS(LDK(KP414213562), T3D, T3C);
Chris@42 282 T3N = VFMA(LDK(KP414213562), T3C, T3D);
Chris@42 283 T3s = VADD(T30, T33);
Chris@42 284 T34 = VSUB(T30, T33);
Chris@42 285 T1r = VFMA(LDK(KP707106781), T1q, T1p);
Chris@42 286 T2t = VFNMS(LDK(KP707106781), T1q, T1p);
Chris@42 287 T1j = VFMA(LDK(KP707106781), T1i, T15);
Chris@42 288 T2s = VFNMS(LDK(KP707106781), T1i, T15);
Chris@42 289 T3U = VADD(T3N, T3O);
Chris@42 290 T3P = VSUB(T3N, T3O);
Chris@42 291 T3X = VSUB(T3E, T3H);
Chris@42 292 T3I = VADD(T3E, T3H);
Chris@42 293 T3l = VSUB(T34, T3b);
Chris@42 294 T3c = VADD(T34, T3b);
Chris@42 295 T3w = VADD(T3s, T3t);
Chris@42 296 T3u = VSUB(T3s, T3t);
Chris@42 297 }
Chris@42 298 }
Chris@42 299 {
Chris@42 300 V T40, T3Z, T3Q, T3J;
Chris@42 301 T3S = VFMA(LDK(KP923879532), T3P, T3M);
Chris@42 302 T3Q = VFNMS(LDK(KP923879532), T3P, T3M);
Chris@42 303 T40 = VFNMS(LDK(KP923879532), T3X, T3W);
Chris@42 304 T3Y = VFMA(LDK(KP923879532), T3X, T3W);
Chris@42 305 T3R = VFMA(LDK(KP923879532), T3I, T3B);
Chris@42 306 T3J = VFNMS(LDK(KP923879532), T3I, T3B);
Chris@42 307 {
Chris@42 308 V T3o, T3m, T3n, T3d;
Chris@42 309 T3o = VFMA(LDK(KP707106781), T3l, T3k);
Chris@42 310 T3m = VFNMS(LDK(KP707106781), T3l, T3k);
Chris@42 311 T3n = VFMA(LDK(KP707106781), T3c, T2X);
Chris@42 312 T3d = VFNMS(LDK(KP707106781), T3c, T2X);
Chris@42 313 ST(&(x[WS(rs, 16)]), VSUB(T3v, T3w), ms, &(x[0]));
Chris@42 314 ST(&(x[0]), VADD(T3v, T3w), ms, &(x[0]));
Chris@42 315 ST(&(x[WS(rs, 8)]), VFMAI(T3u, T3r), ms, &(x[0]));
Chris@42 316 ST(&(x[WS(rs, 24)]), VFNMSI(T3u, T3r), ms, &(x[0]));
Chris@42 317 T3Z = VFMA(LDK(KP923879532), T3U, T3T);
Chris@42 318 T3V = VFNMS(LDK(KP923879532), T3U, T3T);
Chris@42 319 ST(&(x[WS(rs, 18)]), VFMAI(T3Q, T3J), ms, &(x[0]));
Chris@42 320 ST(&(x[WS(rs, 14)]), VFNMSI(T3Q, T3J), ms, &(x[0]));
Chris@42 321 ST(&(x[WS(rs, 28)]), VFNMSI(T3o, T3n), ms, &(x[0]));
Chris@42 322 ST(&(x[WS(rs, 4)]), VFMAI(T3o, T3n), ms, &(x[0]));
Chris@42 323 ST(&(x[WS(rs, 20)]), VFMAI(T3m, T3d), ms, &(x[0]));
Chris@42 324 ST(&(x[WS(rs, 12)]), VFNMSI(T3m, T3d), ms, &(x[0]));
Chris@42 325 }
Chris@42 326 ST(&(x[WS(rs, 26)]), VFMAI(T40, T3Z), ms, &(x[0]));
Chris@42 327 ST(&(x[WS(rs, 6)]), VFNMSI(T40, T3Z), ms, &(x[0]));
Chris@42 328 }
Chris@42 329 }
Chris@42 330 {
Chris@42 331 V T2p, T1s, T1W, T2h, TZ, T2i, T2d, T26, T29, T2q;
Chris@42 332 {
Chris@42 333 V Ts, TY, T2b, T2c;
Chris@42 334 T2p = VFNMS(LDK(KP707106781), Tr, Tb);
Chris@42 335 Ts = VFMA(LDK(KP707106781), Tr, Tb);
Chris@42 336 TY = VADD(TG, TX);
Chris@42 337 T2B = VSUB(TG, TX);
Chris@42 338 T1s = VFNMS(LDK(KP198912367), T1r, T1j);
Chris@42 339 T2b = VFMA(LDK(KP198912367), T1j, T1r);
Chris@42 340 T2c = VFMA(LDK(KP198912367), T1N, T1V);
Chris@42 341 T1W = VFNMS(LDK(KP198912367), T1V, T1N);
Chris@42 342 ST(&(x[WS(rs, 2)]), VFMAI(T3S, T3R), ms, &(x[0]));
Chris@42 343 ST(&(x[WS(rs, 30)]), VFNMSI(T3S, T3R), ms, &(x[0]));
Chris@42 344 ST(&(x[WS(rs, 22)]), VFNMSI(T3Y, T3V), ms, &(x[0]));
Chris@42 345 ST(&(x[WS(rs, 10)]), VFMAI(T3Y, T3V), ms, &(x[0]));
Chris@42 346 T2h = VFNMS(LDK(KP923879532), TY, Ts);
Chris@42 347 TZ = VFMA(LDK(KP923879532), TY, Ts);
Chris@42 348 T2i = VADD(T2b, T2c);
Chris@42 349 T2d = VSUB(T2b, T2c);
Chris@42 350 T2A = VFNMS(LDK(KP707106781), T25, T24);
Chris@42 351 T26 = VFMA(LDK(KP707106781), T25, T24);
Chris@42 352 T29 = VSUB(T27, T28);
Chris@42 353 T2q = VADD(T27, T28);
Chris@42 354 }
Chris@42 355 {
Chris@42 356 V T2J, T2r, T2K, T2y;
Chris@42 357 {
Chris@42 358 V T2u, T2D, T2j, T2n, T2l, T1X, T2k, T2a, T2E, T2x;
Chris@42 359 T2u = VFMA(LDK(KP668178637), T2t, T2s);
Chris@42 360 T2D = VFNMS(LDK(KP668178637), T2s, T2t);
Chris@42 361 T2j = VFNMS(LDK(KP980785280), T2i, T2h);
Chris@42 362 T2n = VFMA(LDK(KP980785280), T2i, T2h);
Chris@42 363 T2l = VSUB(T1s, T1W);
Chris@42 364 T1X = VADD(T1s, T1W);
Chris@42 365 T2k = VFNMS(LDK(KP923879532), T29, T26);
Chris@42 366 T2a = VFMA(LDK(KP923879532), T29, T26);
Chris@42 367 T2J = VFNMS(LDK(KP923879532), T2q, T2p);
Chris@42 368 T2r = VFMA(LDK(KP923879532), T2q, T2p);
Chris@42 369 T2E = VFNMS(LDK(KP668178637), T2v, T2w);
Chris@42 370 T2x = VFMA(LDK(KP668178637), T2w, T2v);
Chris@42 371 {
Chris@42 372 V T1Y, T2f, T2o, T2m, T2e, T2g;
Chris@42 373 T1Y = VFNMS(LDK(KP980785280), T1X, TZ);
Chris@42 374 T2f = VFMA(LDK(KP980785280), T1X, TZ);
Chris@42 375 T2o = VFNMS(LDK(KP980785280), T2l, T2k);
Chris@42 376 T2m = VFMA(LDK(KP980785280), T2l, T2k);
Chris@42 377 T2e = VFNMS(LDK(KP980785280), T2d, T2a);
Chris@42 378 T2g = VFMA(LDK(KP980785280), T2d, T2a);
Chris@42 379 T2F = VSUB(T2D, T2E);
Chris@42 380 T2K = VADD(T2D, T2E);
Chris@42 381 T2N = VSUB(T2u, T2x);
Chris@42 382 T2y = VADD(T2u, T2x);
Chris@42 383 ST(&(x[WS(rs, 23)]), VFNMSI(T2m, T2j), ms, &(x[WS(rs, 1)]));
Chris@42 384 ST(&(x[WS(rs, 9)]), VFMAI(T2m, T2j), ms, &(x[WS(rs, 1)]));
Chris@42 385 ST(&(x[WS(rs, 25)]), VFMAI(T2o, T2n), ms, &(x[WS(rs, 1)]));
Chris@42 386 ST(&(x[WS(rs, 7)]), VFNMSI(T2o, T2n), ms, &(x[WS(rs, 1)]));
Chris@42 387 ST(&(x[WS(rs, 1)]), VFMAI(T2g, T2f), ms, &(x[WS(rs, 1)]));
Chris@42 388 ST(&(x[WS(rs, 31)]), VFNMSI(T2g, T2f), ms, &(x[WS(rs, 1)]));
Chris@42 389 ST(&(x[WS(rs, 17)]), VFMAI(T2e, T1Y), ms, &(x[WS(rs, 1)]));
Chris@42 390 ST(&(x[WS(rs, 15)]), VFNMSI(T2e, T1Y), ms, &(x[WS(rs, 1)]));
Chris@42 391 }
Chris@42 392 }
Chris@42 393 T2H = VFMA(LDK(KP831469612), T2y, T2r);
Chris@42 394 T2z = VFNMS(LDK(KP831469612), T2y, T2r);
Chris@42 395 T2P = VFNMS(LDK(KP831469612), T2K, T2J);
Chris@42 396 T2L = VFMA(LDK(KP831469612), T2K, T2J);
Chris@42 397 }
Chris@42 398 }
Chris@42 399 }
Chris@42 400 }
Chris@42 401 }
Chris@42 402 }
Chris@42 403 T2C = VFNMS(LDK(KP923879532), T2B, T2A);
Chris@42 404 T2M = VFMA(LDK(KP923879532), T2B, T2A);
Chris@42 405 {
Chris@42 406 V T2Q, T2O, T2G, T2I;
Chris@42 407 T2Q = VFMA(LDK(KP831469612), T2N, T2M);
Chris@42 408 T2O = VFNMS(LDK(KP831469612), T2N, T2M);
Chris@42 409 T2G = VFNMS(LDK(KP831469612), T2F, T2C);
Chris@42 410 T2I = VFMA(LDK(KP831469612), T2F, T2C);
Chris@42 411 ST(&(x[WS(rs, 21)]), VFMAI(T2O, T2L), ms, &(x[WS(rs, 1)]));
Chris@42 412 ST(&(x[WS(rs, 11)]), VFNMSI(T2O, T2L), ms, &(x[WS(rs, 1)]));
Chris@42 413 ST(&(x[WS(rs, 27)]), VFNMSI(T2Q, T2P), ms, &(x[WS(rs, 1)]));
Chris@42 414 ST(&(x[WS(rs, 5)]), VFMAI(T2Q, T2P), ms, &(x[WS(rs, 1)]));
Chris@42 415 ST(&(x[WS(rs, 29)]), VFMAI(T2I, T2H), ms, &(x[WS(rs, 1)]));
Chris@42 416 ST(&(x[WS(rs, 3)]), VFNMSI(T2I, T2H), ms, &(x[WS(rs, 1)]));
Chris@42 417 ST(&(x[WS(rs, 13)]), VFMAI(T2G, T2z), ms, &(x[WS(rs, 1)]));
Chris@42 418 ST(&(x[WS(rs, 19)]), VFNMSI(T2G, T2z), ms, &(x[WS(rs, 1)]));
Chris@42 419 }
Chris@42 420 }
Chris@42 421 }
Chris@42 422 VLEAVE();
Chris@42 423 }
Chris@42 424
Chris@42 425 static const tw_instr twinstr[] = {
Chris@42 426 VTW(0, 1),
Chris@42 427 VTW(0, 3),
Chris@42 428 VTW(0, 9),
Chris@42 429 VTW(0, 27),
Chris@42 430 {TW_NEXT, VL, 0}
Chris@42 431 };
Chris@42 432
Chris@42 433 static const ct_desc desc = { 32, XSIMD_STRING("t3bv_32"), twinstr, &GENUS, {146, 116, 98, 0}, 0, 0, 0 };
Chris@42 434
Chris@42 435 void XSIMD(codelet_t3bv_32) (planner *p) {
Chris@42 436 X(kdft_dit_register) (p, t3bv_32, &desc);
Chris@42 437 }
Chris@42 438 #else /* HAVE_FMA */
Chris@42 439
Chris@42 440 /* Generated by: ../../../genfft/gen_twiddle_c.native -simd -compact -variables 4 -pipeline-latency 8 -twiddle-log3 -precompute-twiddles -no-generate-bytw -n 32 -name t3bv_32 -include t3b.h -sign 1 */
Chris@42 441
Chris@42 442 /*
Chris@42 443 * This function contains 244 FP additions, 158 FP multiplications,
Chris@42 444 * (or, 228 additions, 142 multiplications, 16 fused multiply/add),
Chris@42 445 * 90 stack variables, 7 constants, and 64 memory accesses
Chris@42 446 */
Chris@42 447 #include "t3b.h"
Chris@42 448
Chris@42 449 static void t3bv_32(R *ri, R *ii, const R *W, stride rs, INT mb, INT me, INT ms)
Chris@42 450 {
Chris@42 451 DVK(KP195090322, +0.195090322016128267848284868477022240927691618);
Chris@42 452 DVK(KP980785280, +0.980785280403230449126182236134239036973933731);
Chris@42 453 DVK(KP555570233, +0.555570233019602224742830813948532874374937191);
Chris@42 454 DVK(KP831469612, +0.831469612302545237078788377617905756738560812);
Chris@42 455 DVK(KP382683432, +0.382683432365089771728459984030398866761344562);
Chris@42 456 DVK(KP923879532, +0.923879532511286756128183189396788286822416626);
Chris@42 457 DVK(KP707106781, +0.707106781186547524400844362104849039284835938);
Chris@42 458 {
Chris@42 459 INT m;
Chris@42 460 R *x;
Chris@42 461 x = ii;
Chris@42 462 for (m = mb, W = W + (mb * ((TWVL / VL) * 8)); m < me; m = m + VL, x = x + (VL * ms), W = W + (TWVL * 8), MAKE_VOLATILE_STRIDE(32, rs)) {
Chris@42 463 V T2, T5, T3, T4, Tc, T1v, TH, Tz, Tn, T6, TS, Tf, TK, T7, T8;
Chris@42 464 V Tv, T1I, T25, Tg, Tk, T1N, T1Q, TC, T16, T12, T1w, TL, TP, TT, T1m;
Chris@42 465 V T1f;
Chris@42 466 T2 = LDW(&(W[0]));
Chris@42 467 T5 = LDW(&(W[TWVL * 4]));
Chris@42 468 T3 = LDW(&(W[TWVL * 2]));
Chris@42 469 T4 = VZMULJ(T2, T3);
Chris@42 470 Tc = VZMUL(T2, T3);
Chris@42 471 T1v = VZMULJ(T2, T5);
Chris@42 472 TH = VZMULJ(T3, T5);
Chris@42 473 Tz = VZMUL(T2, T5);
Chris@42 474 Tn = VZMUL(T3, T5);
Chris@42 475 T6 = VZMUL(T4, T5);
Chris@42 476 TS = VZMUL(Tc, T5);
Chris@42 477 Tf = VZMULJ(T4, T5);
Chris@42 478 TK = VZMULJ(Tc, T5);
Chris@42 479 T7 = LDW(&(W[TWVL * 6]));
Chris@42 480 T8 = VZMULJ(T6, T7);
Chris@42 481 Tv = VZMULJ(T5, T7);
Chris@42 482 T1I = VZMULJ(Tc, T7);
Chris@42 483 T25 = VZMULJ(T3, T7);
Chris@42 484 Tg = VZMULJ(Tf, T7);
Chris@42 485 Tk = VZMUL(T2, T7);
Chris@42 486 T1N = VZMUL(Tc, T7);
Chris@42 487 T1Q = VZMULJ(Tn, T7);
Chris@42 488 TC = VZMULJ(T2, T7);
Chris@42 489 T16 = VZMUL(T4, T7);
Chris@42 490 T12 = VZMULJ(TH, T7);
Chris@42 491 T1w = VZMULJ(T1v, T7);
Chris@42 492 TL = VZMULJ(TK, T7);
Chris@42 493 TP = VZMUL(T3, T7);
Chris@42 494 TT = VZMULJ(TS, T7);
Chris@42 495 T1m = VZMULJ(Tz, T7);
Chris@42 496 T1f = VZMULJ(T4, T7);
Chris@42 497 {
Chris@42 498 V Tb, T28, T3k, T3M, Tr, T22, T3f, T3N, TX, T20, T3b, T3J, TG, T1Z, T38;
Chris@42 499 V T3I, T1M, T2v, T33, T3F, T1V, T2w, T30, T3E, T1j, T2s, T2W, T3C, T1r, T2t;
Chris@42 500 V T2T, T3B;
Chris@42 501 {
Chris@42 502 V T1, T27, Ta, T24, T26, T9, T23, T3i, T3j;
Chris@42 503 T1 = LD(&(x[0]), ms, &(x[0]));
Chris@42 504 T26 = LD(&(x[WS(rs, 24)]), ms, &(x[0]));
Chris@42 505 T27 = VZMUL(T25, T26);
Chris@42 506 T9 = LD(&(x[WS(rs, 16)]), ms, &(x[0]));
Chris@42 507 Ta = VZMUL(T8, T9);
Chris@42 508 T23 = LD(&(x[WS(rs, 8)]), ms, &(x[0]));
Chris@42 509 T24 = VZMUL(T1v, T23);
Chris@42 510 Tb = VSUB(T1, Ta);
Chris@42 511 T28 = VSUB(T24, T27);
Chris@42 512 T3i = VADD(T1, Ta);
Chris@42 513 T3j = VADD(T24, T27);
Chris@42 514 T3k = VSUB(T3i, T3j);
Chris@42 515 T3M = VADD(T3i, T3j);
Chris@42 516 }
Chris@42 517 {
Chris@42 518 V Te, Tp, Ti, Tm;
Chris@42 519 {
Chris@42 520 V Td, To, Th, Tl;
Chris@42 521 Td = LD(&(x[WS(rs, 4)]), ms, &(x[0]));
Chris@42 522 Te = VZMUL(Tc, Td);
Chris@42 523 To = LD(&(x[WS(rs, 12)]), ms, &(x[0]));
Chris@42 524 Tp = VZMUL(Tn, To);
Chris@42 525 Th = LD(&(x[WS(rs, 20)]), ms, &(x[0]));
Chris@42 526 Ti = VZMUL(Tg, Th);
Chris@42 527 Tl = LD(&(x[WS(rs, 28)]), ms, &(x[0]));
Chris@42 528 Tm = VZMUL(Tk, Tl);
Chris@42 529 }
Chris@42 530 {
Chris@42 531 V Tj, Tq, T3d, T3e;
Chris@42 532 Tj = VSUB(Te, Ti);
Chris@42 533 Tq = VSUB(Tm, Tp);
Chris@42 534 Tr = VMUL(LDK(KP707106781), VADD(Tj, Tq));
Chris@42 535 T22 = VMUL(LDK(KP707106781), VSUB(Tj, Tq));
Chris@42 536 T3d = VADD(Te, Ti);
Chris@42 537 T3e = VADD(Tm, Tp);
Chris@42 538 T3f = VSUB(T3d, T3e);
Chris@42 539 T3N = VADD(T3d, T3e);
Chris@42 540 }
Chris@42 541 }
Chris@42 542 {
Chris@42 543 V TJ, TV, TN, TR;
Chris@42 544 {
Chris@42 545 V TI, TU, TM, TQ;
Chris@42 546 TI = LD(&(x[WS(rs, 6)]), ms, &(x[0]));
Chris@42 547 TJ = VZMUL(TH, TI);
Chris@42 548 TU = LD(&(x[WS(rs, 14)]), ms, &(x[0]));
Chris@42 549 TV = VZMUL(TT, TU);
Chris@42 550 TM = LD(&(x[WS(rs, 22)]), ms, &(x[0]));
Chris@42 551 TN = VZMUL(TL, TM);
Chris@42 552 TQ = LD(&(x[WS(rs, 30)]), ms, &(x[0]));
Chris@42 553 TR = VZMUL(TP, TQ);
Chris@42 554 }
Chris@42 555 {
Chris@42 556 V TO, TW, T39, T3a;
Chris@42 557 TO = VSUB(TJ, TN);
Chris@42 558 TW = VSUB(TR, TV);
Chris@42 559 TX = VFNMS(LDK(KP382683432), TW, VMUL(LDK(KP923879532), TO));
Chris@42 560 T20 = VFMA(LDK(KP923879532), TW, VMUL(LDK(KP382683432), TO));
Chris@42 561 T39 = VADD(TR, TV);
Chris@42 562 T3a = VADD(TJ, TN);
Chris@42 563 T3b = VSUB(T39, T3a);
Chris@42 564 T3J = VADD(T39, T3a);
Chris@42 565 }
Chris@42 566 }
Chris@42 567 {
Chris@42 568 V Tu, TE, Tx, TB;
Chris@42 569 {
Chris@42 570 V Tt, TD, Tw, TA;
Chris@42 571 Tt = LD(&(x[WS(rs, 2)]), ms, &(x[0]));
Chris@42 572 Tu = VZMUL(T4, Tt);
Chris@42 573 TD = LD(&(x[WS(rs, 26)]), ms, &(x[0]));
Chris@42 574 TE = VZMUL(TC, TD);
Chris@42 575 Tw = LD(&(x[WS(rs, 18)]), ms, &(x[0]));
Chris@42 576 Tx = VZMUL(Tv, Tw);
Chris@42 577 TA = LD(&(x[WS(rs, 10)]), ms, &(x[0]));
Chris@42 578 TB = VZMUL(Tz, TA);
Chris@42 579 }
Chris@42 580 {
Chris@42 581 V Ty, TF, T36, T37;
Chris@42 582 Ty = VSUB(Tu, Tx);
Chris@42 583 TF = VSUB(TB, TE);
Chris@42 584 TG = VFMA(LDK(KP382683432), Ty, VMUL(LDK(KP923879532), TF));
Chris@42 585 T1Z = VFNMS(LDK(KP382683432), TF, VMUL(LDK(KP923879532), Ty));
Chris@42 586 T36 = VADD(Tu, Tx);
Chris@42 587 T37 = VADD(TB, TE);
Chris@42 588 T38 = VSUB(T36, T37);
Chris@42 589 T3I = VADD(T36, T37);
Chris@42 590 }
Chris@42 591 }
Chris@42 592 {
Chris@42 593 V T1H, T1K, T1S, T1P, T1B, T1D, T1E, T1u, T1y, T1z;
Chris@42 594 {
Chris@42 595 V T1G, T1J, T1R, T1O;
Chris@42 596 T1G = LD(&(x[WS(rs, 7)]), ms, &(x[WS(rs, 1)]));
Chris@42 597 T1H = VZMUL(Tf, T1G);
Chris@42 598 T1J = LD(&(x[WS(rs, 23)]), ms, &(x[WS(rs, 1)]));
Chris@42 599 T1K = VZMUL(T1I, T1J);
Chris@42 600 T1R = LD(&(x[WS(rs, 15)]), ms, &(x[WS(rs, 1)]));
Chris@42 601 T1S = VZMUL(T1Q, T1R);
Chris@42 602 T1O = LD(&(x[WS(rs, 31)]), ms, &(x[WS(rs, 1)]));
Chris@42 603 T1P = VZMUL(T1N, T1O);
Chris@42 604 {
Chris@42 605 V T1A, T1C, T1t, T1x;
Chris@42 606 T1A = LD(&(x[WS(rs, 27)]), ms, &(x[WS(rs, 1)]));
Chris@42 607 T1B = VZMUL(T7, T1A);
Chris@42 608 T1C = LD(&(x[WS(rs, 11)]), ms, &(x[WS(rs, 1)]));
Chris@42 609 T1D = VZMUL(T6, T1C);
Chris@42 610 T1E = VSUB(T1B, T1D);
Chris@42 611 T1t = LD(&(x[WS(rs, 3)]), ms, &(x[WS(rs, 1)]));
Chris@42 612 T1u = VZMUL(T3, T1t);
Chris@42 613 T1x = LD(&(x[WS(rs, 19)]), ms, &(x[WS(rs, 1)]));
Chris@42 614 T1y = VZMUL(T1w, T1x);
Chris@42 615 T1z = VSUB(T1u, T1y);
Chris@42 616 }
Chris@42 617 }
Chris@42 618 {
Chris@42 619 V T1F, T1L, T31, T32;
Chris@42 620 T1F = VMUL(LDK(KP707106781), VSUB(T1z, T1E));
Chris@42 621 T1L = VSUB(T1H, T1K);
Chris@42 622 T1M = VSUB(T1F, T1L);
Chris@42 623 T2v = VADD(T1L, T1F);
Chris@42 624 T31 = VADD(T1u, T1y);
Chris@42 625 T32 = VADD(T1B, T1D);
Chris@42 626 T33 = VSUB(T31, T32);
Chris@42 627 T3F = VADD(T31, T32);
Chris@42 628 }
Chris@42 629 {
Chris@42 630 V T1T, T1U, T2Y, T2Z;
Chris@42 631 T1T = VSUB(T1P, T1S);
Chris@42 632 T1U = VMUL(LDK(KP707106781), VADD(T1z, T1E));
Chris@42 633 T1V = VSUB(T1T, T1U);
Chris@42 634 T2w = VADD(T1T, T1U);
Chris@42 635 T2Y = VADD(T1P, T1S);
Chris@42 636 T2Z = VADD(T1H, T1K);
Chris@42 637 T30 = VSUB(T2Y, T2Z);
Chris@42 638 T3E = VADD(T2Y, T2Z);
Chris@42 639 }
Chris@42 640 }
Chris@42 641 {
Chris@42 642 V T1e, T1h, T1o, T1l, T18, T1a, T1b, T11, T14, T15;
Chris@42 643 {
Chris@42 644 V T1d, T1g, T1n, T1k;
Chris@42 645 T1d = LD(&(x[WS(rs, 9)]), ms, &(x[WS(rs, 1)]));
Chris@42 646 T1e = VZMUL(T5, T1d);
Chris@42 647 T1g = LD(&(x[WS(rs, 25)]), ms, &(x[WS(rs, 1)]));
Chris@42 648 T1h = VZMUL(T1f, T1g);
Chris@42 649 T1n = LD(&(x[WS(rs, 17)]), ms, &(x[WS(rs, 1)]));
Chris@42 650 T1o = VZMUL(T1m, T1n);
Chris@42 651 T1k = LD(&(x[WS(rs, 1)]), ms, &(x[WS(rs, 1)]));
Chris@42 652 T1l = VZMUL(T2, T1k);
Chris@42 653 {
Chris@42 654 V T17, T19, T10, T13;
Chris@42 655 T17 = LD(&(x[WS(rs, 29)]), ms, &(x[WS(rs, 1)]));
Chris@42 656 T18 = VZMUL(T16, T17);
Chris@42 657 T19 = LD(&(x[WS(rs, 13)]), ms, &(x[WS(rs, 1)]));
Chris@42 658 T1a = VZMUL(TS, T19);
Chris@42 659 T1b = VSUB(T18, T1a);
Chris@42 660 T10 = LD(&(x[WS(rs, 5)]), ms, &(x[WS(rs, 1)]));
Chris@42 661 T11 = VZMUL(TK, T10);
Chris@42 662 T13 = LD(&(x[WS(rs, 21)]), ms, &(x[WS(rs, 1)]));
Chris@42 663 T14 = VZMUL(T12, T13);
Chris@42 664 T15 = VSUB(T11, T14);
Chris@42 665 }
Chris@42 666 }
Chris@42 667 {
Chris@42 668 V T1c, T1i, T2U, T2V;
Chris@42 669 T1c = VMUL(LDK(KP707106781), VSUB(T15, T1b));
Chris@42 670 T1i = VSUB(T1e, T1h);
Chris@42 671 T1j = VSUB(T1c, T1i);
Chris@42 672 T2s = VADD(T1i, T1c);
Chris@42 673 T2U = VADD(T11, T14);
Chris@42 674 T2V = VADD(T18, T1a);
Chris@42 675 T2W = VSUB(T2U, T2V);
Chris@42 676 T3C = VADD(T2U, T2V);
Chris@42 677 }
Chris@42 678 {
Chris@42 679 V T1p, T1q, T2R, T2S;
Chris@42 680 T1p = VSUB(T1l, T1o);
Chris@42 681 T1q = VMUL(LDK(KP707106781), VADD(T15, T1b));
Chris@42 682 T1r = VSUB(T1p, T1q);
Chris@42 683 T2t = VADD(T1p, T1q);
Chris@42 684 T2R = VADD(T1l, T1o);
Chris@42 685 T2S = VADD(T1e, T1h);
Chris@42 686 T2T = VSUB(T2R, T2S);
Chris@42 687 T3B = VADD(T2R, T2S);
Chris@42 688 }
Chris@42 689 }
Chris@42 690 {
Chris@42 691 V T3V, T3Z, T3Y, T40;
Chris@42 692 {
Chris@42 693 V T3T, T3U, T3W, T3X;
Chris@42 694 T3T = VADD(T3M, T3N);
Chris@42 695 T3U = VADD(T3I, T3J);
Chris@42 696 T3V = VSUB(T3T, T3U);
Chris@42 697 T3Z = VADD(T3T, T3U);
Chris@42 698 T3W = VADD(T3B, T3C);
Chris@42 699 T3X = VADD(T3E, T3F);
Chris@42 700 T3Y = VBYI(VSUB(T3W, T3X));
Chris@42 701 T40 = VADD(T3W, T3X);
Chris@42 702 }
Chris@42 703 ST(&(x[WS(rs, 24)]), VSUB(T3V, T3Y), ms, &(x[0]));
Chris@42 704 ST(&(x[0]), VADD(T3Z, T40), ms, &(x[0]));
Chris@42 705 ST(&(x[WS(rs, 8)]), VADD(T3V, T3Y), ms, &(x[0]));
Chris@42 706 ST(&(x[WS(rs, 16)]), VSUB(T3Z, T40), ms, &(x[0]));
Chris@42 707 }
Chris@42 708 {
Chris@42 709 V T3K, T3O, T3H, T3P, T3D, T3G;
Chris@42 710 T3K = VSUB(T3I, T3J);
Chris@42 711 T3O = VSUB(T3M, T3N);
Chris@42 712 T3D = VSUB(T3B, T3C);
Chris@42 713 T3G = VSUB(T3E, T3F);
Chris@42 714 T3H = VMUL(LDK(KP707106781), VSUB(T3D, T3G));
Chris@42 715 T3P = VMUL(LDK(KP707106781), VADD(T3D, T3G));
Chris@42 716 {
Chris@42 717 V T3L, T3Q, T3R, T3S;
Chris@42 718 T3L = VBYI(VSUB(T3H, T3K));
Chris@42 719 T3Q = VSUB(T3O, T3P);
Chris@42 720 ST(&(x[WS(rs, 12)]), VADD(T3L, T3Q), ms, &(x[0]));
Chris@42 721 ST(&(x[WS(rs, 20)]), VSUB(T3Q, T3L), ms, &(x[0]));
Chris@42 722 T3R = VBYI(VADD(T3K, T3H));
Chris@42 723 T3S = VADD(T3O, T3P);
Chris@42 724 ST(&(x[WS(rs, 4)]), VADD(T3R, T3S), ms, &(x[0]));
Chris@42 725 ST(&(x[WS(rs, 28)]), VSUB(T3S, T3R), ms, &(x[0]));
Chris@42 726 }
Chris@42 727 }
Chris@42 728 {
Chris@42 729 V T3g, T3w, T3m, T3t, T35, T3u, T3p, T3x, T3c, T3l;
Chris@42 730 T3c = VMUL(LDK(KP707106781), VSUB(T38, T3b));
Chris@42 731 T3g = VSUB(T3c, T3f);
Chris@42 732 T3w = VADD(T3f, T3c);
Chris@42 733 T3l = VMUL(LDK(KP707106781), VADD(T38, T3b));
Chris@42 734 T3m = VSUB(T3k, T3l);
Chris@42 735 T3t = VADD(T3k, T3l);
Chris@42 736 {
Chris@42 737 V T2X, T34, T3n, T3o;
Chris@42 738 T2X = VFNMS(LDK(KP382683432), T2W, VMUL(LDK(KP923879532), T2T));
Chris@42 739 T34 = VFMA(LDK(KP923879532), T30, VMUL(LDK(KP382683432), T33));
Chris@42 740 T35 = VSUB(T2X, T34);
Chris@42 741 T3u = VADD(T2X, T34);
Chris@42 742 T3n = VFMA(LDK(KP382683432), T2T, VMUL(LDK(KP923879532), T2W));
Chris@42 743 T3o = VFNMS(LDK(KP382683432), T30, VMUL(LDK(KP923879532), T33));
Chris@42 744 T3p = VSUB(T3n, T3o);
Chris@42 745 T3x = VADD(T3n, T3o);
Chris@42 746 }
Chris@42 747 {
Chris@42 748 V T3h, T3q, T3z, T3A;
Chris@42 749 T3h = VBYI(VSUB(T35, T3g));
Chris@42 750 T3q = VSUB(T3m, T3p);
Chris@42 751 ST(&(x[WS(rs, 10)]), VADD(T3h, T3q), ms, &(x[0]));
Chris@42 752 ST(&(x[WS(rs, 22)]), VSUB(T3q, T3h), ms, &(x[0]));
Chris@42 753 T3z = VSUB(T3t, T3u);
Chris@42 754 T3A = VBYI(VSUB(T3x, T3w));
Chris@42 755 ST(&(x[WS(rs, 18)]), VSUB(T3z, T3A), ms, &(x[0]));
Chris@42 756 ST(&(x[WS(rs, 14)]), VADD(T3z, T3A), ms, &(x[0]));
Chris@42 757 }
Chris@42 758 {
Chris@42 759 V T3r, T3s, T3v, T3y;
Chris@42 760 T3r = VBYI(VADD(T3g, T35));
Chris@42 761 T3s = VADD(T3m, T3p);
Chris@42 762 ST(&(x[WS(rs, 6)]), VADD(T3r, T3s), ms, &(x[0]));
Chris@42 763 ST(&(x[WS(rs, 26)]), VSUB(T3s, T3r), ms, &(x[0]));
Chris@42 764 T3v = VADD(T3t, T3u);
Chris@42 765 T3y = VBYI(VADD(T3w, T3x));
Chris@42 766 ST(&(x[WS(rs, 30)]), VSUB(T3v, T3y), ms, &(x[0]));
Chris@42 767 ST(&(x[WS(rs, 2)]), VADD(T3v, T3y), ms, &(x[0]));
Chris@42 768 }
Chris@42 769 }
Chris@42 770 {
Chris@42 771 V TZ, T2k, T2d, T2l, T1X, T2h, T2a, T2i;
Chris@42 772 {
Chris@42 773 V Ts, TY, T2b, T2c;
Chris@42 774 Ts = VSUB(Tb, Tr);
Chris@42 775 TY = VSUB(TG, TX);
Chris@42 776 TZ = VSUB(Ts, TY);
Chris@42 777 T2k = VADD(Ts, TY);
Chris@42 778 T2b = VFNMS(LDK(KP555570233), T1j, VMUL(LDK(KP831469612), T1r));
Chris@42 779 T2c = VFMA(LDK(KP555570233), T1M, VMUL(LDK(KP831469612), T1V));
Chris@42 780 T2d = VSUB(T2b, T2c);
Chris@42 781 T2l = VADD(T2b, T2c);
Chris@42 782 }
Chris@42 783 {
Chris@42 784 V T1s, T1W, T21, T29;
Chris@42 785 T1s = VFMA(LDK(KP831469612), T1j, VMUL(LDK(KP555570233), T1r));
Chris@42 786 T1W = VFNMS(LDK(KP555570233), T1V, VMUL(LDK(KP831469612), T1M));
Chris@42 787 T1X = VSUB(T1s, T1W);
Chris@42 788 T2h = VADD(T1s, T1W);
Chris@42 789 T21 = VSUB(T1Z, T20);
Chris@42 790 T29 = VSUB(T22, T28);
Chris@42 791 T2a = VSUB(T21, T29);
Chris@42 792 T2i = VADD(T29, T21);
Chris@42 793 }
Chris@42 794 {
Chris@42 795 V T1Y, T2e, T2n, T2o;
Chris@42 796 T1Y = VADD(TZ, T1X);
Chris@42 797 T2e = VBYI(VADD(T2a, T2d));
Chris@42 798 ST(&(x[WS(rs, 27)]), VSUB(T1Y, T2e), ms, &(x[WS(rs, 1)]));
Chris@42 799 ST(&(x[WS(rs, 5)]), VADD(T1Y, T2e), ms, &(x[WS(rs, 1)]));
Chris@42 800 T2n = VBYI(VADD(T2i, T2h));
Chris@42 801 T2o = VADD(T2k, T2l);
Chris@42 802 ST(&(x[WS(rs, 3)]), VADD(T2n, T2o), ms, &(x[WS(rs, 1)]));
Chris@42 803 ST(&(x[WS(rs, 29)]), VSUB(T2o, T2n), ms, &(x[WS(rs, 1)]));
Chris@42 804 }
Chris@42 805 {
Chris@42 806 V T2f, T2g, T2j, T2m;
Chris@42 807 T2f = VSUB(TZ, T1X);
Chris@42 808 T2g = VBYI(VSUB(T2d, T2a));
Chris@42 809 ST(&(x[WS(rs, 21)]), VSUB(T2f, T2g), ms, &(x[WS(rs, 1)]));
Chris@42 810 ST(&(x[WS(rs, 11)]), VADD(T2f, T2g), ms, &(x[WS(rs, 1)]));
Chris@42 811 T2j = VBYI(VSUB(T2h, T2i));
Chris@42 812 T2m = VSUB(T2k, T2l);
Chris@42 813 ST(&(x[WS(rs, 13)]), VADD(T2j, T2m), ms, &(x[WS(rs, 1)]));
Chris@42 814 ST(&(x[WS(rs, 19)]), VSUB(T2m, T2j), ms, &(x[WS(rs, 1)]));
Chris@42 815 }
Chris@42 816 }
Chris@42 817 {
Chris@42 818 V T2r, T2M, T2F, T2N, T2y, T2J, T2C, T2K;
Chris@42 819 {
Chris@42 820 V T2p, T2q, T2D, T2E;
Chris@42 821 T2p = VADD(Tb, Tr);
Chris@42 822 T2q = VADD(T1Z, T20);
Chris@42 823 T2r = VSUB(T2p, T2q);
Chris@42 824 T2M = VADD(T2p, T2q);
Chris@42 825 T2D = VFNMS(LDK(KP195090322), T2s, VMUL(LDK(KP980785280), T2t));
Chris@42 826 T2E = VFMA(LDK(KP195090322), T2v, VMUL(LDK(KP980785280), T2w));
Chris@42 827 T2F = VSUB(T2D, T2E);
Chris@42 828 T2N = VADD(T2D, T2E);
Chris@42 829 }
Chris@42 830 {
Chris@42 831 V T2u, T2x, T2A, T2B;
Chris@42 832 T2u = VFMA(LDK(KP980785280), T2s, VMUL(LDK(KP195090322), T2t));
Chris@42 833 T2x = VFNMS(LDK(KP195090322), T2w, VMUL(LDK(KP980785280), T2v));
Chris@42 834 T2y = VSUB(T2u, T2x);
Chris@42 835 T2J = VADD(T2u, T2x);
Chris@42 836 T2A = VADD(TG, TX);
Chris@42 837 T2B = VADD(T28, T22);
Chris@42 838 T2C = VSUB(T2A, T2B);
Chris@42 839 T2K = VADD(T2B, T2A);
Chris@42 840 }
Chris@42 841 {
Chris@42 842 V T2z, T2G, T2P, T2Q;
Chris@42 843 T2z = VADD(T2r, T2y);
Chris@42 844 T2G = VBYI(VADD(T2C, T2F));
Chris@42 845 ST(&(x[WS(rs, 25)]), VSUB(T2z, T2G), ms, &(x[WS(rs, 1)]));
Chris@42 846 ST(&(x[WS(rs, 7)]), VADD(T2z, T2G), ms, &(x[WS(rs, 1)]));
Chris@42 847 T2P = VBYI(VADD(T2K, T2J));
Chris@42 848 T2Q = VADD(T2M, T2N);
Chris@42 849 ST(&(x[WS(rs, 1)]), VADD(T2P, T2Q), ms, &(x[WS(rs, 1)]));
Chris@42 850 ST(&(x[WS(rs, 31)]), VSUB(T2Q, T2P), ms, &(x[WS(rs, 1)]));
Chris@42 851 }
Chris@42 852 {
Chris@42 853 V T2H, T2I, T2L, T2O;
Chris@42 854 T2H = VSUB(T2r, T2y);
Chris@42 855 T2I = VBYI(VSUB(T2F, T2C));
Chris@42 856 ST(&(x[WS(rs, 23)]), VSUB(T2H, T2I), ms, &(x[WS(rs, 1)]));
Chris@42 857 ST(&(x[WS(rs, 9)]), VADD(T2H, T2I), ms, &(x[WS(rs, 1)]));
Chris@42 858 T2L = VBYI(VSUB(T2J, T2K));
Chris@42 859 T2O = VSUB(T2M, T2N);
Chris@42 860 ST(&(x[WS(rs, 15)]), VADD(T2L, T2O), ms, &(x[WS(rs, 1)]));
Chris@42 861 ST(&(x[WS(rs, 17)]), VSUB(T2O, T2L), ms, &(x[WS(rs, 1)]));
Chris@42 862 }
Chris@42 863 }
Chris@42 864 }
Chris@42 865 }
Chris@42 866 }
Chris@42 867 VLEAVE();
Chris@42 868 }
Chris@42 869
Chris@42 870 static const tw_instr twinstr[] = {
Chris@42 871 VTW(0, 1),
Chris@42 872 VTW(0, 3),
Chris@42 873 VTW(0, 9),
Chris@42 874 VTW(0, 27),
Chris@42 875 {TW_NEXT, VL, 0}
Chris@42 876 };
Chris@42 877
Chris@42 878 static const ct_desc desc = { 32, XSIMD_STRING("t3bv_32"), twinstr, &GENUS, {228, 142, 16, 0}, 0, 0, 0 };
Chris@42 879
Chris@42 880 void XSIMD(codelet_t3bv_32) (planner *p) {
Chris@42 881 X(kdft_dit_register) (p, t3bv_32, &desc);
Chris@42 882 }
Chris@42 883 #endif /* HAVE_FMA */