annotate src/fftw-3.3.8/dft/simd/common/t3bv_32.c @ 83:ae30d91d2ffe

Replace these with versions built using an older toolset (so as to avoid ABI incompatibilities when linking on Ubuntu 14.04 for packaging purposes)
author Chris Cannam
date Fri, 07 Feb 2020 11:51:13 +0000
parents d0c2a83c1364
children
rev   line source
Chris@82 1 /*
Chris@82 2 * Copyright (c) 2003, 2007-14 Matteo Frigo
Chris@82 3 * Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
Chris@82 4 *
Chris@82 5 * This program is free software; you can redistribute it and/or modify
Chris@82 6 * it under the terms of the GNU General Public License as published by
Chris@82 7 * the Free Software Foundation; either version 2 of the License, or
Chris@82 8 * (at your option) any later version.
Chris@82 9 *
Chris@82 10 * This program is distributed in the hope that it will be useful,
Chris@82 11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
Chris@82 12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
Chris@82 13 * GNU General Public License for more details.
Chris@82 14 *
Chris@82 15 * You should have received a copy of the GNU General Public License
Chris@82 16 * along with this program; if not, write to the Free Software
Chris@82 17 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
Chris@82 18 *
Chris@82 19 */
Chris@82 20
Chris@82 21 /* This file was automatically generated --- DO NOT EDIT */
Chris@82 22 /* Generated on Thu May 24 08:06:06 EDT 2018 */
Chris@82 23
Chris@82 24 #include "dft/codelet-dft.h"
Chris@82 25
Chris@82 26 #if defined(ARCH_PREFERS_FMA) || defined(ISA_EXTENSION_PREFERS_FMA)
Chris@82 27
Chris@82 28 /* Generated by: ../../../genfft/gen_twiddle_c.native -fma -simd -compact -variables 4 -pipeline-latency 8 -twiddle-log3 -precompute-twiddles -no-generate-bytw -n 32 -name t3bv_32 -include dft/simd/t3b.h -sign 1 */
Chris@82 29
Chris@82 30 /*
Chris@82 31 * This function contains 244 FP additions, 214 FP multiplications,
Chris@82 32 * (or, 146 additions, 116 multiplications, 98 fused multiply/add),
Chris@82 33 * 90 stack variables, 7 constants, and 64 memory accesses
Chris@82 34 */
Chris@82 35 #include "dft/simd/t3b.h"
Chris@82 36
/*
 * t3bv_32 (FMA variant, compiled when ARCH_PREFERS_FMA or
 * ISA_EXTENSION_PREFERS_FMA is defined): generated size-32 twiddle
 * codelet (see the genfft command line above: -n 32, -sign 1,
 * -twiddle-log3).
 *
 * Works in place on the array reached through `ii`: each loop iteration
 * loads the 32 elements x[WS(rs, 0)] .. x[WS(rs, 31)], multiplies every
 * element except x[0] by a twiddle factor, combines them, and stores 32
 * results back to the same 32 locations.
 *
 *   ri      not referenced directly in this body (x is set from ii only)
 *   ii      base pointer of the data processed in place
 *   W       twiddle table; offset by mb * ((TWVL / VL) * 8) before the
 *           first iteration and by TWVL * 8 after each iteration
 *   rs      stride argument passed to WS() to address the 32 elements
 *   mb, me  loop runs for m = mb, mb + VL, ... while m < me
 *   ms      x advances by VL * ms per iteration; also passed to LD/ST
 */
Chris@82 37 static void t3bv_32(R *ri, R *ii, const R *W, stride rs, INT mb, INT me, INT ms)
Chris@82 38 {
/*
 * Constants read through LDK() below.  Numerically:
 *   KP831469612 = cos(3*pi/16)   KP668178637 = tan(3*pi/16)
 *   KP980785280 = cos(pi/16)     KP198912367 = tan(pi/16)
 *   KP923879532 = cos(pi/8)      KP414213562 = tan(pi/8) = sqrt(2) - 1
 *   KP707106781 = sqrt(1/2)
 */
Chris@82 39 DVK(KP831469612, +0.831469612302545237078788377617905756738560812);
Chris@82 40 DVK(KP668178637, +0.668178637919298919997757686523080761552472251);
Chris@82 41 DVK(KP980785280, +0.980785280403230449126182236134239036973933731);
Chris@82 42 DVK(KP198912367, +0.198912367379658006911597622644676228597850501);
Chris@82 43 DVK(KP923879532, +0.923879532511286756128183189396788286822416626);
Chris@82 44 DVK(KP707106781, +0.707106781186547524400844362104849039284835938);
Chris@82 45 DVK(KP414213562, +0.414213562373095048801688724209698078569671875);
Chris@82 46 {
Chris@82 47 INT m;
Chris@82 48 R *x;
Chris@82 49 x = ii;
/* One iteration handles VL values of m; x and W are stepped in the loop header. */
Chris@82 50 for (m = mb, W = W + (mb * ((TWVL / VL) * 8)); m < me; m = m + VL, x = x + (VL * ms), W = W + (TWVL * 8), MAKE_VOLATILE_STRIDE(32, rs)) {
Chris@82 51 V T2, T5, T3, T4, Tc, T1C, TT, Tz, Tn, T6, TP, Tf, TK, T7, T8;
Chris@82 52 V Tv, T1w, T21, Tg, Tk, T1D, T1O, TC, T18, T12, T1t, TH, TL, TQ, T1m;
Chris@82 53 V T1c;
/*
 * Twiddle factors: only four vectors are loaded from W (offsets 0,
 * TWVL*2, TWVL*4, TWVL*6); every other factor used below is derived
 * from them with VZMUL / VZMULJ.
 * NOTE(review): VZMULJ presumably multiplies by the conjugate of its
 * first argument -- confirm against the SIMD header.
 */
Chris@82 54 T2 = LDW(&(W[0]));
Chris@82 55 T5 = LDW(&(W[TWVL * 4]));
Chris@82 56 T3 = LDW(&(W[TWVL * 2]));
Chris@82 57 T4 = VZMULJ(T2, T3);
Chris@82 58 Tc = VZMUL(T2, T3);
Chris@82 59 T1C = VZMULJ(T2, T5);
Chris@82 60 TT = VZMULJ(T3, T5);
Chris@82 61 Tz = VZMUL(T2, T5);
Chris@82 62 Tn = VZMUL(T3, T5);
Chris@82 63 T6 = VZMUL(T4, T5);
Chris@82 64 TP = VZMULJ(Tc, T5);
Chris@82 65 Tf = VZMULJ(T4, T5);
Chris@82 66 TK = VZMUL(Tc, T5);
Chris@82 67 T7 = LDW(&(W[TWVL * 6]));
Chris@82 68 T8 = VZMULJ(T6, T7);
Chris@82 69 Tv = VZMULJ(T5, T7);
Chris@82 70 T1w = VZMULJ(Tn, T7);
Chris@82 71 T21 = VZMULJ(T3, T7);
Chris@82 72 Tg = VZMULJ(Tf, T7);
Chris@82 73 Tk = VZMUL(T2, T7);
Chris@82 74 T1D = VZMULJ(T1C, T7);
Chris@82 75 T1O = VZMULJ(Tc, T7);
Chris@82 76 TC = VZMULJ(T2, T7);
Chris@82 77 T18 = VZMULJ(TT, T7);
Chris@82 78 T12 = VZMULJ(Tz, T7);
Chris@82 79 T1t = VZMUL(Tc, T7);
Chris@82 80 TH = VZMUL(T3, T7);
Chris@82 81 TL = VZMULJ(TK, T7);
Chris@82 82 TQ = VZMULJ(TP, T7);
Chris@82 83 T1m = VZMULJ(T4, T7);
Chris@82 84 T1c = VZMUL(T4, T7);
Chris@82 85 {
/* Intermediate values produced by the input stage and consumed by the output stages. */
Chris@82 86 V Tb, T24, T2T, T3x, Tr, T25, T2W, T3K, TX, T28, T3j, T3z, TG, T27, T3g;
Chris@82 87 V T3y, T1N, T2v, T3a, T3G, T1V, T2w, T37, T3F, T1j, T2s, T33, T3D, T1r, T2t;
Chris@82 88 V T30, T3C;
/* Input stage: inputs 0, 8, 16, 24 (x[0] is used without a twiddle multiply). */
Chris@82 89 {
Chris@82 90 V T1, T23, Ta, T20, T22, T9, T1Z, T2R, T2S;
Chris@82 91 T1 = LD(&(x[0]), ms, &(x[0]));
Chris@82 92 T22 = LD(&(x[WS(rs, 24)]), ms, &(x[0]));
Chris@82 93 T23 = VZMUL(T21, T22);
Chris@82 94 T9 = LD(&(x[WS(rs, 16)]), ms, &(x[0]));
Chris@82 95 Ta = VZMUL(T8, T9);
Chris@82 96 T1Z = LD(&(x[WS(rs, 8)]), ms, &(x[0]));
Chris@82 97 T20 = VZMUL(T1C, T1Z);
Chris@82 98 Tb = VSUB(T1, Ta);
Chris@82 99 T24 = VSUB(T20, T23);
Chris@82 100 T2R = VADD(T1, Ta);
Chris@82 101 T2S = VADD(T20, T23);
Chris@82 102 T2T = VADD(T2R, T2S);
Chris@82 103 T3x = VSUB(T2R, T2S);
Chris@82 104 }
/* Input stage: inputs 4, 12, 20, 28. */
Chris@82 105 {
Chris@82 106 V Te, Tp, Ti, Tm;
Chris@82 107 {
Chris@82 108 V Td, To, Th, Tl;
Chris@82 109 Td = LD(&(x[WS(rs, 4)]), ms, &(x[0]));
Chris@82 110 Te = VZMUL(Tc, Td);
Chris@82 111 To = LD(&(x[WS(rs, 12)]), ms, &(x[0]));
Chris@82 112 Tp = VZMUL(Tn, To);
Chris@82 113 Th = LD(&(x[WS(rs, 20)]), ms, &(x[0]));
Chris@82 114 Ti = VZMUL(Tg, Th);
Chris@82 115 Tl = LD(&(x[WS(rs, 28)]), ms, &(x[0]));
Chris@82 116 Tm = VZMUL(Tk, Tl);
Chris@82 117 }
Chris@82 118 {
Chris@82 119 V Tj, Tq, T2U, T2V;
Chris@82 120 Tj = VSUB(Te, Ti);
Chris@82 121 Tq = VSUB(Tm, Tp);
Chris@82 122 Tr = VADD(Tj, Tq);
Chris@82 123 T25 = VSUB(Tj, Tq);
Chris@82 124 T2U = VADD(Te, Ti);
Chris@82 125 T2V = VADD(Tm, Tp);
Chris@82 126 T2W = VADD(T2U, T2V);
Chris@82 127 T3K = VSUB(T2U, T2V);
Chris@82 128 }
Chris@82 129 }
/* Input stage: inputs 6, 14, 22, 30. */
Chris@82 130 {
Chris@82 131 V TJ, TV, TN, TS;
Chris@82 132 {
Chris@82 133 V TI, TU, TM, TR;
Chris@82 134 TI = LD(&(x[WS(rs, 30)]), ms, &(x[0]));
Chris@82 135 TJ = VZMUL(TH, TI);
Chris@82 136 TU = LD(&(x[WS(rs, 6)]), ms, &(x[0]));
Chris@82 137 TV = VZMUL(TT, TU);
Chris@82 138 TM = LD(&(x[WS(rs, 14)]), ms, &(x[0]));
Chris@82 139 TN = VZMUL(TL, TM);
Chris@82 140 TR = LD(&(x[WS(rs, 22)]), ms, &(x[0]));
Chris@82 141 TS = VZMUL(TQ, TR);
Chris@82 142 }
Chris@82 143 {
Chris@82 144 V TO, TW, T3h, T3i;
Chris@82 145 TO = VSUB(TJ, TN);
Chris@82 146 TW = VSUB(TS, TV);
Chris@82 147 TX = VFNMS(LDK(KP414213562), TW, TO);
Chris@82 148 T28 = VFMA(LDK(KP414213562), TO, TW);
Chris@82 149 T3h = VADD(TJ, TN);
Chris@82 150 T3i = VADD(TV, TS);
Chris@82 151 T3j = VADD(T3h, T3i);
Chris@82 152 T3z = VSUB(T3h, T3i);
Chris@82 153 }
Chris@82 154 }
/* Input stage: inputs 2, 10, 18, 26. */
Chris@82 155 {
Chris@82 156 V Tu, TE, Tx, TB;
Chris@82 157 {
Chris@82 158 V Tt, TD, Tw, TA;
Chris@82 159 Tt = LD(&(x[WS(rs, 2)]), ms, &(x[0]));
Chris@82 160 Tu = VZMUL(T4, Tt);
Chris@82 161 TD = LD(&(x[WS(rs, 26)]), ms, &(x[0]));
Chris@82 162 TE = VZMUL(TC, TD);
Chris@82 163 Tw = LD(&(x[WS(rs, 18)]), ms, &(x[0]));
Chris@82 164 Tx = VZMUL(Tv, Tw);
Chris@82 165 TA = LD(&(x[WS(rs, 10)]), ms, &(x[0]));
Chris@82 166 TB = VZMUL(Tz, TA);
Chris@82 167 }
Chris@82 168 {
Chris@82 169 V Ty, TF, T3e, T3f;
Chris@82 170 Ty = VSUB(Tu, Tx);
Chris@82 171 TF = VSUB(TB, TE);
Chris@82 172 TG = VFNMS(LDK(KP414213562), TF, Ty);
Chris@82 173 T27 = VFMA(LDK(KP414213562), Ty, TF);
Chris@82 174 T3e = VADD(Tu, Tx);
Chris@82 175 T3f = VADD(TB, TE);
Chris@82 176 T3g = VADD(T3e, T3f);
Chris@82 177 T3y = VSUB(T3e, T3f);
Chris@82 178 }
Chris@82 179 }
/* Input stage: odd inputs 3, 7, 11, 15, 19, 23, 27, 31. */
Chris@82 180 {
Chris@82 181 V T1v, T1y, T1S, T1Q, T1I, T1K, T1L, T1B, T1F, T1G;
Chris@82 182 {
Chris@82 183 V T1u, T1x, T1R, T1P;
Chris@82 184 T1u = LD(&(x[WS(rs, 31)]), ms, &(x[WS(rs, 1)]));
Chris@82 185 T1v = VZMUL(T1t, T1u);
Chris@82 186 T1x = LD(&(x[WS(rs, 15)]), ms, &(x[WS(rs, 1)]));
Chris@82 187 T1y = VZMUL(T1w, T1x);
Chris@82 188 T1R = LD(&(x[WS(rs, 7)]), ms, &(x[WS(rs, 1)]));
Chris@82 189 T1S = VZMUL(Tf, T1R);
Chris@82 190 T1P = LD(&(x[WS(rs, 23)]), ms, &(x[WS(rs, 1)]));
Chris@82 191 T1Q = VZMUL(T1O, T1P);
Chris@82 192 {
Chris@82 193 V T1H, T1J, T1A, T1E;
Chris@82 194 T1H = LD(&(x[WS(rs, 27)]), ms, &(x[WS(rs, 1)]));
Chris@82 195 T1I = VZMUL(T7, T1H);
Chris@82 196 T1J = LD(&(x[WS(rs, 11)]), ms, &(x[WS(rs, 1)]));
Chris@82 197 T1K = VZMUL(T6, T1J);
Chris@82 198 T1L = VSUB(T1I, T1K);
Chris@82 199 T1A = LD(&(x[WS(rs, 3)]), ms, &(x[WS(rs, 1)]));
Chris@82 200 T1B = VZMUL(T3, T1A);
Chris@82 201 T1E = LD(&(x[WS(rs, 19)]), ms, &(x[WS(rs, 1)]));
Chris@82 202 T1F = VZMUL(T1D, T1E);
Chris@82 203 T1G = VSUB(T1B, T1F);
Chris@82 204 }
Chris@82 205 }
Chris@82 206 {
Chris@82 207 V T1z, T1M, T38, T39;
Chris@82 208 T1z = VSUB(T1v, T1y);
Chris@82 209 T1M = VADD(T1G, T1L);
Chris@82 210 T1N = VFMA(LDK(KP707106781), T1M, T1z);
Chris@82 211 T2v = VFNMS(LDK(KP707106781), T1M, T1z);
Chris@82 212 T38 = VADD(T1B, T1F);
Chris@82 213 T39 = VADD(T1I, T1K);
Chris@82 214 T3a = VADD(T38, T39);
Chris@82 215 T3G = VSUB(T39, T38);
Chris@82 216 }
Chris@82 217 {
Chris@82 218 V T1T, T1U, T35, T36;
Chris@82 219 T1T = VSUB(T1Q, T1S);
Chris@82 220 T1U = VSUB(T1L, T1G);
Chris@82 221 T1V = VFMA(LDK(KP707106781), T1U, T1T);
Chris@82 222 T2w = VFNMS(LDK(KP707106781), T1U, T1T);
Chris@82 223 T35 = VADD(T1v, T1y);
Chris@82 224 T36 = VADD(T1S, T1Q);
Chris@82 225 T37 = VADD(T35, T36);
Chris@82 226 T3F = VSUB(T35, T36);
Chris@82 227 }
Chris@82 228 }
/* Input stage: odd inputs 1, 5, 9, 13, 17, 21, 25, 29. */
Chris@82 229 {
Chris@82 230 V T11, T14, T1o, T1l, T1e, T1g, T1h, T17, T1a, T1b;
Chris@82 231 {
Chris@82 232 V T10, T13, T1n, T1k;
Chris@82 233 T10 = LD(&(x[WS(rs, 1)]), ms, &(x[WS(rs, 1)]));
Chris@82 234 T11 = VZMUL(T2, T10);
Chris@82 235 T13 = LD(&(x[WS(rs, 17)]), ms, &(x[WS(rs, 1)]));
Chris@82 236 T14 = VZMUL(T12, T13);
Chris@82 237 T1n = LD(&(x[WS(rs, 25)]), ms, &(x[WS(rs, 1)]));
Chris@82 238 T1o = VZMUL(T1m, T1n);
Chris@82 239 T1k = LD(&(x[WS(rs, 9)]), ms, &(x[WS(rs, 1)]));
Chris@82 240 T1l = VZMUL(T5, T1k);
Chris@82 241 {
Chris@82 242 V T1d, T1f, T16, T19;
Chris@82 243 T1d = LD(&(x[WS(rs, 29)]), ms, &(x[WS(rs, 1)]));
Chris@82 244 T1e = VZMUL(T1c, T1d);
Chris@82 245 T1f = LD(&(x[WS(rs, 13)]), ms, &(x[WS(rs, 1)]));
Chris@82 246 T1g = VZMUL(TK, T1f);
Chris@82 247 T1h = VSUB(T1e, T1g);
Chris@82 248 T16 = LD(&(x[WS(rs, 5)]), ms, &(x[WS(rs, 1)]));
Chris@82 249 T17 = VZMUL(TP, T16);
Chris@82 250 T19 = LD(&(x[WS(rs, 21)]), ms, &(x[WS(rs, 1)]));
Chris@82 251 T1a = VZMUL(T18, T19);
Chris@82 252 T1b = VSUB(T17, T1a);
Chris@82 253 }
Chris@82 254 }
Chris@82 255 {
Chris@82 256 V T15, T1i, T31, T32;
Chris@82 257 T15 = VSUB(T11, T14);
Chris@82 258 T1i = VADD(T1b, T1h);
Chris@82 259 T1j = VFMA(LDK(KP707106781), T1i, T15);
Chris@82 260 T2s = VFNMS(LDK(KP707106781), T1i, T15);
Chris@82 261 T31 = VADD(T17, T1a);
Chris@82 262 T32 = VADD(T1e, T1g);
Chris@82 263 T33 = VADD(T31, T32);
Chris@82 264 T3D = VSUB(T31, T32);
Chris@82 265 }
Chris@82 266 {
Chris@82 267 V T1p, T1q, T2Y, T2Z;
Chris@82 268 T1p = VSUB(T1l, T1o);
Chris@82 269 T1q = VSUB(T1b, T1h);
Chris@82 270 T1r = VFMA(LDK(KP707106781), T1q, T1p);
Chris@82 271 T2t = VFNMS(LDK(KP707106781), T1q, T1p);
Chris@82 272 T2Y = VADD(T11, T14);
Chris@82 273 T2Z = VADD(T1l, T1o);
Chris@82 274 T30 = VADD(T2Y, T2Z);
Chris@82 275 T3C = VSUB(T2Y, T2Z);
Chris@82 276 }
Chris@82 277 }
/* Output stage: stores outputs 0, 8, 16, 24. */
Chris@82 278 {
Chris@82 279 V T3r, T3v, T3u, T3w;
Chris@82 280 {
Chris@82 281 V T3p, T3q, T3s, T3t;
Chris@82 282 T3p = VADD(T2T, T2W);
Chris@82 283 T3q = VADD(T3g, T3j);
Chris@82 284 T3r = VSUB(T3p, T3q);
Chris@82 285 T3v = VADD(T3p, T3q);
Chris@82 286 T3s = VADD(T30, T33);
Chris@82 287 T3t = VADD(T37, T3a);
Chris@82 288 T3u = VSUB(T3s, T3t);
Chris@82 289 T3w = VADD(T3s, T3t);
Chris@82 290 }
Chris@82 291 ST(&(x[WS(rs, 24)]), VFNMSI(T3u, T3r), ms, &(x[0]));
Chris@82 292 ST(&(x[0]), VADD(T3v, T3w), ms, &(x[0]));
Chris@82 293 ST(&(x[WS(rs, 8)]), VFMAI(T3u, T3r), ms, &(x[0]));
Chris@82 294 ST(&(x[WS(rs, 16)]), VSUB(T3v, T3w), ms, &(x[0]));
Chris@82 295 }
/* Output stage: stores outputs 4, 12, 20, 28. */
Chris@82 296 {
Chris@82 297 V T2X, T3k, T3c, T3l, T34, T3b;
Chris@82 298 T2X = VSUB(T2T, T2W);
Chris@82 299 T3k = VSUB(T3g, T3j);
Chris@82 300 T34 = VSUB(T30, T33);
Chris@82 301 T3b = VSUB(T37, T3a);
Chris@82 302 T3c = VADD(T34, T3b);
Chris@82 303 T3l = VSUB(T34, T3b);
Chris@82 304 {
Chris@82 305 V T3d, T3m, T3n, T3o;
Chris@82 306 T3d = VFNMS(LDK(KP707106781), T3c, T2X);
Chris@82 307 T3m = VFNMS(LDK(KP707106781), T3l, T3k);
Chris@82 308 ST(&(x[WS(rs, 12)]), VFNMSI(T3m, T3d), ms, &(x[0]));
Chris@82 309 ST(&(x[WS(rs, 20)]), VFMAI(T3m, T3d), ms, &(x[0]));
Chris@82 310 T3n = VFMA(LDK(KP707106781), T3c, T2X);
Chris@82 311 T3o = VFMA(LDK(KP707106781), T3l, T3k);
Chris@82 312 ST(&(x[WS(rs, 4)]), VFMAI(T3o, T3n), ms, &(x[0]));
Chris@82 313 ST(&(x[WS(rs, 28)]), VFNMSI(T3o, T3n), ms, &(x[0]));
Chris@82 314 }
Chris@82 315 }
/* Output stage: stores outputs 2, 6, 10, 14, 18, 22, 26, 30. */
Chris@82 316 {
Chris@82 317 V T3B, T3T, T3M, T3W, T3I, T3X, T3P, T3U, T3A, T3L;
Chris@82 318 T3A = VADD(T3y, T3z);
Chris@82 319 T3B = VFMA(LDK(KP707106781), T3A, T3x);
Chris@82 320 T3T = VFNMS(LDK(KP707106781), T3A, T3x);
Chris@82 321 T3L = VSUB(T3y, T3z);
Chris@82 322 T3M = VFMA(LDK(KP707106781), T3L, T3K);
Chris@82 323 T3W = VFNMS(LDK(KP707106781), T3L, T3K);
Chris@82 324 {
Chris@82 325 V T3E, T3H, T3N, T3O;
Chris@82 326 T3E = VFNMS(LDK(KP414213562), T3D, T3C);
Chris@82 327 T3H = VFNMS(LDK(KP414213562), T3G, T3F);
Chris@82 328 T3I = VADD(T3E, T3H);
Chris@82 329 T3X = VSUB(T3E, T3H);
Chris@82 330 T3N = VFMA(LDK(KP414213562), T3C, T3D);
Chris@82 331 T3O = VFMA(LDK(KP414213562), T3F, T3G);
Chris@82 332 T3P = VSUB(T3N, T3O);
Chris@82 333 T3U = VADD(T3N, T3O);
Chris@82 334 }
Chris@82 335 {
Chris@82 336 V T3J, T3Q, T3Z, T40;
Chris@82 337 T3J = VFNMS(LDK(KP923879532), T3I, T3B);
Chris@82 338 T3Q = VFNMS(LDK(KP923879532), T3P, T3M);
Chris@82 339 ST(&(x[WS(rs, 14)]), VFNMSI(T3Q, T3J), ms, &(x[0]));
Chris@82 340 ST(&(x[WS(rs, 18)]), VFMAI(T3Q, T3J), ms, &(x[0]));
Chris@82 341 T3Z = VFMA(LDK(KP923879532), T3U, T3T);
Chris@82 342 T40 = VFNMS(LDK(KP923879532), T3X, T3W);
Chris@82 343 ST(&(x[WS(rs, 6)]), VFNMSI(T40, T3Z), ms, &(x[0]));
Chris@82 344 ST(&(x[WS(rs, 26)]), VFMAI(T40, T3Z), ms, &(x[0]));
Chris@82 345 }
Chris@82 346 {
Chris@82 347 V T3R, T3S, T3V, T3Y;
Chris@82 348 T3R = VFMA(LDK(KP923879532), T3I, T3B);
Chris@82 349 T3S = VFMA(LDK(KP923879532), T3P, T3M);
Chris@82 350 ST(&(x[WS(rs, 30)]), VFNMSI(T3S, T3R), ms, &(x[0]));
Chris@82 351 ST(&(x[WS(rs, 2)]), VFMAI(T3S, T3R), ms, &(x[0]));
Chris@82 352 T3V = VFNMS(LDK(KP923879532), T3U, T3T);
Chris@82 353 T3Y = VFMA(LDK(KP923879532), T3X, T3W);
Chris@82 354 ST(&(x[WS(rs, 10)]), VFMAI(T3Y, T3V), ms, &(x[0]));
Chris@82 355 ST(&(x[WS(rs, 22)]), VFNMSI(T3Y, T3V), ms, &(x[0]));
Chris@82 356 }
Chris@82 357 }
/* Output stage: stores odd outputs 1, 7, 9, 15, 17, 23, 25, 31. */
Chris@82 358 {
Chris@82 359 V TZ, T2h, T2d, T2i, T1X, T2l, T2a, T2k;
Chris@82 360 {
Chris@82 361 V Ts, TY, T2b, T2c;
Chris@82 362 Ts = VFMA(LDK(KP707106781), Tr, Tb);
Chris@82 363 TY = VADD(TG, TX);
Chris@82 364 TZ = VFMA(LDK(KP923879532), TY, Ts);
Chris@82 365 T2h = VFNMS(LDK(KP923879532), TY, Ts);
Chris@82 366 T2b = VFMA(LDK(KP198912367), T1j, T1r);
Chris@82 367 T2c = VFMA(LDK(KP198912367), T1N, T1V);
Chris@82 368 T2d = VSUB(T2b, T2c);
Chris@82 369 T2i = VADD(T2b, T2c);
Chris@82 370 }
Chris@82 371 {
Chris@82 372 V T1s, T1W, T26, T29;
Chris@82 373 T1s = VFNMS(LDK(KP198912367), T1r, T1j);
Chris@82 374 T1W = VFNMS(LDK(KP198912367), T1V, T1N);
Chris@82 375 T1X = VADD(T1s, T1W);
Chris@82 376 T2l = VSUB(T1s, T1W);
Chris@82 377 T26 = VFMA(LDK(KP707106781), T25, T24);
Chris@82 378 T29 = VSUB(T27, T28);
Chris@82 379 T2a = VFMA(LDK(KP923879532), T29, T26);
Chris@82 380 T2k = VFNMS(LDK(KP923879532), T29, T26);
Chris@82 381 }
Chris@82 382 {
Chris@82 383 V T1Y, T2e, T2n, T2o;
Chris@82 384 T1Y = VFNMS(LDK(KP980785280), T1X, TZ);
Chris@82 385 T2e = VFNMS(LDK(KP980785280), T2d, T2a);
Chris@82 386 ST(&(x[WS(rs, 15)]), VFNMSI(T2e, T1Y), ms, &(x[WS(rs, 1)]));
Chris@82 387 ST(&(x[WS(rs, 17)]), VFMAI(T2e, T1Y), ms, &(x[WS(rs, 1)]));
Chris@82 388 T2n = VFMA(LDK(KP980785280), T2i, T2h);
Chris@82 389 T2o = VFNMS(LDK(KP980785280), T2l, T2k);
Chris@82 390 ST(&(x[WS(rs, 7)]), VFNMSI(T2o, T2n), ms, &(x[WS(rs, 1)]));
Chris@82 391 ST(&(x[WS(rs, 25)]), VFMAI(T2o, T2n), ms, &(x[WS(rs, 1)]));
Chris@82 392 }
Chris@82 393 {
Chris@82 394 V T2f, T2g, T2j, T2m;
Chris@82 395 T2f = VFMA(LDK(KP980785280), T1X, TZ);
Chris@82 396 T2g = VFMA(LDK(KP980785280), T2d, T2a);
Chris@82 397 ST(&(x[WS(rs, 31)]), VFNMSI(T2g, T2f), ms, &(x[WS(rs, 1)]));
Chris@82 398 ST(&(x[WS(rs, 1)]), VFMAI(T2g, T2f), ms, &(x[WS(rs, 1)]));
Chris@82 399 T2j = VFNMS(LDK(KP980785280), T2i, T2h);
Chris@82 400 T2m = VFMA(LDK(KP980785280), T2l, T2k);
Chris@82 401 ST(&(x[WS(rs, 9)]), VFMAI(T2m, T2j), ms, &(x[WS(rs, 1)]));
Chris@82 402 ST(&(x[WS(rs, 23)]), VFNMSI(T2m, T2j), ms, &(x[WS(rs, 1)]));
Chris@82 403 }
Chris@82 404 }
/* Output stage: stores odd outputs 3, 5, 11, 13, 19, 21, 27, 29. */
Chris@82 405 {
Chris@82 406 V T2r, T2J, T2F, T2K, T2y, T2N, T2C, T2M;
Chris@82 407 {
Chris@82 408 V T2p, T2q, T2D, T2E;
Chris@82 409 T2p = VFNMS(LDK(KP707106781), Tr, Tb);
Chris@82 410 T2q = VADD(T27, T28);
Chris@82 411 T2r = VFMA(LDK(KP923879532), T2q, T2p);
Chris@82 412 T2J = VFNMS(LDK(KP923879532), T2q, T2p);
Chris@82 413 T2D = VFNMS(LDK(KP668178637), T2s, T2t);
Chris@82 414 T2E = VFNMS(LDK(KP668178637), T2v, T2w);
Chris@82 415 T2F = VSUB(T2D, T2E);
Chris@82 416 T2K = VADD(T2D, T2E);
Chris@82 417 }
Chris@82 418 {
Chris@82 419 V T2u, T2x, T2A, T2B;
Chris@82 420 T2u = VFMA(LDK(KP668178637), T2t, T2s);
Chris@82 421 T2x = VFMA(LDK(KP668178637), T2w, T2v);
Chris@82 422 T2y = VADD(T2u, T2x);
Chris@82 423 T2N = VSUB(T2u, T2x);
Chris@82 424 T2A = VFNMS(LDK(KP707106781), T25, T24);
Chris@82 425 T2B = VSUB(TG, TX);
Chris@82 426 T2C = VFNMS(LDK(KP923879532), T2B, T2A);
Chris@82 427 T2M = VFMA(LDK(KP923879532), T2B, T2A);
Chris@82 428 }
Chris@82 429 {
Chris@82 430 V T2z, T2G, T2P, T2Q;
Chris@82 431 T2z = VFNMS(LDK(KP831469612), T2y, T2r);
Chris@82 432 T2G = VFNMS(LDK(KP831469612), T2F, T2C);
Chris@82 433 ST(&(x[WS(rs, 19)]), VFNMSI(T2G, T2z), ms, &(x[WS(rs, 1)]));
Chris@82 434 ST(&(x[WS(rs, 13)]), VFMAI(T2G, T2z), ms, &(x[WS(rs, 1)]));
Chris@82 435 T2P = VFNMS(LDK(KP831469612), T2K, T2J);
Chris@82 436 T2Q = VFMA(LDK(KP831469612), T2N, T2M);
Chris@82 437 ST(&(x[WS(rs, 5)]), VFMAI(T2Q, T2P), ms, &(x[WS(rs, 1)]));
Chris@82 438 ST(&(x[WS(rs, 27)]), VFNMSI(T2Q, T2P), ms, &(x[WS(rs, 1)]));
Chris@82 439 }
Chris@82 440 {
Chris@82 441 V T2H, T2I, T2L, T2O;
Chris@82 442 T2H = VFMA(LDK(KP831469612), T2y, T2r);
Chris@82 443 T2I = VFMA(LDK(KP831469612), T2F, T2C);
Chris@82 444 ST(&(x[WS(rs, 3)]), VFNMSI(T2I, T2H), ms, &(x[WS(rs, 1)]));
Chris@82 445 ST(&(x[WS(rs, 29)]), VFMAI(T2I, T2H), ms, &(x[WS(rs, 1)]));
Chris@82 446 T2L = VFMA(LDK(KP831469612), T2K, T2J);
Chris@82 447 T2O = VFNMS(LDK(KP831469612), T2N, T2M);
Chris@82 448 ST(&(x[WS(rs, 11)]), VFNMSI(T2O, T2L), ms, &(x[WS(rs, 1)]));
Chris@82 449 ST(&(x[WS(rs, 21)]), VFMAI(T2O, T2L), ms, &(x[WS(rs, 1)]));
Chris@82 450 }
Chris@82 451 }
Chris@82 452 }
Chris@82 453 }
Chris@82 454 }
/* NOTE(review): VLEAVE() is defined by the SIMD headers; presumably per-ISA cleanup on exit -- confirm. */
Chris@82 455 VLEAVE();
Chris@82 456 }
Chris@82 457
/*
 * Twiddle instruction table referenced by `desc` below: four VTW entries
 * with second arguments 1, 3, 9 and 27, followed by a TW_NEXT entry
 * carrying VL.
 * NOTE(review): the four entries presumably correspond to the four
 * vectors t3bv_32 loads from W (offsets 0, TWVL*2, TWVL*4, TWVL*6) --
 * confirm against the VTW macro definition.
 */
Chris@82 458 static const tw_instr twinstr[] = {
Chris@82 459 VTW(0, 1),
Chris@82 460 VTW(0, 3),
Chris@82 461 VTW(0, 9),
Chris@82 462 VTW(0, 27),
Chris@82 463 {TW_NEXT, VL, 0}
Chris@82 464 };
Chris@82 465
/*
 * Descriptor handed to the registration call below: 32, the name string
 * "t3bv_32", the twiddle table, &GENUS, and the operation counts
 * {146, 116, 98, 0}, which match the "146 additions, 116 multiplications,
 * 98 fused multiply/add" figures in the header comment of this variant.
 * The three trailing fields are zero.
 */
Chris@82 466 static const ct_desc desc = { 32, XSIMD_STRING("t3bv_32"), twinstr, &GENUS, {146, 116, 98, 0}, 0, 0, 0 };
Chris@82 467
/*
 * Registration entry point (the only non-static symbol in this variant):
 * passes the t3bv_32 codelet and its descriptor to the planner `p` via
 * X(kdft_dit_register).
 */
Chris@82 468 void XSIMD(codelet_t3bv_32) (planner *p) {
Chris@82 469 X(kdft_dit_register) (p, t3bv_32, &desc);
Chris@82 470 }
Chris@82 471 #else
Chris@82 472
Chris@82 473 /* Generated by: ../../../genfft/gen_twiddle_c.native -simd -compact -variables 4 -pipeline-latency 8 -twiddle-log3 -precompute-twiddles -no-generate-bytw -n 32 -name t3bv_32 -include dft/simd/t3b.h -sign 1 */
Chris@82 474
Chris@82 475 /*
Chris@82 476 * This function contains 244 FP additions, 158 FP multiplications,
Chris@82 477 * (or, 228 additions, 142 multiplications, 16 fused multiply/add),
Chris@82 478 * 90 stack variables, 7 constants, and 64 memory accesses
Chris@82 479 */
Chris@82 480 #include "dft/simd/t3b.h"
Chris@82 481
Chris@82 482 static void t3bv_32(R *ri, R *ii, const R *W, stride rs, INT mb, INT me, INT ms)
Chris@82 483 {
Chris@82 484 DVK(KP195090322, +0.195090322016128267848284868477022240927691618);
Chris@82 485 DVK(KP980785280, +0.980785280403230449126182236134239036973933731);
Chris@82 486 DVK(KP555570233, +0.555570233019602224742830813948532874374937191);
Chris@82 487 DVK(KP831469612, +0.831469612302545237078788377617905756738560812);
Chris@82 488 DVK(KP382683432, +0.382683432365089771728459984030398866761344562);
Chris@82 489 DVK(KP923879532, +0.923879532511286756128183189396788286822416626);
Chris@82 490 DVK(KP707106781, +0.707106781186547524400844362104849039284835938);
Chris@82 491 {
Chris@82 492 INT m;
Chris@82 493 R *x;
Chris@82 494 x = ii;
Chris@82 495 for (m = mb, W = W + (mb * ((TWVL / VL) * 8)); m < me; m = m + VL, x = x + (VL * ms), W = W + (TWVL * 8), MAKE_VOLATILE_STRIDE(32, rs)) {
Chris@82 496 V T2, T5, T3, T4, Tc, T1v, TH, Tz, Tn, T6, TS, Tf, TK, T7, T8;
Chris@82 497 V Tv, T1I, T25, Tg, Tk, T1N, T1Q, TC, T16, T12, T1w, TL, TP, TT, T1m;
Chris@82 498 V T1f;
Chris@82 499 T2 = LDW(&(W[0]));
Chris@82 500 T5 = LDW(&(W[TWVL * 4]));
Chris@82 501 T3 = LDW(&(W[TWVL * 2]));
Chris@82 502 T4 = VZMULJ(T2, T3);
Chris@82 503 Tc = VZMUL(T2, T3);
Chris@82 504 T1v = VZMULJ(T2, T5);
Chris@82 505 TH = VZMULJ(T3, T5);
Chris@82 506 Tz = VZMUL(T2, T5);
Chris@82 507 Tn = VZMUL(T3, T5);
Chris@82 508 T6 = VZMUL(T4, T5);
Chris@82 509 TS = VZMUL(Tc, T5);
Chris@82 510 Tf = VZMULJ(T4, T5);
Chris@82 511 TK = VZMULJ(Tc, T5);
Chris@82 512 T7 = LDW(&(W[TWVL * 6]));
Chris@82 513 T8 = VZMULJ(T6, T7);
Chris@82 514 Tv = VZMULJ(T5, T7);
Chris@82 515 T1I = VZMULJ(Tc, T7);
Chris@82 516 T25 = VZMULJ(T3, T7);
Chris@82 517 Tg = VZMULJ(Tf, T7);
Chris@82 518 Tk = VZMUL(T2, T7);
Chris@82 519 T1N = VZMUL(Tc, T7);
Chris@82 520 T1Q = VZMULJ(Tn, T7);
Chris@82 521 TC = VZMULJ(T2, T7);
Chris@82 522 T16 = VZMUL(T4, T7);
Chris@82 523 T12 = VZMULJ(TH, T7);
Chris@82 524 T1w = VZMULJ(T1v, T7);
Chris@82 525 TL = VZMULJ(TK, T7);
Chris@82 526 TP = VZMUL(T3, T7);
Chris@82 527 TT = VZMULJ(TS, T7);
Chris@82 528 T1m = VZMULJ(Tz, T7);
Chris@82 529 T1f = VZMULJ(T4, T7);
Chris@82 530 {
Chris@82 531 V Tb, T28, T3k, T3M, Tr, T22, T3f, T3N, TX, T20, T3b, T3J, TG, T1Z, T38;
Chris@82 532 V T3I, T1M, T2v, T33, T3F, T1V, T2w, T30, T3E, T1j, T2s, T2W, T3C, T1r, T2t;
Chris@82 533 V T2T, T3B;
Chris@82 534 {
Chris@82 535 V T1, T27, Ta, T24, T26, T9, T23, T3i, T3j;
Chris@82 536 T1 = LD(&(x[0]), ms, &(x[0]));
Chris@82 537 T26 = LD(&(x[WS(rs, 24)]), ms, &(x[0]));
Chris@82 538 T27 = VZMUL(T25, T26);
Chris@82 539 T9 = LD(&(x[WS(rs, 16)]), ms, &(x[0]));
Chris@82 540 Ta = VZMUL(T8, T9);
Chris@82 541 T23 = LD(&(x[WS(rs, 8)]), ms, &(x[0]));
Chris@82 542 T24 = VZMUL(T1v, T23);
Chris@82 543 Tb = VSUB(T1, Ta);
Chris@82 544 T28 = VSUB(T24, T27);
Chris@82 545 T3i = VADD(T1, Ta);
Chris@82 546 T3j = VADD(T24, T27);
Chris@82 547 T3k = VSUB(T3i, T3j);
Chris@82 548 T3M = VADD(T3i, T3j);
Chris@82 549 }
Chris@82 550 {
Chris@82 551 V Te, Tp, Ti, Tm;
Chris@82 552 {
Chris@82 553 V Td, To, Th, Tl;
Chris@82 554 Td = LD(&(x[WS(rs, 4)]), ms, &(x[0]));
Chris@82 555 Te = VZMUL(Tc, Td);
Chris@82 556 To = LD(&(x[WS(rs, 12)]), ms, &(x[0]));
Chris@82 557 Tp = VZMUL(Tn, To);
Chris@82 558 Th = LD(&(x[WS(rs, 20)]), ms, &(x[0]));
Chris@82 559 Ti = VZMUL(Tg, Th);
Chris@82 560 Tl = LD(&(x[WS(rs, 28)]), ms, &(x[0]));
Chris@82 561 Tm = VZMUL(Tk, Tl);
Chris@82 562 }
Chris@82 563 {
Chris@82 564 V Tj, Tq, T3d, T3e;
Chris@82 565 Tj = VSUB(Te, Ti);
Chris@82 566 Tq = VSUB(Tm, Tp);
Chris@82 567 Tr = VMUL(LDK(KP707106781), VADD(Tj, Tq));
Chris@82 568 T22 = VMUL(LDK(KP707106781), VSUB(Tj, Tq));
Chris@82 569 T3d = VADD(Te, Ti);
Chris@82 570 T3e = VADD(Tm, Tp);
Chris@82 571 T3f = VSUB(T3d, T3e);
Chris@82 572 T3N = VADD(T3d, T3e);
Chris@82 573 }
Chris@82 574 }
Chris@82 575 {
Chris@82 576 V TJ, TV, TN, TR;
Chris@82 577 {
Chris@82 578 V TI, TU, TM, TQ;
Chris@82 579 TI = LD(&(x[WS(rs, 6)]), ms, &(x[0]));
Chris@82 580 TJ = VZMUL(TH, TI);
Chris@82 581 TU = LD(&(x[WS(rs, 14)]), ms, &(x[0]));
Chris@82 582 TV = VZMUL(TT, TU);
Chris@82 583 TM = LD(&(x[WS(rs, 22)]), ms, &(x[0]));
Chris@82 584 TN = VZMUL(TL, TM);
Chris@82 585 TQ = LD(&(x[WS(rs, 30)]), ms, &(x[0]));
Chris@82 586 TR = VZMUL(TP, TQ);
Chris@82 587 }
Chris@82 588 {
Chris@82 589 V TO, TW, T39, T3a;
Chris@82 590 TO = VSUB(TJ, TN);
Chris@82 591 TW = VSUB(TR, TV);
Chris@82 592 TX = VFNMS(LDK(KP382683432), TW, VMUL(LDK(KP923879532), TO));
Chris@82 593 T20 = VFMA(LDK(KP923879532), TW, VMUL(LDK(KP382683432), TO));
Chris@82 594 T39 = VADD(TR, TV);
Chris@82 595 T3a = VADD(TJ, TN);
Chris@82 596 T3b = VSUB(T39, T3a);
Chris@82 597 T3J = VADD(T39, T3a);
Chris@82 598 }
Chris@82 599 }
Chris@82 600 {
Chris@82 601 V Tu, TE, Tx, TB;
Chris@82 602 {
Chris@82 603 V Tt, TD, Tw, TA;
Chris@82 604 Tt = LD(&(x[WS(rs, 2)]), ms, &(x[0]));
Chris@82 605 Tu = VZMUL(T4, Tt);
Chris@82 606 TD = LD(&(x[WS(rs, 26)]), ms, &(x[0]));
Chris@82 607 TE = VZMUL(TC, TD);
Chris@82 608 Tw = LD(&(x[WS(rs, 18)]), ms, &(x[0]));
Chris@82 609 Tx = VZMUL(Tv, Tw);
Chris@82 610 TA = LD(&(x[WS(rs, 10)]), ms, &(x[0]));
Chris@82 611 TB = VZMUL(Tz, TA);
Chris@82 612 }
Chris@82 613 {
Chris@82 614 V Ty, TF, T36, T37;
Chris@82 615 Ty = VSUB(Tu, Tx);
Chris@82 616 TF = VSUB(TB, TE);
Chris@82 617 TG = VFMA(LDK(KP382683432), Ty, VMUL(LDK(KP923879532), TF));
Chris@82 618 T1Z = VFNMS(LDK(KP382683432), TF, VMUL(LDK(KP923879532), Ty));
Chris@82 619 T36 = VADD(Tu, Tx);
Chris@82 620 T37 = VADD(TB, TE);
Chris@82 621 T38 = VSUB(T36, T37);
Chris@82 622 T3I = VADD(T36, T37);
Chris@82 623 }
Chris@82 624 }
Chris@82 625 {
Chris@82 626 V T1H, T1K, T1S, T1P, T1B, T1D, T1E, T1u, T1y, T1z;
Chris@82 627 {
Chris@82 628 V T1G, T1J, T1R, T1O;
Chris@82 629 T1G = LD(&(x[WS(rs, 7)]), ms, &(x[WS(rs, 1)]));
Chris@82 630 T1H = VZMUL(Tf, T1G);
Chris@82 631 T1J = LD(&(x[WS(rs, 23)]), ms, &(x[WS(rs, 1)]));
Chris@82 632 T1K = VZMUL(T1I, T1J);
Chris@82 633 T1R = LD(&(x[WS(rs, 15)]), ms, &(x[WS(rs, 1)]));
Chris@82 634 T1S = VZMUL(T1Q, T1R);
Chris@82 635 T1O = LD(&(x[WS(rs, 31)]), ms, &(x[WS(rs, 1)]));
Chris@82 636 T1P = VZMUL(T1N, T1O);
Chris@82 637 {
Chris@82 638 V T1A, T1C, T1t, T1x;
Chris@82 639 T1A = LD(&(x[WS(rs, 27)]), ms, &(x[WS(rs, 1)]));
Chris@82 640 T1B = VZMUL(T7, T1A);
Chris@82 641 T1C = LD(&(x[WS(rs, 11)]), ms, &(x[WS(rs, 1)]));
Chris@82 642 T1D = VZMUL(T6, T1C);
Chris@82 643 T1E = VSUB(T1B, T1D);
Chris@82 644 T1t = LD(&(x[WS(rs, 3)]), ms, &(x[WS(rs, 1)]));
Chris@82 645 T1u = VZMUL(T3, T1t);
Chris@82 646 T1x = LD(&(x[WS(rs, 19)]), ms, &(x[WS(rs, 1)]));
Chris@82 647 T1y = VZMUL(T1w, T1x);
Chris@82 648 T1z = VSUB(T1u, T1y);
Chris@82 649 }
Chris@82 650 }
Chris@82 651 {
Chris@82 652 V T1F, T1L, T31, T32;
Chris@82 653 T1F = VMUL(LDK(KP707106781), VSUB(T1z, T1E));
Chris@82 654 T1L = VSUB(T1H, T1K);
Chris@82 655 T1M = VSUB(T1F, T1L);
Chris@82 656 T2v = VADD(T1L, T1F);
Chris@82 657 T31 = VADD(T1u, T1y);
Chris@82 658 T32 = VADD(T1B, T1D);
Chris@82 659 T33 = VSUB(T31, T32);
Chris@82 660 T3F = VADD(T31, T32);
Chris@82 661 }
Chris@82 662 {
Chris@82 663 V T1T, T1U, T2Y, T2Z;
Chris@82 664 T1T = VSUB(T1P, T1S);
Chris@82 665 T1U = VMUL(LDK(KP707106781), VADD(T1z, T1E));
Chris@82 666 T1V = VSUB(T1T, T1U);
Chris@82 667 T2w = VADD(T1T, T1U);
Chris@82 668 T2Y = VADD(T1P, T1S);
Chris@82 669 T2Z = VADD(T1H, T1K);
Chris@82 670 T30 = VSUB(T2Y, T2Z);
Chris@82 671 T3E = VADD(T2Y, T2Z);
Chris@82 672 }
Chris@82 673 }
Chris@82 674 {
Chris@82 675 V T1e, T1h, T1o, T1l, T18, T1a, T1b, T11, T14, T15;
Chris@82 676 {
Chris@82 677 V T1d, T1g, T1n, T1k;
Chris@82 678 T1d = LD(&(x[WS(rs, 9)]), ms, &(x[WS(rs, 1)]));
Chris@82 679 T1e = VZMUL(T5, T1d);
Chris@82 680 T1g = LD(&(x[WS(rs, 25)]), ms, &(x[WS(rs, 1)]));
Chris@82 681 T1h = VZMUL(T1f, T1g);
Chris@82 682 T1n = LD(&(x[WS(rs, 17)]), ms, &(x[WS(rs, 1)]));
Chris@82 683 T1o = VZMUL(T1m, T1n);
Chris@82 684 T1k = LD(&(x[WS(rs, 1)]), ms, &(x[WS(rs, 1)]));
Chris@82 685 T1l = VZMUL(T2, T1k);
Chris@82 686 {
Chris@82 687 V T17, T19, T10, T13;
Chris@82 688 T17 = LD(&(x[WS(rs, 29)]), ms, &(x[WS(rs, 1)]));
Chris@82 689 T18 = VZMUL(T16, T17);
Chris@82 690 T19 = LD(&(x[WS(rs, 13)]), ms, &(x[WS(rs, 1)]));
Chris@82 691 T1a = VZMUL(TS, T19);
Chris@82 692 T1b = VSUB(T18, T1a);
Chris@82 693 T10 = LD(&(x[WS(rs, 5)]), ms, &(x[WS(rs, 1)]));
Chris@82 694 T11 = VZMUL(TK, T10);
Chris@82 695 T13 = LD(&(x[WS(rs, 21)]), ms, &(x[WS(rs, 1)]));
Chris@82 696 T14 = VZMUL(T12, T13);
Chris@82 697 T15 = VSUB(T11, T14);
Chris@82 698 }
Chris@82 699 }
Chris@82 700 {
Chris@82 701 V T1c, T1i, T2U, T2V;
Chris@82 702 T1c = VMUL(LDK(KP707106781), VSUB(T15, T1b));
Chris@82 703 T1i = VSUB(T1e, T1h);
Chris@82 704 T1j = VSUB(T1c, T1i);
Chris@82 705 T2s = VADD(T1i, T1c);
Chris@82 706 T2U = VADD(T11, T14);
Chris@82 707 T2V = VADD(T18, T1a);
Chris@82 708 T2W = VSUB(T2U, T2V);
Chris@82 709 T3C = VADD(T2U, T2V);
Chris@82 710 }
Chris@82 711 {
Chris@82 712 V T1p, T1q, T2R, T2S;
Chris@82 713 T1p = VSUB(T1l, T1o);
Chris@82 714 T1q = VMUL(LDK(KP707106781), VADD(T15, T1b));
Chris@82 715 T1r = VSUB(T1p, T1q);
Chris@82 716 T2t = VADD(T1p, T1q);
Chris@82 717 T2R = VADD(T1l, T1o);
Chris@82 718 T2S = VADD(T1e, T1h);
Chris@82 719 T2T = VSUB(T2R, T2S);
Chris@82 720 T3B = VADD(T2R, T2S);
Chris@82 721 }
Chris@82 722 }
Chris@82 723 {
Chris@82 724 V T3V, T3Z, T3Y, T40;
Chris@82 725 {
Chris@82 726 V T3T, T3U, T3W, T3X;
Chris@82 727 T3T = VADD(T3M, T3N);
Chris@82 728 T3U = VADD(T3I, T3J);
Chris@82 729 T3V = VSUB(T3T, T3U);
Chris@82 730 T3Z = VADD(T3T, T3U);
Chris@82 731 T3W = VADD(T3B, T3C);
Chris@82 732 T3X = VADD(T3E, T3F);
Chris@82 733 T3Y = VBYI(VSUB(T3W, T3X));
Chris@82 734 T40 = VADD(T3W, T3X);
Chris@82 735 }
Chris@82 736 ST(&(x[WS(rs, 24)]), VSUB(T3V, T3Y), ms, &(x[0]));
Chris@82 737 ST(&(x[0]), VADD(T3Z, T40), ms, &(x[0]));
Chris@82 738 ST(&(x[WS(rs, 8)]), VADD(T3V, T3Y), ms, &(x[0]));
Chris@82 739 ST(&(x[WS(rs, 16)]), VSUB(T3Z, T40), ms, &(x[0]));
Chris@82 740 }
Chris@82 741 {
Chris@82 742 V T3K, T3O, T3H, T3P, T3D, T3G;
Chris@82 743 T3K = VSUB(T3I, T3J);
Chris@82 744 T3O = VSUB(T3M, T3N);
Chris@82 745 T3D = VSUB(T3B, T3C);
Chris@82 746 T3G = VSUB(T3E, T3F);
Chris@82 747 T3H = VMUL(LDK(KP707106781), VSUB(T3D, T3G));
Chris@82 748 T3P = VMUL(LDK(KP707106781), VADD(T3D, T3G));
Chris@82 749 {
Chris@82 750 V T3L, T3Q, T3R, T3S;
Chris@82 751 T3L = VBYI(VSUB(T3H, T3K));
Chris@82 752 T3Q = VSUB(T3O, T3P);
Chris@82 753 ST(&(x[WS(rs, 12)]), VADD(T3L, T3Q), ms, &(x[0]));
Chris@82 754 ST(&(x[WS(rs, 20)]), VSUB(T3Q, T3L), ms, &(x[0]));
Chris@82 755 T3R = VBYI(VADD(T3K, T3H));
Chris@82 756 T3S = VADD(T3O, T3P);
Chris@82 757 ST(&(x[WS(rs, 4)]), VADD(T3R, T3S), ms, &(x[0]));
Chris@82 758 ST(&(x[WS(rs, 28)]), VSUB(T3S, T3R), ms, &(x[0]));
Chris@82 759 }
Chris@82 760 }
Chris@82 761 {
Chris@82 762 V T3g, T3w, T3m, T3t, T35, T3u, T3p, T3x, T3c, T3l;
Chris@82 763 T3c = VMUL(LDK(KP707106781), VSUB(T38, T3b));
Chris@82 764 T3g = VSUB(T3c, T3f);
Chris@82 765 T3w = VADD(T3f, T3c);
Chris@82 766 T3l = VMUL(LDK(KP707106781), VADD(T38, T3b));
Chris@82 767 T3m = VSUB(T3k, T3l);
Chris@82 768 T3t = VADD(T3k, T3l);
Chris@82 769 {
Chris@82 770 V T2X, T34, T3n, T3o;
Chris@82 771 T2X = VFNMS(LDK(KP382683432), T2W, VMUL(LDK(KP923879532), T2T));
Chris@82 772 T34 = VFMA(LDK(KP923879532), T30, VMUL(LDK(KP382683432), T33));
Chris@82 773 T35 = VSUB(T2X, T34);
Chris@82 774 T3u = VADD(T2X, T34);
Chris@82 775 T3n = VFMA(LDK(KP382683432), T2T, VMUL(LDK(KP923879532), T2W));
Chris@82 776 T3o = VFNMS(LDK(KP382683432), T30, VMUL(LDK(KP923879532), T33));
Chris@82 777 T3p = VSUB(T3n, T3o);
Chris@82 778 T3x = VADD(T3n, T3o);
Chris@82 779 }
Chris@82 780 {
Chris@82 781 V T3h, T3q, T3z, T3A;
Chris@82 782 T3h = VBYI(VSUB(T35, T3g));
Chris@82 783 T3q = VSUB(T3m, T3p);
Chris@82 784 ST(&(x[WS(rs, 10)]), VADD(T3h, T3q), ms, &(x[0]));
Chris@82 785 ST(&(x[WS(rs, 22)]), VSUB(T3q, T3h), ms, &(x[0]));
Chris@82 786 T3z = VSUB(T3t, T3u);
Chris@82 787 T3A = VBYI(VSUB(T3x, T3w));
Chris@82 788 ST(&(x[WS(rs, 18)]), VSUB(T3z, T3A), ms, &(x[0]));
Chris@82 789 ST(&(x[WS(rs, 14)]), VADD(T3z, T3A), ms, &(x[0]));
Chris@82 790 }
Chris@82 791 {
Chris@82 792 V T3r, T3s, T3v, T3y;
Chris@82 793 T3r = VBYI(VADD(T3g, T35));
Chris@82 794 T3s = VADD(T3m, T3p);
Chris@82 795 ST(&(x[WS(rs, 6)]), VADD(T3r, T3s), ms, &(x[0]));
Chris@82 796 ST(&(x[WS(rs, 26)]), VSUB(T3s, T3r), ms, &(x[0]));
Chris@82 797 T3v = VADD(T3t, T3u);
Chris@82 798 T3y = VBYI(VADD(T3w, T3x));
Chris@82 799 ST(&(x[WS(rs, 30)]), VSUB(T3v, T3y), ms, &(x[0]));
Chris@82 800 ST(&(x[WS(rs, 2)]), VADD(T3v, T3y), ms, &(x[0]));
Chris@82 801 }
Chris@82 802 }
Chris@82 803 {
Chris@82 804 V TZ, T2k, T2d, T2l, T1X, T2h, T2a, T2i;
Chris@82 805 {
Chris@82 806 V Ts, TY, T2b, T2c;
Chris@82 807 Ts = VSUB(Tb, Tr);
Chris@82 808 TY = VSUB(TG, TX);
Chris@82 809 TZ = VSUB(Ts, TY);
Chris@82 810 T2k = VADD(Ts, TY);
Chris@82 811 T2b = VFNMS(LDK(KP555570233), T1j, VMUL(LDK(KP831469612), T1r));
Chris@82 812 T2c = VFMA(LDK(KP555570233), T1M, VMUL(LDK(KP831469612), T1V));
Chris@82 813 T2d = VSUB(T2b, T2c);
Chris@82 814 T2l = VADD(T2b, T2c);
Chris@82 815 }
Chris@82 816 {
Chris@82 817 V T1s, T1W, T21, T29;
Chris@82 818 T1s = VFMA(LDK(KP831469612), T1j, VMUL(LDK(KP555570233), T1r));
Chris@82 819 T1W = VFNMS(LDK(KP555570233), T1V, VMUL(LDK(KP831469612), T1M));
Chris@82 820 T1X = VSUB(T1s, T1W);
Chris@82 821 T2h = VADD(T1s, T1W);
Chris@82 822 T21 = VSUB(T1Z, T20);
Chris@82 823 T29 = VSUB(T22, T28);
Chris@82 824 T2a = VSUB(T21, T29);
Chris@82 825 T2i = VADD(T29, T21);
Chris@82 826 }
Chris@82 827 {
Chris@82 828 V T1Y, T2e, T2n, T2o;
Chris@82 829 T1Y = VADD(TZ, T1X);
Chris@82 830 T2e = VBYI(VADD(T2a, T2d));
Chris@82 831 ST(&(x[WS(rs, 27)]), VSUB(T1Y, T2e), ms, &(x[WS(rs, 1)]));
Chris@82 832 ST(&(x[WS(rs, 5)]), VADD(T1Y, T2e), ms, &(x[WS(rs, 1)]));
Chris@82 833 T2n = VBYI(VADD(T2i, T2h));
Chris@82 834 T2o = VADD(T2k, T2l);
Chris@82 835 ST(&(x[WS(rs, 3)]), VADD(T2n, T2o), ms, &(x[WS(rs, 1)]));
Chris@82 836 ST(&(x[WS(rs, 29)]), VSUB(T2o, T2n), ms, &(x[WS(rs, 1)]));
Chris@82 837 }
Chris@82 838 {
Chris@82 839 V T2f, T2g, T2j, T2m;
Chris@82 840 T2f = VSUB(TZ, T1X);
Chris@82 841 T2g = VBYI(VSUB(T2d, T2a));
Chris@82 842 ST(&(x[WS(rs, 21)]), VSUB(T2f, T2g), ms, &(x[WS(rs, 1)]));
Chris@82 843 ST(&(x[WS(rs, 11)]), VADD(T2f, T2g), ms, &(x[WS(rs, 1)]));
Chris@82 844 T2j = VBYI(VSUB(T2h, T2i));
Chris@82 845 T2m = VSUB(T2k, T2l);
Chris@82 846 ST(&(x[WS(rs, 13)]), VADD(T2j, T2m), ms, &(x[WS(rs, 1)]));
Chris@82 847 ST(&(x[WS(rs, 19)]), VSUB(T2m, T2j), ms, &(x[WS(rs, 1)]));
Chris@82 848 }
Chris@82 849 }
Chris@82 850 {
Chris@82 851 V T2r, T2M, T2F, T2N, T2y, T2J, T2C, T2K;
Chris@82 852 {
Chris@82 853 V T2p, T2q, T2D, T2E;
Chris@82 854 T2p = VADD(Tb, Tr);
Chris@82 855 T2q = VADD(T1Z, T20);
Chris@82 856 T2r = VSUB(T2p, T2q);
Chris@82 857 T2M = VADD(T2p, T2q);
Chris@82 858 T2D = VFNMS(LDK(KP195090322), T2s, VMUL(LDK(KP980785280), T2t));
Chris@82 859 T2E = VFMA(LDK(KP195090322), T2v, VMUL(LDK(KP980785280), T2w));
Chris@82 860 T2F = VSUB(T2D, T2E);
Chris@82 861 T2N = VADD(T2D, T2E);
Chris@82 862 }
Chris@82 863 {
Chris@82 864 V T2u, T2x, T2A, T2B;
Chris@82 865 T2u = VFMA(LDK(KP980785280), T2s, VMUL(LDK(KP195090322), T2t));
Chris@82 866 T2x = VFNMS(LDK(KP195090322), T2w, VMUL(LDK(KP980785280), T2v));
Chris@82 867 T2y = VSUB(T2u, T2x);
Chris@82 868 T2J = VADD(T2u, T2x);
Chris@82 869 T2A = VADD(TG, TX);
Chris@82 870 T2B = VADD(T28, T22);
Chris@82 871 T2C = VSUB(T2A, T2B);
Chris@82 872 T2K = VADD(T2B, T2A);
Chris@82 873 }
Chris@82 874 {
Chris@82 875 V T2z, T2G, T2P, T2Q;
Chris@82 876 T2z = VADD(T2r, T2y);
Chris@82 877 T2G = VBYI(VADD(T2C, T2F));
Chris@82 878 ST(&(x[WS(rs, 25)]), VSUB(T2z, T2G), ms, &(x[WS(rs, 1)]));
Chris@82 879 ST(&(x[WS(rs, 7)]), VADD(T2z, T2G), ms, &(x[WS(rs, 1)]));
Chris@82 880 T2P = VBYI(VADD(T2K, T2J));
Chris@82 881 T2Q = VADD(T2M, T2N);
Chris@82 882 ST(&(x[WS(rs, 1)]), VADD(T2P, T2Q), ms, &(x[WS(rs, 1)]));
Chris@82 883 ST(&(x[WS(rs, 31)]), VSUB(T2Q, T2P), ms, &(x[WS(rs, 1)]));
Chris@82 884 }
Chris@82 885 {
Chris@82 886 V T2H, T2I, T2L, T2O;
Chris@82 887 T2H = VSUB(T2r, T2y);
Chris@82 888 T2I = VBYI(VSUB(T2F, T2C));
Chris@82 889 ST(&(x[WS(rs, 23)]), VSUB(T2H, T2I), ms, &(x[WS(rs, 1)]));
Chris@82 890 ST(&(x[WS(rs, 9)]), VADD(T2H, T2I), ms, &(x[WS(rs, 1)]));
Chris@82 891 T2L = VBYI(VSUB(T2J, T2K));
Chris@82 892 T2O = VSUB(T2M, T2N);
Chris@82 893 ST(&(x[WS(rs, 15)]), VADD(T2L, T2O), ms, &(x[WS(rs, 1)]));
Chris@82 894 ST(&(x[WS(rs, 17)]), VSUB(T2O, T2L), ms, &(x[WS(rs, 1)]));
Chris@82 895 }
Chris@82 896 }
Chris@82 897 }
Chris@82 898 }
Chris@82 899 }
Chris@82 900 VLEAVE();
Chris@82 901 }
Chris@82 902
/*
 * Twiddle-factor instruction table referenced by the codelet descriptor
 * `desc`.  It lists four VTW entries (second arguments 1, 3, 9 and 27)
 * followed by a TW_NEXT record that closes the table.
 * NOTE(review): the 1, 3, 9, 27 entries are presumably the only twiddle
 * powers loaded directly, the remaining ones being derived inside the
 * codelet (generator flag -twiddle-log3) -- confirm against genfft.
 */
Chris@82 903 static const tw_instr twinstr[] = {
Chris@82 904 VTW(0, 1),
Chris@82 905 VTW(0, 3),
Chris@82 906 VTW(0, 9),
Chris@82 907 VTW(0, 27),
Chris@82 908 {TW_NEXT, VL, 0} /* final record; VL is presumably the SIMD vector length -- TODO confirm */
Chris@82 909 };
Chris@82 910
/*
 * Codelet descriptor passed to X(kdft_dit_register) by the registration
 * function.  The initializer supplies, in order: 32, the name string
 * "t3bv_32", the twiddle table `twinstr`, a pointer to GENUS, the
 * four-number group {228, 142, 16, 0}, and three trailing zeros.
 * NOTE(review): per ct_desc these are presumably radix, name, twiddle
 * instructions, genus, operation counts (adds, muls, fmas, other) and
 * three stride-related fields -- confirm against dft/codelet-dft.h.
 */
Chris@82 911 static const ct_desc desc = { 32, XSIMD_STRING("t3bv_32"), twinstr, &GENUS, {228, 142, 16, 0}, 0, 0, 0 };
Chris@82 912
/*
 * Registration entry point for this codelet.
 *
 * p: planner handed through unchanged to the registration routine.
 *
 * Makes a single call to X(kdft_dit_register) with the codelet function
 * t3bv_32 and the address of its descriptor `desc`; returns nothing.
 */
Chris@82 913 void XSIMD(codelet_t3bv_32) (planner *p) {
Chris@82 914 X(kdft_dit_register) (p, t3bv_32, &desc);
Chris@82 915 }
Chris@82 916 #endif