annotate src/fftw-3.3.8/dft/simd/common/t2bv_20.c @ 83:ae30d91d2ffe

Replace these with versions built using an older toolset (so as to avoid ABI incompatibilities when linking on Ubuntu 14.04 for packaging purposes)
author Chris Cannam
date Fri, 07 Feb 2020 11:51:13 +0000
parents d0c2a83c1364
children
rev   line source
Chris@82 1 /*
Chris@82 2 * Copyright (c) 2003, 2007-14 Matteo Frigo
Chris@82 3 * Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
Chris@82 4 *
Chris@82 5 * This program is free software; you can redistribute it and/or modify
Chris@82 6 * it under the terms of the GNU General Public License as published by
Chris@82 7 * the Free Software Foundation; either version 2 of the License, or
Chris@82 8 * (at your option) any later version.
Chris@82 9 *
Chris@82 10 * This program is distributed in the hope that it will be useful,
Chris@82 11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
Chris@82 12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
Chris@82 13 * GNU General Public License for more details.
Chris@82 14 *
Chris@82 15 * You should have received a copy of the GNU General Public License
Chris@82 16 * along with this program; if not, write to the Free Software
Chris@82 17 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
Chris@82 18 *
Chris@82 19 */
Chris@82 20
Chris@82 21 /* This file was automatically generated --- DO NOT EDIT */
Chris@82 22 /* Generated on Thu May 24 08:06:05 EDT 2018 */
Chris@82 23
Chris@82 24 #include "dft/codelet-dft.h"
Chris@82 25
Chris@82 26 #if defined(ARCH_PREFERS_FMA) || defined(ISA_EXTENSION_PREFERS_FMA)
Chris@82 27
Chris@82 28 /* Generated by: ../../../genfft/gen_twiddle_c.native -fma -simd -compact -variables 4 -pipeline-latency 8 -n 20 -name t2bv_20 -include dft/simd/t2b.h -sign 1 */
Chris@82 29
Chris@82 30 /*
Chris@82 31 * This function contains 123 FP additions, 88 FP multiplications,
Chris@82 32 * (or, 77 additions, 42 multiplications, 46 fused multiply/add),
Chris@82 33 * 54 stack variables, 4 constants, and 40 memory accesses
Chris@82 34 */
Chris@82 35 #include "dft/simd/t2b.h"
Chris@82 36
/*
 * t2bv_20, FMA variant: compiled when ARCH_PREFERS_FMA or
 * ISA_EXTENSION_PREFERS_FMA is defined (see the #if above). Machine-generated
 * by genfft with "-n 20 -sign 1 -fma -simd"; do not edit by hand.
 *
 * Each loop iteration loads the 20 elements x[WS(rs, 0)] .. x[WS(rs, 19)],
 * passes every element except x[0] through BYTW with the twiddle slot
 * W[TWVL * 2 * (k - 1)] for element k, combines them, and stores 20 results
 * back to the same 20 locations (the transform is in place).
 *
 * Parameters:
 *   ri      not referenced in the body.
 *   ii      base pointer; every load and store goes through x = ii.
 *   W       twiddle table; offset by mb * ((TWVL / VL) * 38) before the first
 *           iteration and advanced by TWVL * 38 after each iteration.
 *   rs      stride handed to WS(rs, k) to address element k.
 *   mb, me  the loop runs m = mb; m < me; m += VL.
 *   ms      x advances by VL * ms per iteration; also forwarded to LD/ST.
 *
 * NOTE(review): V, LD, ST, BYTW, VADD/VSUB/VMUL, VFMA/VFNMS, VFMAI/VFNMSI,
 * LDK, DVK, WS, MAKE_VOLATILE_STRIDE and VLEAVE are not defined in this file;
 * they presumably come from the included dft/simd/t2b.h and
 * dft/codelet-dft.h headers -- confirm their exact semantics there.
 */
Chris@82 37 static void t2bv_20(R *ri, R *ii, const R *W, stride rs, INT mb, INT me, INT ms)
Chris@82 38 {
/* Constants; numerically sqrt(5)/4, (sqrt(5)-1)/2, sin(2*pi/5) and 1/4. */
Chris@82 39 DVK(KP559016994, +0.559016994374947424102293417182819058860154590);
Chris@82 40 DVK(KP618033988, +0.618033988749894848204586834365638117720309180);
Chris@82 41 DVK(KP951056516, +0.951056516295153572116439333379382143405698634);
Chris@82 42 DVK(KP250000000, +0.250000000000000000000000000000000000000000000);
Chris@82 43 {
Chris@82 44 INT m;
Chris@82 45 R *x;
Chris@82 46 x = ii;
Chris@82 47 for (m = mb, W = W + (mb * ((TWVL / VL) * 38)); m < me; m = m + VL, x = x + (VL * ms), W = W + (TWVL * 38), MAKE_VOLATILE_STRIDE(20, rs)) {
Chris@82 48 V T4, TX, T1m, T1K, TF, T14, T15, TQ, Tf, Tq, Tr, T1O, T1P, T1Q, T1w;
Chris@82 49 V T1z, T1A, TY, TZ, T10, T1L, T1M, T1N, T1p, T1s, T1t, T1i, T1j;
/* Inputs 0, 5, 10 and 15; x[0] is the only input not passed through BYTW. */
Chris@82 50 {
Chris@82 51 V T1, TW, T3, TU, TV, T2, TT, T1k, T1l;
Chris@82 52 T1 = LD(&(x[0]), ms, &(x[0]));
Chris@82 53 TV = LD(&(x[WS(rs, 15)]), ms, &(x[WS(rs, 1)]));
Chris@82 54 TW = BYTW(&(W[TWVL * 28]), TV);
Chris@82 55 T2 = LD(&(x[WS(rs, 10)]), ms, &(x[0]));
Chris@82 56 T3 = BYTW(&(W[TWVL * 18]), T2);
Chris@82 57 TT = LD(&(x[WS(rs, 5)]), ms, &(x[WS(rs, 1)]));
Chris@82 58 TU = BYTW(&(W[TWVL * 8]), TT);
Chris@82 59 T4 = VSUB(T1, T3);
Chris@82 60 TX = VSUB(TU, TW);
Chris@82 61 T1k = VADD(T1, T3);
Chris@82 62 T1l = VADD(TU, TW);
Chris@82 63 T1m = VSUB(T1k, T1l);
Chris@82 64 T1K = VADD(T1k, T1l);
Chris@82 65 }
/*
 * Remaining 16 inputs, loaded as eight pairs whose indices differ by 10;
 * each pair produces one difference (VSUB) and one sum (VADD).
 */
Chris@82 66 {
Chris@82 67 V T9, T1n, TK, T1v, TP, T1y, Te, T1q, Tk, T1u, Tz, T1o, TE, T1r, Tp;
Chris@82 68 V T1x;
Chris@82 69 {
Chris@82 70 V T6, T8, T5, T7;
Chris@82 71 T5 = LD(&(x[WS(rs, 4)]), ms, &(x[0]));
Chris@82 72 T6 = BYTW(&(W[TWVL * 6]), T5);
Chris@82 73 T7 = LD(&(x[WS(rs, 14)]), ms, &(x[0]));
Chris@82 74 T8 = BYTW(&(W[TWVL * 26]), T7);
Chris@82 75 T9 = VSUB(T6, T8);
Chris@82 76 T1n = VADD(T6, T8);
Chris@82 77 }
Chris@82 78 {
Chris@82 79 V TH, TJ, TG, TI;
Chris@82 80 TG = LD(&(x[WS(rs, 13)]), ms, &(x[WS(rs, 1)]));
Chris@82 81 TH = BYTW(&(W[TWVL * 24]), TG);
Chris@82 82 TI = LD(&(x[WS(rs, 3)]), ms, &(x[WS(rs, 1)]));
Chris@82 83 TJ = BYTW(&(W[TWVL * 4]), TI);
Chris@82 84 TK = VSUB(TH, TJ);
Chris@82 85 T1v = VADD(TH, TJ);
Chris@82 86 }
Chris@82 87 {
Chris@82 88 V TM, TO, TL, TN;
Chris@82 89 TL = LD(&(x[WS(rs, 17)]), ms, &(x[WS(rs, 1)]));
Chris@82 90 TM = BYTW(&(W[TWVL * 32]), TL);
Chris@82 91 TN = LD(&(x[WS(rs, 7)]), ms, &(x[WS(rs, 1)]));
Chris@82 92 TO = BYTW(&(W[TWVL * 12]), TN);
Chris@82 93 TP = VSUB(TM, TO);
Chris@82 94 T1y = VADD(TM, TO);
Chris@82 95 }
Chris@82 96 {
Chris@82 97 V Tb, Td, Ta, Tc;
Chris@82 98 Ta = LD(&(x[WS(rs, 16)]), ms, &(x[0]));
Chris@82 99 Tb = BYTW(&(W[TWVL * 30]), Ta);
Chris@82 100 Tc = LD(&(x[WS(rs, 6)]), ms, &(x[0]));
Chris@82 101 Td = BYTW(&(W[TWVL * 10]), Tc);
Chris@82 102 Te = VSUB(Tb, Td);
Chris@82 103 T1q = VADD(Tb, Td);
Chris@82 104 }
Chris@82 105 {
Chris@82 106 V Th, Tj, Tg, Ti;
Chris@82 107 Tg = LD(&(x[WS(rs, 8)]), ms, &(x[0]));
Chris@82 108 Th = BYTW(&(W[TWVL * 14]), Tg);
Chris@82 109 Ti = LD(&(x[WS(rs, 18)]), ms, &(x[0]));
Chris@82 110 Tj = BYTW(&(W[TWVL * 34]), Ti);
Chris@82 111 Tk = VSUB(Th, Tj);
Chris@82 112 T1u = VADD(Th, Tj);
Chris@82 113 }
Chris@82 114 {
Chris@82 115 V Tw, Ty, Tv, Tx;
Chris@82 116 Tv = LD(&(x[WS(rs, 9)]), ms, &(x[WS(rs, 1)]));
Chris@82 117 Tw = BYTW(&(W[TWVL * 16]), Tv);
Chris@82 118 Tx = LD(&(x[WS(rs, 19)]), ms, &(x[WS(rs, 1)]));
Chris@82 119 Ty = BYTW(&(W[TWVL * 36]), Tx);
Chris@82 120 Tz = VSUB(Tw, Ty);
Chris@82 121 T1o = VADD(Tw, Ty);
Chris@82 122 }
Chris@82 123 {
Chris@82 124 V TB, TD, TA, TC;
Chris@82 125 TA = LD(&(x[WS(rs, 1)]), ms, &(x[WS(rs, 1)]));
Chris@82 126 TB = BYTW(&(W[0]), TA);
Chris@82 127 TC = LD(&(x[WS(rs, 11)]), ms, &(x[WS(rs, 1)]));
Chris@82 128 TD = BYTW(&(W[TWVL * 20]), TC);
Chris@82 129 TE = VSUB(TB, TD);
Chris@82 130 T1r = VADD(TB, TD);
Chris@82 131 }
Chris@82 132 {
Chris@82 133 V Tm, To, Tl, Tn;
Chris@82 134 Tl = LD(&(x[WS(rs, 12)]), ms, &(x[0]));
Chris@82 135 Tm = BYTW(&(W[TWVL * 22]), Tl);
Chris@82 136 Tn = LD(&(x[WS(rs, 2)]), ms, &(x[0]));
Chris@82 137 To = BYTW(&(W[TWVL * 2]), Tn);
Chris@82 138 Tp = VSUB(Tm, To);
Chris@82 139 T1x = VADD(Tm, To);
Chris@82 140 }
/* Second-level sums and differences of the pair results, kept for the output stages. */
Chris@82 141 TF = VSUB(Tz, TE);
Chris@82 142 T14 = VSUB(T9, Te);
Chris@82 143 T15 = VSUB(Tk, Tp);
Chris@82 144 TQ = VSUB(TK, TP);
Chris@82 145 Tf = VADD(T9, Te);
Chris@82 146 Tq = VADD(Tk, Tp);
Chris@82 147 Tr = VADD(Tf, Tq);
Chris@82 148 T1O = VADD(T1u, T1v);
Chris@82 149 T1P = VADD(T1x, T1y);
Chris@82 150 T1Q = VADD(T1O, T1P);
Chris@82 151 T1w = VSUB(T1u, T1v);
Chris@82 152 T1z = VSUB(T1x, T1y);
Chris@82 153 T1A = VADD(T1w, T1z);
Chris@82 154 TY = VADD(Tz, TE);
Chris@82 155 TZ = VADD(TK, TP);
Chris@82 156 T10 = VADD(TY, TZ);
Chris@82 157 T1L = VADD(T1n, T1o);
Chris@82 158 T1M = VADD(T1q, T1r);
Chris@82 159 T1N = VADD(T1L, T1M);
Chris@82 160 T1p = VSUB(T1n, T1o);
Chris@82 161 T1s = VSUB(T1q, T1r);
Chris@82 162 T1t = VADD(T1p, T1s);
Chris@82 163 }
/* Outputs 15 and 5. */
Chris@82 164 T1i = VADD(T4, Tr);
Chris@82 165 T1j = VADD(TX, T10);
Chris@82 166 ST(&(x[WS(rs, 15)]), VFNMSI(T1j, T1i), ms, &(x[WS(rs, 1)]));
Chris@82 167 ST(&(x[WS(rs, 5)]), VFMAI(T1j, T1i), ms, &(x[WS(rs, 1)]));
/* Outputs 0, 8, 12, 4 and 16, built from the T1K..T1Q sums. */
Chris@82 168 {
Chris@82 169 V T1T, T1R, T1S, T1X, T1Z, T1V, T1W, T1Y, T1U;
Chris@82 170 T1T = VSUB(T1N, T1Q);
Chris@82 171 T1R = VADD(T1N, T1Q);
Chris@82 172 T1S = VFNMS(LDK(KP250000000), T1R, T1K);
Chris@82 173 T1V = VSUB(T1L, T1M);
Chris@82 174 T1W = VSUB(T1O, T1P);
Chris@82 175 T1X = VMUL(LDK(KP951056516), VFMA(LDK(KP618033988), T1W, T1V));
Chris@82 176 T1Z = VMUL(LDK(KP951056516), VFNMS(LDK(KP618033988), T1V, T1W));
Chris@82 177 ST(&(x[0]), VADD(T1K, T1R), ms, &(x[0]));
Chris@82 178 T1Y = VFNMS(LDK(KP559016994), T1T, T1S);
Chris@82 179 ST(&(x[WS(rs, 8)]), VFMAI(T1Z, T1Y), ms, &(x[0]));
Chris@82 180 ST(&(x[WS(rs, 12)]), VFNMSI(T1Z, T1Y), ms, &(x[0]));
Chris@82 181 T1U = VFMA(LDK(KP559016994), T1T, T1S);
Chris@82 182 ST(&(x[WS(rs, 4)]), VFNMSI(T1X, T1U), ms, &(x[0]));
Chris@82 183 ST(&(x[WS(rs, 16)]), VFMAI(T1X, T1U), ms, &(x[0]));
Chris@82 184 }
/* Outputs 10, 6, 14, 2 and 18, built from T1m and the T1p..T1A terms. */
Chris@82 185 {
Chris@82 186 V T1D, T1B, T1C, T1H, T1J, T1F, T1G, T1I, T1E;
Chris@82 187 T1D = VSUB(T1t, T1A);
Chris@82 188 T1B = VADD(T1t, T1A);
Chris@82 189 T1C = VFNMS(LDK(KP250000000), T1B, T1m);
Chris@82 190 T1F = VSUB(T1w, T1z);
Chris@82 191 T1G = VSUB(T1p, T1s);
Chris@82 192 T1H = VMUL(LDK(KP951056516), VFNMS(LDK(KP618033988), T1G, T1F));
Chris@82 193 T1J = VMUL(LDK(KP951056516), VFMA(LDK(KP618033988), T1F, T1G));
Chris@82 194 ST(&(x[WS(rs, 10)]), VADD(T1m, T1B), ms, &(x[0]));
Chris@82 195 T1I = VFMA(LDK(KP559016994), T1D, T1C);
Chris@82 196 ST(&(x[WS(rs, 6)]), VFMAI(T1J, T1I), ms, &(x[0]));
Chris@82 197 ST(&(x[WS(rs, 14)]), VFNMSI(T1J, T1I), ms, &(x[0]));
Chris@82 198 T1E = VFNMS(LDK(KP559016994), T1D, T1C);
Chris@82 199 ST(&(x[WS(rs, 2)]), VFNMSI(T1H, T1E), ms, &(x[0]));
Chris@82 200 ST(&(x[WS(rs, 18)]), VFMAI(T1H, T1E), ms, &(x[0]));
Chris@82 201 }
/* Outputs 19, 1, 7, 13, 11, 9, 3 and 17 (the remaining odd-indexed ones). */
Chris@82 202 {
Chris@82 203 V TR, T16, T1e, T1b, T13, T1d, Tu, T1a;
Chris@82 204 TR = VFMA(LDK(KP618033988), TQ, TF);
Chris@82 205 T16 = VFMA(LDK(KP618033988), T15, T14);
Chris@82 206 T1e = VFNMS(LDK(KP618033988), T14, T15);
Chris@82 207 T1b = VFNMS(LDK(KP618033988), TF, TQ);
Chris@82 208 {
Chris@82 209 V T11, T12, Ts, Tt;
Chris@82 210 T11 = VFNMS(LDK(KP250000000), T10, TX);
Chris@82 211 T12 = VSUB(TY, TZ);
Chris@82 212 T13 = VFMA(LDK(KP559016994), T12, T11);
Chris@82 213 T1d = VFNMS(LDK(KP559016994), T12, T11);
Chris@82 214 Ts = VFNMS(LDK(KP250000000), Tr, T4);
Chris@82 215 Tt = VSUB(Tf, Tq);
Chris@82 216 Tu = VFMA(LDK(KP559016994), Tt, Ts);
Chris@82 217 T1a = VFNMS(LDK(KP559016994), Tt, Ts);
Chris@82 218 }
Chris@82 219 {
Chris@82 220 V TS, T17, T1g, T1h;
Chris@82 221 TS = VFNMS(LDK(KP951056516), TR, Tu);
Chris@82 222 T17 = VFMA(LDK(KP951056516), T16, T13);
Chris@82 223 ST(&(x[WS(rs, 19)]), VFNMSI(T17, TS), ms, &(x[WS(rs, 1)]));
Chris@82 224 ST(&(x[WS(rs, 1)]), VFMAI(T17, TS), ms, &(x[WS(rs, 1)]));
Chris@82 225 T1g = VFNMS(LDK(KP951056516), T1b, T1a);
Chris@82 226 T1h = VFMA(LDK(KP951056516), T1e, T1d);
Chris@82 227 ST(&(x[WS(rs, 7)]), VFNMSI(T1h, T1g), ms, &(x[WS(rs, 1)]));
Chris@82 228 ST(&(x[WS(rs, 13)]), VFMAI(T1h, T1g), ms, &(x[WS(rs, 1)]));
Chris@82 229 }
Chris@82 230 {
Chris@82 231 V T18, T19, T1c, T1f;
Chris@82 232 T18 = VFMA(LDK(KP951056516), TR, Tu);
Chris@82 233 T19 = VFNMS(LDK(KP951056516), T16, T13);
Chris@82 234 ST(&(x[WS(rs, 11)]), VFNMSI(T19, T18), ms, &(x[WS(rs, 1)]));
Chris@82 235 ST(&(x[WS(rs, 9)]), VFMAI(T19, T18), ms, &(x[WS(rs, 1)]));
Chris@82 236 T1c = VFMA(LDK(KP951056516), T1b, T1a);
Chris@82 237 T1f = VFNMS(LDK(KP951056516), T1e, T1d);
Chris@82 238 ST(&(x[WS(rs, 3)]), VFNMSI(T1f, T1c), ms, &(x[WS(rs, 1)]));
Chris@82 239 ST(&(x[WS(rs, 17)]), VFMAI(T1f, T1c), ms, &(x[WS(rs, 1)]));
Chris@82 240 }
Chris@82 241 }
Chris@82 242 }
Chris@82 243 }
Chris@82 244 VLEAVE();
Chris@82 245 }
Chris@82 246
/*
 * Twiddle instruction table for the FMA build of t2bv_20: one VTW(0, k) entry
 * for each k = 1..19, followed by a final {TW_NEXT, VL, 0} entry.
 * NOTE(review): presumably these 19 entries describe the 19 twiddle slots that
 * t2bv_20 reads through BYTW (W[0] .. W[TWVL * 36]); VTW and TW_NEXT are
 * defined outside this file -- confirm there.
 */
Chris@82 247 static const tw_instr twinstr[] = {
Chris@82 248 VTW(0, 1),
Chris@82 249 VTW(0, 2),
Chris@82 250 VTW(0, 3),
Chris@82 251 VTW(0, 4),
Chris@82 252 VTW(0, 5),
Chris@82 253 VTW(0, 6),
Chris@82 254 VTW(0, 7),
Chris@82 255 VTW(0, 8),
Chris@82 256 VTW(0, 9),
Chris@82 257 VTW(0, 10),
Chris@82 258 VTW(0, 11),
Chris@82 259 VTW(0, 12),
Chris@82 260 VTW(0, 13),
Chris@82 261 VTW(0, 14),
Chris@82 262 VTW(0, 15),
Chris@82 263 VTW(0, 16),
Chris@82 264 VTW(0, 17),
Chris@82 265 VTW(0, 18),
Chris@82 266 VTW(0, 19),
Chris@82 267 {TW_NEXT, VL, 0}
Chris@82 268 };
Chris@82 269
/*
 * Codelet descriptor for the FMA build: size 20, name "t2bv_20", the twinstr
 * table and &GENUS. The {77, 42, 46, 0} tuple matches the operation counts in
 * the generator comment above (77 additions, 42 multiplications, 46 fused
 * multiply/add). NOTE(review): the meaning of the three trailing zeros is set
 * by the ct_desc declaration, which is not visible here.
 */
Chris@82 270 static const ct_desc desc = { 20, XSIMD_STRING("t2bv_20"), twinstr, &GENUS, {77, 42, 46, 0}, 0, 0, 0 };
Chris@82 271
/*
 * Registration entry point (FMA build): hands the t2bv_20 kernel and its
 * descriptor to the planner p through X(kdft_dit_register).
 */
Chris@82 272 void XSIMD(codelet_t2bv_20) (planner *p) {
Chris@82 273 X(kdft_dit_register) (p, t2bv_20, &desc);
Chris@82 274 }
Chris@82 275 #else
Chris@82 276
Chris@82 277 /* Generated by: ../../../genfft/gen_twiddle_c.native -simd -compact -variables 4 -pipeline-latency 8 -n 20 -name t2bv_20 -include dft/simd/t2b.h -sign 1 */
Chris@82 278
Chris@82 279 /*
Chris@82 280 * This function contains 123 FP additions, 62 FP multiplications,
Chris@82 281 * (or, 111 additions, 50 multiplications, 12 fused multiply/add),
Chris@82 282 * 54 stack variables, 4 constants, and 40 memory accesses
Chris@82 283 */
Chris@82 284 #include "dft/simd/t2b.h"
Chris@82 285
/*
 * t2bv_20, non-FMA variant: compiled in the #else branch, i.e. when neither
 * ARCH_PREFERS_FMA nor ISA_EXTENSION_PREFERS_FMA is defined. Machine-generated
 * by genfft with "-n 20 -sign 1 -simd" (no -fma); do not edit by hand.
 *
 * Each loop iteration loads the 20 elements x[WS(rs, 0)] .. x[WS(rs, 19)],
 * passes every element except x[0] through BYTW with the twiddle slot
 * W[TWVL * 2 * (k - 1)] for element k, combines them, and stores 20 results
 * back to the same 20 locations (the transform is in place).
 *
 * Parameters:
 *   ri      not referenced in the body.
 *   ii      base pointer; every load and store goes through x = ii.
 *   W       twiddle table; offset by mb * ((TWVL / VL) * 38) before the first
 *           iteration and advanced by TWVL * 38 after each iteration.
 *   rs      stride handed to WS(rs, k) to address element k.
 *   mb, me  the loop runs m = mb; m < me; m += VL.
 *   ms      x advances by VL * ms per iteration; also forwarded to LD/ST.
 *
 * NOTE(review): V, LD, ST, BYTW, VADD/VSUB/VMUL, VFMA/VFNMS, VBYI, LDK, DVK,
 * WS, MAKE_VOLATILE_STRIDE and VLEAVE are not defined in this file; they
 * presumably come from the included dft/simd/t2b.h and dft/codelet-dft.h
 * headers -- confirm their exact semantics there.
 */
Chris@82 286 static void t2bv_20(R *ri, R *ii, const R *W, stride rs, INT mb, INT me, INT ms)
Chris@82 287 {
/* Constants; numerically sin(pi/5), sin(2*pi/5), 1/4 and sqrt(5)/4. */
Chris@82 288 DVK(KP587785252, +0.587785252292473129168705954639072768597652438);
Chris@82 289 DVK(KP951056516, +0.951056516295153572116439333379382143405698634);
Chris@82 290 DVK(KP250000000, +0.250000000000000000000000000000000000000000000);
Chris@82 291 DVK(KP559016994, +0.559016994374947424102293417182819058860154590);
Chris@82 292 {
Chris@82 293 INT m;
Chris@82 294 R *x;
Chris@82 295 x = ii;
Chris@82 296 for (m = mb, W = W + (mb * ((TWVL / VL) * 38)); m < me; m = m + VL, x = x + (VL * ms), W = W + (TWVL * 38), MAKE_VOLATILE_STRIDE(20, rs)) {
Chris@82 297 V T4, T10, T1B, T1R, TF, T14, T15, TQ, Tf, Tq, Tr, T1N, T1O, T1P, T1t;
Chris@82 298 V T1w, T1D, TT, TU, T11, T1K, T1L, T1M, T1m, T1p, T1C, T1i, T1j;
/* Inputs 0, 5, 10 and 15; x[0] is the only input not passed through BYTW. */
Chris@82 299 {
Chris@82 300 V T1, TZ, T3, TX, TY, T2, TW, T1z, T1A;
Chris@82 301 T1 = LD(&(x[0]), ms, &(x[0]));
Chris@82 302 TY = LD(&(x[WS(rs, 15)]), ms, &(x[WS(rs, 1)]));
Chris@82 303 TZ = BYTW(&(W[TWVL * 28]), TY);
Chris@82 304 T2 = LD(&(x[WS(rs, 10)]), ms, &(x[0]));
Chris@82 305 T3 = BYTW(&(W[TWVL * 18]), T2);
Chris@82 306 TW = LD(&(x[WS(rs, 5)]), ms, &(x[WS(rs, 1)]));
Chris@82 307 TX = BYTW(&(W[TWVL * 8]), TW);
Chris@82 308 T4 = VSUB(T1, T3);
Chris@82 309 T10 = VSUB(TX, TZ);
Chris@82 310 T1z = VADD(T1, T3);
Chris@82 311 T1A = VADD(TX, TZ);
Chris@82 312 T1B = VSUB(T1z, T1A);
Chris@82 313 T1R = VADD(T1z, T1A);
Chris@82 314 }
/*
 * Remaining 16 inputs, loaded as eight pairs whose indices differ by 10;
 * each pair produces one difference (VSUB) and one sum (VADD).
 */
Chris@82 315 {
Chris@82 316 V T9, T1k, TK, T1s, TP, T1v, Te, T1n, Tk, T1r, Tz, T1l, TE, T1o, Tp;
Chris@82 317 V T1u;
Chris@82 318 {
Chris@82 319 V T6, T8, T5, T7;
Chris@82 320 T5 = LD(&(x[WS(rs, 4)]), ms, &(x[0]));
Chris@82 321 T6 = BYTW(&(W[TWVL * 6]), T5);
Chris@82 322 T7 = LD(&(x[WS(rs, 14)]), ms, &(x[0]));
Chris@82 323 T8 = BYTW(&(W[TWVL * 26]), T7);
Chris@82 324 T9 = VSUB(T6, T8);
Chris@82 325 T1k = VADD(T6, T8);
Chris@82 326 }
Chris@82 327 {
Chris@82 328 V TH, TJ, TG, TI;
Chris@82 329 TG = LD(&(x[WS(rs, 13)]), ms, &(x[WS(rs, 1)]));
Chris@82 330 TH = BYTW(&(W[TWVL * 24]), TG);
Chris@82 331 TI = LD(&(x[WS(rs, 3)]), ms, &(x[WS(rs, 1)]));
Chris@82 332 TJ = BYTW(&(W[TWVL * 4]), TI);
Chris@82 333 TK = VSUB(TH, TJ);
Chris@82 334 T1s = VADD(TH, TJ);
Chris@82 335 }
Chris@82 336 {
Chris@82 337 V TM, TO, TL, TN;
Chris@82 338 TL = LD(&(x[WS(rs, 17)]), ms, &(x[WS(rs, 1)]));
Chris@82 339 TM = BYTW(&(W[TWVL * 32]), TL);
Chris@82 340 TN = LD(&(x[WS(rs, 7)]), ms, &(x[WS(rs, 1)]));
Chris@82 341 TO = BYTW(&(W[TWVL * 12]), TN);
Chris@82 342 TP = VSUB(TM, TO);
Chris@82 343 T1v = VADD(TM, TO);
Chris@82 344 }
Chris@82 345 {
Chris@82 346 V Tb, Td, Ta, Tc;
Chris@82 347 Ta = LD(&(x[WS(rs, 16)]), ms, &(x[0]));
Chris@82 348 Tb = BYTW(&(W[TWVL * 30]), Ta);
Chris@82 349 Tc = LD(&(x[WS(rs, 6)]), ms, &(x[0]));
Chris@82 350 Td = BYTW(&(W[TWVL * 10]), Tc);
Chris@82 351 Te = VSUB(Tb, Td);
Chris@82 352 T1n = VADD(Tb, Td);
Chris@82 353 }
Chris@82 354 {
Chris@82 355 V Th, Tj, Tg, Ti;
Chris@82 356 Tg = LD(&(x[WS(rs, 8)]), ms, &(x[0]));
Chris@82 357 Th = BYTW(&(W[TWVL * 14]), Tg);
Chris@82 358 Ti = LD(&(x[WS(rs, 18)]), ms, &(x[0]));
Chris@82 359 Tj = BYTW(&(W[TWVL * 34]), Ti);
Chris@82 360 Tk = VSUB(Th, Tj);
Chris@82 361 T1r = VADD(Th, Tj);
Chris@82 362 }
Chris@82 363 {
Chris@82 364 V Tw, Ty, Tv, Tx;
Chris@82 365 Tv = LD(&(x[WS(rs, 9)]), ms, &(x[WS(rs, 1)]));
Chris@82 366 Tw = BYTW(&(W[TWVL * 16]), Tv);
Chris@82 367 Tx = LD(&(x[WS(rs, 19)]), ms, &(x[WS(rs, 1)]));
Chris@82 368 Ty = BYTW(&(W[TWVL * 36]), Tx);
Chris@82 369 Tz = VSUB(Tw, Ty);
Chris@82 370 T1l = VADD(Tw, Ty);
Chris@82 371 }
Chris@82 372 {
Chris@82 373 V TB, TD, TA, TC;
Chris@82 374 TA = LD(&(x[WS(rs, 1)]), ms, &(x[WS(rs, 1)]));
Chris@82 375 TB = BYTW(&(W[0]), TA);
Chris@82 376 TC = LD(&(x[WS(rs, 11)]), ms, &(x[WS(rs, 1)]));
Chris@82 377 TD = BYTW(&(W[TWVL * 20]), TC);
Chris@82 378 TE = VSUB(TB, TD);
Chris@82 379 T1o = VADD(TB, TD);
Chris@82 380 }
Chris@82 381 {
Chris@82 382 V Tm, To, Tl, Tn;
Chris@82 383 Tl = LD(&(x[WS(rs, 12)]), ms, &(x[0]));
Chris@82 384 Tm = BYTW(&(W[TWVL * 22]), Tl);
Chris@82 385 Tn = LD(&(x[WS(rs, 2)]), ms, &(x[0]));
Chris@82 386 To = BYTW(&(W[TWVL * 2]), Tn);
Chris@82 387 Tp = VSUB(Tm, To);
Chris@82 388 T1u = VADD(Tm, To);
Chris@82 389 }
/* Second-level sums and differences of the pair results, kept for the output stages. */
Chris@82 390 TF = VSUB(Tz, TE);
Chris@82 391 T14 = VSUB(T9, Te);
Chris@82 392 T15 = VSUB(Tk, Tp);
Chris@82 393 TQ = VSUB(TK, TP);
Chris@82 394 Tf = VADD(T9, Te);
Chris@82 395 Tq = VADD(Tk, Tp);
Chris@82 396 Tr = VADD(Tf, Tq);
Chris@82 397 T1N = VADD(T1r, T1s);
Chris@82 398 T1O = VADD(T1u, T1v);
Chris@82 399 T1P = VADD(T1N, T1O);
Chris@82 400 T1t = VSUB(T1r, T1s);
Chris@82 401 T1w = VSUB(T1u, T1v);
Chris@82 402 T1D = VADD(T1t, T1w);
Chris@82 403 TT = VADD(Tz, TE);
Chris@82 404 TU = VADD(TK, TP);
Chris@82 405 T11 = VADD(TT, TU);
Chris@82 406 T1K = VADD(T1k, T1l);
Chris@82 407 T1L = VADD(T1n, T1o);
Chris@82 408 T1M = VADD(T1K, T1L);
Chris@82 409 T1m = VSUB(T1k, T1l);
Chris@82 410 T1p = VSUB(T1n, T1o);
Chris@82 411 T1C = VADD(T1m, T1p);
Chris@82 412 }
/* Outputs 15 and 5. */
Chris@82 413 T1i = VADD(T4, Tr);
Chris@82 414 T1j = VBYI(VADD(T10, T11));
Chris@82 415 ST(&(x[WS(rs, 15)]), VSUB(T1i, T1j), ms, &(x[WS(rs, 1)]));
Chris@82 416 ST(&(x[WS(rs, 5)]), VADD(T1i, T1j), ms, &(x[WS(rs, 1)]));
/* Outputs 0, 8, 12, 4 and 16, built from T1R and the T1K..T1P sums. */
Chris@82 417 {
Chris@82 418 V T1Q, T1S, T1T, T1X, T1Z, T1V, T1W, T1Y, T1U;
Chris@82 419 T1Q = VMUL(LDK(KP559016994), VSUB(T1M, T1P));
Chris@82 420 T1S = VADD(T1M, T1P);
Chris@82 421 T1T = VFNMS(LDK(KP250000000), T1S, T1R);
Chris@82 422 T1V = VSUB(T1K, T1L);
Chris@82 423 T1W = VSUB(T1N, T1O);
Chris@82 424 T1X = VBYI(VFMA(LDK(KP951056516), T1V, VMUL(LDK(KP587785252), T1W)));
Chris@82 425 T1Z = VBYI(VFNMS(LDK(KP951056516), T1W, VMUL(LDK(KP587785252), T1V)));
Chris@82 426 ST(&(x[0]), VADD(T1R, T1S), ms, &(x[0]));
Chris@82 427 T1Y = VSUB(T1T, T1Q);
Chris@82 428 ST(&(x[WS(rs, 8)]), VSUB(T1Y, T1Z), ms, &(x[0]));
Chris@82 429 ST(&(x[WS(rs, 12)]), VADD(T1Z, T1Y), ms, &(x[0]));
Chris@82 430 T1U = VADD(T1Q, T1T);
Chris@82 431 ST(&(x[WS(rs, 4)]), VSUB(T1U, T1X), ms, &(x[0]));
Chris@82 432 ST(&(x[WS(rs, 16)]), VADD(T1X, T1U), ms, &(x[0]));
Chris@82 433 }
/* Outputs 10, 6, 14, 2 and 18, built from T1B and the T1m..T1D terms. */
Chris@82 434 {
Chris@82 435 V T1G, T1E, T1F, T1y, T1I, T1q, T1x, T1J, T1H;
Chris@82 436 T1G = VMUL(LDK(KP559016994), VSUB(T1C, T1D));
Chris@82 437 T1E = VADD(T1C, T1D);
Chris@82 438 T1F = VFNMS(LDK(KP250000000), T1E, T1B);
Chris@82 439 T1q = VSUB(T1m, T1p);
Chris@82 440 T1x = VSUB(T1t, T1w);
Chris@82 441 T1y = VBYI(VFNMS(LDK(KP951056516), T1x, VMUL(LDK(KP587785252), T1q)));
Chris@82 442 T1I = VBYI(VFMA(LDK(KP951056516), T1q, VMUL(LDK(KP587785252), T1x)));
Chris@82 443 ST(&(x[WS(rs, 10)]), VADD(T1B, T1E), ms, &(x[0]));
Chris@82 444 T1J = VADD(T1G, T1F);
Chris@82 445 ST(&(x[WS(rs, 6)]), VADD(T1I, T1J), ms, &(x[0]));
Chris@82 446 ST(&(x[WS(rs, 14)]), VSUB(T1J, T1I), ms, &(x[0]));
Chris@82 447 T1H = VSUB(T1F, T1G);
Chris@82 448 ST(&(x[WS(rs, 2)]), VADD(T1y, T1H), ms, &(x[0]));
Chris@82 449 ST(&(x[WS(rs, 18)]), VSUB(T1H, T1y), ms, &(x[0]));
Chris@82 450 }
/* Outputs 17, 3, 11, 9, 13, 7, 19 and 1 (the remaining odd-indexed ones). */
Chris@82 451 {
Chris@82 452 V TR, T16, T1d, T1b, T13, T1e, Tu, T1a;
Chris@82 453 TR = VFNMS(LDK(KP951056516), TQ, VMUL(LDK(KP587785252), TF));
Chris@82 454 T16 = VFNMS(LDK(KP951056516), T15, VMUL(LDK(KP587785252), T14));
Chris@82 455 T1d = VFMA(LDK(KP951056516), T14, VMUL(LDK(KP587785252), T15));
Chris@82 456 T1b = VFMA(LDK(KP951056516), TF, VMUL(LDK(KP587785252), TQ));
Chris@82 457 {
Chris@82 458 V TV, T12, Ts, Tt;
Chris@82 459 TV = VMUL(LDK(KP559016994), VSUB(TT, TU));
Chris@82 460 T12 = VFNMS(LDK(KP250000000), T11, T10);
Chris@82 461 T13 = VSUB(TV, T12);
Chris@82 462 T1e = VADD(TV, T12);
Chris@82 463 Ts = VFNMS(LDK(KP250000000), Tr, T4);
Chris@82 464 Tt = VMUL(LDK(KP559016994), VSUB(Tf, Tq));
Chris@82 465 Tu = VSUB(Ts, Tt);
Chris@82 466 T1a = VADD(Tt, Ts);
Chris@82 467 }
Chris@82 468 {
Chris@82 469 V TS, T17, T1g, T1h;
Chris@82 470 TS = VSUB(Tu, TR);
Chris@82 471 T17 = VBYI(VSUB(T13, T16));
Chris@82 472 ST(&(x[WS(rs, 17)]), VSUB(TS, T17), ms, &(x[WS(rs, 1)]));
Chris@82 473 ST(&(x[WS(rs, 3)]), VADD(TS, T17), ms, &(x[WS(rs, 1)]));
Chris@82 474 T1g = VADD(T1a, T1b);
Chris@82 475 T1h = VBYI(VSUB(T1e, T1d));
Chris@82 476 ST(&(x[WS(rs, 11)]), VSUB(T1g, T1h), ms, &(x[WS(rs, 1)]));
Chris@82 477 ST(&(x[WS(rs, 9)]), VADD(T1g, T1h), ms, &(x[WS(rs, 1)]));
Chris@82 478 }
Chris@82 479 {
Chris@82 480 V T18, T19, T1c, T1f;
Chris@82 481 T18 = VADD(Tu, TR);
Chris@82 482 T19 = VBYI(VADD(T16, T13));
Chris@82 483 ST(&(x[WS(rs, 13)]), VSUB(T18, T19), ms, &(x[WS(rs, 1)]));
Chris@82 484 ST(&(x[WS(rs, 7)]), VADD(T18, T19), ms, &(x[WS(rs, 1)]));
Chris@82 485 T1c = VSUB(T1a, T1b);
Chris@82 486 T1f = VBYI(VADD(T1d, T1e));
Chris@82 487 ST(&(x[WS(rs, 19)]), VSUB(T1c, T1f), ms, &(x[WS(rs, 1)]));
Chris@82 488 ST(&(x[WS(rs, 1)]), VADD(T1c, T1f), ms, &(x[WS(rs, 1)]));
Chris@82 489 }
Chris@82 490 }
Chris@82 491 }
Chris@82 492 }
Chris@82 493 VLEAVE();
Chris@82 494 }
Chris@82 495
/*
 * Twiddle instruction table for the non-FMA build of t2bv_20: one VTW(0, k)
 * entry for each k = 1..19, followed by a final {TW_NEXT, VL, 0} entry.
 * NOTE(review): presumably these 19 entries describe the 19 twiddle slots that
 * t2bv_20 reads through BYTW (W[0] .. W[TWVL * 36]); VTW and TW_NEXT are
 * defined outside this file -- confirm there.
 */
Chris@82 496 static const tw_instr twinstr[] = {
Chris@82 497 VTW(0, 1),
Chris@82 498 VTW(0, 2),
Chris@82 499 VTW(0, 3),
Chris@82 500 VTW(0, 4),
Chris@82 501 VTW(0, 5),
Chris@82 502 VTW(0, 6),
Chris@82 503 VTW(0, 7),
Chris@82 504 VTW(0, 8),
Chris@82 505 VTW(0, 9),
Chris@82 506 VTW(0, 10),
Chris@82 507 VTW(0, 11),
Chris@82 508 VTW(0, 12),
Chris@82 509 VTW(0, 13),
Chris@82 510 VTW(0, 14),
Chris@82 511 VTW(0, 15),
Chris@82 512 VTW(0, 16),
Chris@82 513 VTW(0, 17),
Chris@82 514 VTW(0, 18),
Chris@82 515 VTW(0, 19),
Chris@82 516 {TW_NEXT, VL, 0}
Chris@82 517 };
Chris@82 518
/*
 * Codelet descriptor for the non-FMA build: size 20, name "t2bv_20", the
 * twinstr table and &GENUS. The {111, 50, 12, 0} tuple matches the operation
 * counts in the generator comment above (111 additions, 50 multiplications,
 * 12 fused multiply/add). NOTE(review): the meaning of the three trailing
 * zeros is set by the ct_desc declaration, which is not visible here.
 */
Chris@82 519 static const ct_desc desc = { 20, XSIMD_STRING("t2bv_20"), twinstr, &GENUS, {111, 50, 12, 0}, 0, 0, 0 };
Chris@82 520
/*
 * Registration entry point (non-FMA build): hands the t2bv_20 kernel and its
 * descriptor to the planner p through X(kdft_dit_register).
 */
Chris@82 521 void XSIMD(codelet_t2bv_20) (planner *p) {
Chris@82 522 X(kdft_dit_register) (p, t2bv_20, &desc);
Chris@82 523 }
Chris@82 524 #endif