annotate src/fftw-3.3.5/dft/simd/common/t1bv_20.c @ 83:ae30d91d2ffe

Replace these with versions built using an older toolset (so as to avoid ABI incompatibilities when linking on Ubuntu 14.04 for packaging purposes)
author Chris Cannam
date Fri, 07 Feb 2020 11:51:13 +0000
parents 2cd0e3b3e1fd
children
rev   line source
Chris@42 1 /*
Chris@42 2 * Copyright (c) 2003, 2007-14 Matteo Frigo
Chris@42 3 * Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
Chris@42 4 *
Chris@42 5 * This program is free software; you can redistribute it and/or modify
Chris@42 6 * it under the terms of the GNU General Public License as published by
Chris@42 7 * the Free Software Foundation; either version 2 of the License, or
Chris@42 8 * (at your option) any later version.
Chris@42 9 *
Chris@42 10 * This program is distributed in the hope that it will be useful,
Chris@42 11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
Chris@42 12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
Chris@42 13 * GNU General Public License for more details.
Chris@42 14 *
Chris@42 15 * You should have received a copy of the GNU General Public License
Chris@42 16 * along with this program; if not, write to the Free Software
Chris@42 17 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
Chris@42 18 *
Chris@42 19 */
Chris@42 20
Chris@42 21 /* This file was automatically generated --- DO NOT EDIT */
Chris@42 22 /* Generated on Sat Jul 30 16:44:25 EDT 2016 */
Chris@42 23
Chris@42 24 #include "codelet-dft.h"
Chris@42 25
Chris@42 26 #ifdef HAVE_FMA
Chris@42 27
Chris@42 28 /* Generated by: ../../../genfft/gen_twiddle_c.native -fma -reorder-insns -schedule-for-pipeline -simd -compact -variables 4 -pipeline-latency 8 -n 20 -name t1bv_20 -include t1b.h -sign 1 */
Chris@42 29
Chris@42 30 /*
Chris@42 31 * This function contains 123 FP additions, 88 FP multiplications,
Chris@42 32 * (or, 77 additions, 42 multiplications, 46 fused multiply/add),
Chris@42 33 * 68 stack variables, 4 constants, and 40 memory accesses
Chris@42 34 */
Chris@42 35 #include "t1b.h"
Chris@42 36
Chris@42 37 static void t1bv_20(R *ri, R *ii, const R *W, stride rs, INT mb, INT me, INT ms)
Chris@42 38 {
Chris@42 39 DVK(KP951056516, +0.951056516295153572116439333379382143405698634);
Chris@42 40 DVK(KP559016994, +0.559016994374947424102293417182819058860154590);
Chris@42 41 DVK(KP250000000, +0.250000000000000000000000000000000000000000000);
Chris@42 42 DVK(KP618033988, +0.618033988749894848204586834365638117720309180);
Chris@42 43 {
Chris@42 44 INT m;
Chris@42 45 R *x;
Chris@42 46 x = ii;
Chris@42 47 for (m = mb, W = W + (mb * ((TWVL / VL) * 38)); m < me; m = m + VL, x = x + (VL * ms), W = W + (TWVL * 38), MAKE_VOLATILE_STRIDE(20, rs)) {
Chris@42 48 V T4, TX, T1m, T1K, T1y, Tk, Tf, T14, TQ, TZ, T1O, T1w, T1L, T1p, T1M;
Chris@42 49 V T1s, TF, TY, T1x, Tp;
Chris@42 50 {
Chris@42 51 V T1, TV, T2, TT;
Chris@42 52 T1 = LD(&(x[0]), ms, &(x[0]));
Chris@42 53 TV = LD(&(x[WS(rs, 15)]), ms, &(x[WS(rs, 1)]));
Chris@42 54 T2 = LD(&(x[WS(rs, 10)]), ms, &(x[0]));
Chris@42 55 TT = LD(&(x[WS(rs, 5)]), ms, &(x[WS(rs, 1)]));
Chris@42 56 {
Chris@42 57 V T9, T1n, TK, T1v, TP, Te, T1q, T1u, TB, TD, Tm, T1o, Tz, Tn, T1r;
Chris@42 58 V TE, To;
Chris@42 59 {
Chris@42 60 V TM, TO, Ta, Tc;
Chris@42 61 {
Chris@42 62 V T5, T7, TG, TI, T1k, T1l;
Chris@42 63 T5 = LD(&(x[WS(rs, 4)]), ms, &(x[0]));
Chris@42 64 T7 = LD(&(x[WS(rs, 14)]), ms, &(x[0]));
Chris@42 65 TG = LD(&(x[WS(rs, 13)]), ms, &(x[WS(rs, 1)]));
Chris@42 66 TI = LD(&(x[WS(rs, 3)]), ms, &(x[WS(rs, 1)]));
Chris@42 67 {
Chris@42 68 V TW, T3, TU, T6, T8, TH, TJ, TL, TN;
Chris@42 69 TL = LD(&(x[WS(rs, 17)]), ms, &(x[WS(rs, 1)]));
Chris@42 70 TW = BYTW(&(W[TWVL * 28]), TV);
Chris@42 71 T3 = BYTW(&(W[TWVL * 18]), T2);
Chris@42 72 TU = BYTW(&(W[TWVL * 8]), TT);
Chris@42 73 T6 = BYTW(&(W[TWVL * 6]), T5);
Chris@42 74 T8 = BYTW(&(W[TWVL * 26]), T7);
Chris@42 75 TH = BYTW(&(W[TWVL * 24]), TG);
Chris@42 76 TJ = BYTW(&(W[TWVL * 4]), TI);
Chris@42 77 TM = BYTW(&(W[TWVL * 32]), TL);
Chris@42 78 TN = LD(&(x[WS(rs, 7)]), ms, &(x[WS(rs, 1)]));
Chris@42 79 T4 = VSUB(T1, T3);
Chris@42 80 T1k = VADD(T1, T3);
Chris@42 81 TX = VSUB(TU, TW);
Chris@42 82 T1l = VADD(TU, TW);
Chris@42 83 T9 = VSUB(T6, T8);
Chris@42 84 T1n = VADD(T6, T8);
Chris@42 85 TK = VSUB(TH, TJ);
Chris@42 86 T1v = VADD(TH, TJ);
Chris@42 87 TO = BYTW(&(W[TWVL * 12]), TN);
Chris@42 88 }
Chris@42 89 Ta = LD(&(x[WS(rs, 16)]), ms, &(x[0]));
Chris@42 90 T1m = VSUB(T1k, T1l);
Chris@42 91 T1K = VADD(T1k, T1l);
Chris@42 92 Tc = LD(&(x[WS(rs, 6)]), ms, &(x[0]));
Chris@42 93 }
Chris@42 94 {
Chris@42 95 V Tb, Tx, Td, Th, Tj, Tw, Tg, Ti, Tv;
Chris@42 96 Tg = LD(&(x[WS(rs, 8)]), ms, &(x[0]));
Chris@42 97 Ti = LD(&(x[WS(rs, 18)]), ms, &(x[0]));
Chris@42 98 Tv = LD(&(x[WS(rs, 9)]), ms, &(x[WS(rs, 1)]));
Chris@42 99 TP = VSUB(TM, TO);
Chris@42 100 T1y = VADD(TM, TO);
Chris@42 101 Tb = BYTW(&(W[TWVL * 30]), Ta);
Chris@42 102 Tx = LD(&(x[WS(rs, 19)]), ms, &(x[WS(rs, 1)]));
Chris@42 103 Td = BYTW(&(W[TWVL * 10]), Tc);
Chris@42 104 Th = BYTW(&(W[TWVL * 14]), Tg);
Chris@42 105 Tj = BYTW(&(W[TWVL * 34]), Ti);
Chris@42 106 Tw = BYTW(&(W[TWVL * 16]), Tv);
Chris@42 107 {
Chris@42 108 V TA, TC, Ty, Tl;
Chris@42 109 TA = LD(&(x[WS(rs, 1)]), ms, &(x[WS(rs, 1)]));
Chris@42 110 TC = LD(&(x[WS(rs, 11)]), ms, &(x[WS(rs, 1)]));
Chris@42 111 Tl = LD(&(x[WS(rs, 12)]), ms, &(x[0]));
Chris@42 112 Ty = BYTW(&(W[TWVL * 36]), Tx);
Chris@42 113 Te = VSUB(Tb, Td);
Chris@42 114 T1q = VADD(Tb, Td);
Chris@42 115 Tk = VSUB(Th, Tj);
Chris@42 116 T1u = VADD(Th, Tj);
Chris@42 117 TB = BYTW(&(W[0]), TA);
Chris@42 118 TD = BYTW(&(W[TWVL * 20]), TC);
Chris@42 119 Tm = BYTW(&(W[TWVL * 22]), Tl);
Chris@42 120 T1o = VADD(Tw, Ty);
Chris@42 121 Tz = VSUB(Tw, Ty);
Chris@42 122 Tn = LD(&(x[WS(rs, 2)]), ms, &(x[0]));
Chris@42 123 }
Chris@42 124 }
Chris@42 125 }
Chris@42 126 Tf = VADD(T9, Te);
Chris@42 127 T14 = VSUB(T9, Te);
Chris@42 128 TQ = VSUB(TK, TP);
Chris@42 129 TZ = VADD(TK, TP);
Chris@42 130 T1r = VADD(TB, TD);
Chris@42 131 TE = VSUB(TB, TD);
Chris@42 132 T1O = VADD(T1u, T1v);
Chris@42 133 T1w = VSUB(T1u, T1v);
Chris@42 134 To = BYTW(&(W[TWVL * 2]), Tn);
Chris@42 135 T1L = VADD(T1n, T1o);
Chris@42 136 T1p = VSUB(T1n, T1o);
Chris@42 137 T1M = VADD(T1q, T1r);
Chris@42 138 T1s = VSUB(T1q, T1r);
Chris@42 139 TF = VSUB(Tz, TE);
Chris@42 140 TY = VADD(Tz, TE);
Chris@42 141 T1x = VADD(Tm, To);
Chris@42 142 Tp = VSUB(Tm, To);
Chris@42 143 }
Chris@42 144 }
Chris@42 145 {
Chris@42 146 V T1V, T1N, T12, T1b, TR, T1G, T1t, T1z, T1P, Tq, T15, T11, T1j, T10;
Chris@42 147 T1V = VSUB(T1L, T1M);
Chris@42 148 T1N = VADD(T1L, T1M);
Chris@42 149 T12 = VSUB(TY, TZ);
Chris@42 150 T10 = VADD(TY, TZ);
Chris@42 151 T1b = VFNMS(LDK(KP618033988), TF, TQ);
Chris@42 152 TR = VFMA(LDK(KP618033988), TQ, TF);
Chris@42 153 T1G = VSUB(T1p, T1s);
Chris@42 154 T1t = VADD(T1p, T1s);
Chris@42 155 T1z = VSUB(T1x, T1y);
Chris@42 156 T1P = VADD(T1x, T1y);
Chris@42 157 Tq = VADD(Tk, Tp);
Chris@42 158 T15 = VSUB(Tk, Tp);
Chris@42 159 T11 = VFNMS(LDK(KP250000000), T10, TX);
Chris@42 160 T1j = VADD(TX, T10);
Chris@42 161 {
Chris@42 162 V T1J, T1H, T1D, T1Z, T1X, T1T, T1f, T1h, T19, T17, T1C, T1S, T1a, Tu, T1F;
Chris@42 163 V T1A;
Chris@42 164 T1F = VSUB(T1w, T1z);
Chris@42 165 T1A = VADD(T1w, T1z);
Chris@42 166 {
Chris@42 167 V T1W, T1Q, Tt, Tr;
Chris@42 168 T1W = VSUB(T1O, T1P);
Chris@42 169 T1Q = VADD(T1O, T1P);
Chris@42 170 Tt = VSUB(Tf, Tq);
Chris@42 171 Tr = VADD(Tf, Tq);
Chris@42 172 {
Chris@42 173 V T1e, T16, T1d, T13;
Chris@42 174 T1e = VFNMS(LDK(KP618033988), T14, T15);
Chris@42 175 T16 = VFMA(LDK(KP618033988), T15, T14);
Chris@42 176 T1d = VFNMS(LDK(KP559016994), T12, T11);
Chris@42 177 T13 = VFMA(LDK(KP559016994), T12, T11);
Chris@42 178 T1J = VMUL(LDK(KP951056516), VFMA(LDK(KP618033988), T1F, T1G));
Chris@42 179 T1H = VMUL(LDK(KP951056516), VFNMS(LDK(KP618033988), T1G, T1F));
Chris@42 180 {
Chris@42 181 V T1B, T1R, Ts, T1i;
Chris@42 182 T1B = VADD(T1t, T1A);
Chris@42 183 T1D = VSUB(T1t, T1A);
Chris@42 184 T1Z = VMUL(LDK(KP951056516), VFNMS(LDK(KP618033988), T1V, T1W));
Chris@42 185 T1X = VMUL(LDK(KP951056516), VFMA(LDK(KP618033988), T1W, T1V));
Chris@42 186 T1R = VADD(T1N, T1Q);
Chris@42 187 T1T = VSUB(T1N, T1Q);
Chris@42 188 Ts = VFNMS(LDK(KP250000000), Tr, T4);
Chris@42 189 T1i = VADD(T4, Tr);
Chris@42 190 T1f = VFNMS(LDK(KP951056516), T1e, T1d);
Chris@42 191 T1h = VFMA(LDK(KP951056516), T1e, T1d);
Chris@42 192 T19 = VFNMS(LDK(KP951056516), T16, T13);
Chris@42 193 T17 = VFMA(LDK(KP951056516), T16, T13);
Chris@42 194 ST(&(x[WS(rs, 10)]), VADD(T1m, T1B), ms, &(x[0]));
Chris@42 195 T1C = VFNMS(LDK(KP250000000), T1B, T1m);
Chris@42 196 ST(&(x[0]), VADD(T1K, T1R), ms, &(x[0]));
Chris@42 197 T1S = VFNMS(LDK(KP250000000), T1R, T1K);
Chris@42 198 T1a = VFNMS(LDK(KP559016994), Tt, Ts);
Chris@42 199 Tu = VFMA(LDK(KP559016994), Tt, Ts);
Chris@42 200 ST(&(x[WS(rs, 5)]), VFMAI(T1j, T1i), ms, &(x[WS(rs, 1)]));
Chris@42 201 ST(&(x[WS(rs, 15)]), VFNMSI(T1j, T1i), ms, &(x[WS(rs, 1)]));
Chris@42 202 }
Chris@42 203 }
Chris@42 204 }
Chris@42 205 {
Chris@42 206 V T1E, T1I, T1U, T1Y;
Chris@42 207 T1E = VFNMS(LDK(KP559016994), T1D, T1C);
Chris@42 208 T1I = VFMA(LDK(KP559016994), T1D, T1C);
Chris@42 209 T1U = VFMA(LDK(KP559016994), T1T, T1S);
Chris@42 210 T1Y = VFNMS(LDK(KP559016994), T1T, T1S);
Chris@42 211 {
Chris@42 212 V T1c, T1g, T18, TS;
Chris@42 213 T1c = VFMA(LDK(KP951056516), T1b, T1a);
Chris@42 214 T1g = VFNMS(LDK(KP951056516), T1b, T1a);
Chris@42 215 T18 = VFMA(LDK(KP951056516), TR, Tu);
Chris@42 216 TS = VFNMS(LDK(KP951056516), TR, Tu);
Chris@42 217 ST(&(x[WS(rs, 18)]), VFMAI(T1H, T1E), ms, &(x[0]));
Chris@42 218 ST(&(x[WS(rs, 2)]), VFNMSI(T1H, T1E), ms, &(x[0]));
Chris@42 219 ST(&(x[WS(rs, 14)]), VFNMSI(T1J, T1I), ms, &(x[0]));
Chris@42 220 ST(&(x[WS(rs, 6)]), VFMAI(T1J, T1I), ms, &(x[0]));
Chris@42 221 ST(&(x[WS(rs, 16)]), VFMAI(T1X, T1U), ms, &(x[0]));
Chris@42 222 ST(&(x[WS(rs, 4)]), VFNMSI(T1X, T1U), ms, &(x[0]));
Chris@42 223 ST(&(x[WS(rs, 12)]), VFNMSI(T1Z, T1Y), ms, &(x[0]));
Chris@42 224 ST(&(x[WS(rs, 8)]), VFMAI(T1Z, T1Y), ms, &(x[0]));
Chris@42 225 ST(&(x[WS(rs, 17)]), VFMAI(T1f, T1c), ms, &(x[WS(rs, 1)]));
Chris@42 226 ST(&(x[WS(rs, 3)]), VFNMSI(T1f, T1c), ms, &(x[WS(rs, 1)]));
Chris@42 227 ST(&(x[WS(rs, 13)]), VFMAI(T1h, T1g), ms, &(x[WS(rs, 1)]));
Chris@42 228 ST(&(x[WS(rs, 7)]), VFNMSI(T1h, T1g), ms, &(x[WS(rs, 1)]));
Chris@42 229 ST(&(x[WS(rs, 9)]), VFMAI(T19, T18), ms, &(x[WS(rs, 1)]));
Chris@42 230 ST(&(x[WS(rs, 11)]), VFNMSI(T19, T18), ms, &(x[WS(rs, 1)]));
Chris@42 231 ST(&(x[WS(rs, 1)]), VFMAI(T17, TS), ms, &(x[WS(rs, 1)]));
Chris@42 232 ST(&(x[WS(rs, 19)]), VFNMSI(T17, TS), ms, &(x[WS(rs, 1)]));
Chris@42 233 }
Chris@42 234 }
Chris@42 235 }
Chris@42 236 }
Chris@42 237 }
Chris@42 238 }
Chris@42 239 VLEAVE();
Chris@42 240 }
Chris@42 241
Chris@42 242 static const tw_instr twinstr[] = {
Chris@42 243 VTW(0, 1),
Chris@42 244 VTW(0, 2),
Chris@42 245 VTW(0, 3),
Chris@42 246 VTW(0, 4),
Chris@42 247 VTW(0, 5),
Chris@42 248 VTW(0, 6),
Chris@42 249 VTW(0, 7),
Chris@42 250 VTW(0, 8),
Chris@42 251 VTW(0, 9),
Chris@42 252 VTW(0, 10),
Chris@42 253 VTW(0, 11),
Chris@42 254 VTW(0, 12),
Chris@42 255 VTW(0, 13),
Chris@42 256 VTW(0, 14),
Chris@42 257 VTW(0, 15),
Chris@42 258 VTW(0, 16),
Chris@42 259 VTW(0, 17),
Chris@42 260 VTW(0, 18),
Chris@42 261 VTW(0, 19),
Chris@42 262 {TW_NEXT, VL, 0}
Chris@42 263 };
Chris@42 264
Chris@42 265 static const ct_desc desc = { 20, XSIMD_STRING("t1bv_20"), twinstr, &GENUS, {77, 42, 46, 0}, 0, 0, 0 };
Chris@42 266
Chris@42 267 void XSIMD(codelet_t1bv_20) (planner *p) {
Chris@42 268 X(kdft_dit_register) (p, t1bv_20, &desc);
Chris@42 269 }
Chris@42 270 #else /* HAVE_FMA */
Chris@42 271
Chris@42 272 /* Generated by: ../../../genfft/gen_twiddle_c.native -simd -compact -variables 4 -pipeline-latency 8 -n 20 -name t1bv_20 -include t1b.h -sign 1 */
Chris@42 273
Chris@42 274 /*
Chris@42 275 * This function contains 123 FP additions, 62 FP multiplications,
Chris@42 276 * (or, 111 additions, 50 multiplications, 12 fused multiply/add),
Chris@42 277 * 54 stack variables, 4 constants, and 40 memory accesses
Chris@42 278 */
Chris@42 279 #include "t1b.h"
Chris@42 280
Chris@42 281 static void t1bv_20(R *ri, R *ii, const R *W, stride rs, INT mb, INT me, INT ms)
Chris@42 282 {
Chris@42 283 DVK(KP587785252, +0.587785252292473129168705954639072768597652438);
Chris@42 284 DVK(KP951056516, +0.951056516295153572116439333379382143405698634);
Chris@42 285 DVK(KP250000000, +0.250000000000000000000000000000000000000000000);
Chris@42 286 DVK(KP559016994, +0.559016994374947424102293417182819058860154590);
Chris@42 287 {
Chris@42 288 INT m;
Chris@42 289 R *x;
Chris@42 290 x = ii;
Chris@42 291 for (m = mb, W = W + (mb * ((TWVL / VL) * 38)); m < me; m = m + VL, x = x + (VL * ms), W = W + (TWVL * 38), MAKE_VOLATILE_STRIDE(20, rs)) {
Chris@42 292 V T4, T10, T1B, T1R, TF, T14, T15, TQ, Tf, Tq, Tr, T1N, T1O, T1P, T1t;
Chris@42 293 V T1w, T1D, TT, TU, T11, T1K, T1L, T1M, T1m, T1p, T1C, T1i, T1j;
Chris@42 294 {
Chris@42 295 V T1, TZ, T3, TX, TY, T2, TW, T1z, T1A;
Chris@42 296 T1 = LD(&(x[0]), ms, &(x[0]));
Chris@42 297 TY = LD(&(x[WS(rs, 15)]), ms, &(x[WS(rs, 1)]));
Chris@42 298 TZ = BYTW(&(W[TWVL * 28]), TY);
Chris@42 299 T2 = LD(&(x[WS(rs, 10)]), ms, &(x[0]));
Chris@42 300 T3 = BYTW(&(W[TWVL * 18]), T2);
Chris@42 301 TW = LD(&(x[WS(rs, 5)]), ms, &(x[WS(rs, 1)]));
Chris@42 302 TX = BYTW(&(W[TWVL * 8]), TW);
Chris@42 303 T4 = VSUB(T1, T3);
Chris@42 304 T10 = VSUB(TX, TZ);
Chris@42 305 T1z = VADD(T1, T3);
Chris@42 306 T1A = VADD(TX, TZ);
Chris@42 307 T1B = VSUB(T1z, T1A);
Chris@42 308 T1R = VADD(T1z, T1A);
Chris@42 309 }
Chris@42 310 {
Chris@42 311 V T9, T1k, TK, T1s, TP, T1v, Te, T1n, Tk, T1r, Tz, T1l, TE, T1o, Tp;
Chris@42 312 V T1u;
Chris@42 313 {
Chris@42 314 V T6, T8, T5, T7;
Chris@42 315 T5 = LD(&(x[WS(rs, 4)]), ms, &(x[0]));
Chris@42 316 T6 = BYTW(&(W[TWVL * 6]), T5);
Chris@42 317 T7 = LD(&(x[WS(rs, 14)]), ms, &(x[0]));
Chris@42 318 T8 = BYTW(&(W[TWVL * 26]), T7);
Chris@42 319 T9 = VSUB(T6, T8);
Chris@42 320 T1k = VADD(T6, T8);
Chris@42 321 }
Chris@42 322 {
Chris@42 323 V TH, TJ, TG, TI;
Chris@42 324 TG = LD(&(x[WS(rs, 13)]), ms, &(x[WS(rs, 1)]));
Chris@42 325 TH = BYTW(&(W[TWVL * 24]), TG);
Chris@42 326 TI = LD(&(x[WS(rs, 3)]), ms, &(x[WS(rs, 1)]));
Chris@42 327 TJ = BYTW(&(W[TWVL * 4]), TI);
Chris@42 328 TK = VSUB(TH, TJ);
Chris@42 329 T1s = VADD(TH, TJ);
Chris@42 330 }
Chris@42 331 {
Chris@42 332 V TM, TO, TL, TN;
Chris@42 333 TL = LD(&(x[WS(rs, 17)]), ms, &(x[WS(rs, 1)]));
Chris@42 334 TM = BYTW(&(W[TWVL * 32]), TL);
Chris@42 335 TN = LD(&(x[WS(rs, 7)]), ms, &(x[WS(rs, 1)]));
Chris@42 336 TO = BYTW(&(W[TWVL * 12]), TN);
Chris@42 337 TP = VSUB(TM, TO);
Chris@42 338 T1v = VADD(TM, TO);
Chris@42 339 }
Chris@42 340 {
Chris@42 341 V Tb, Td, Ta, Tc;
Chris@42 342 Ta = LD(&(x[WS(rs, 16)]), ms, &(x[0]));
Chris@42 343 Tb = BYTW(&(W[TWVL * 30]), Ta);
Chris@42 344 Tc = LD(&(x[WS(rs, 6)]), ms, &(x[0]));
Chris@42 345 Td = BYTW(&(W[TWVL * 10]), Tc);
Chris@42 346 Te = VSUB(Tb, Td);
Chris@42 347 T1n = VADD(Tb, Td);
Chris@42 348 }
Chris@42 349 {
Chris@42 350 V Th, Tj, Tg, Ti;
Chris@42 351 Tg = LD(&(x[WS(rs, 8)]), ms, &(x[0]));
Chris@42 352 Th = BYTW(&(W[TWVL * 14]), Tg);
Chris@42 353 Ti = LD(&(x[WS(rs, 18)]), ms, &(x[0]));
Chris@42 354 Tj = BYTW(&(W[TWVL * 34]), Ti);
Chris@42 355 Tk = VSUB(Th, Tj);
Chris@42 356 T1r = VADD(Th, Tj);
Chris@42 357 }
Chris@42 358 {
Chris@42 359 V Tw, Ty, Tv, Tx;
Chris@42 360 Tv = LD(&(x[WS(rs, 9)]), ms, &(x[WS(rs, 1)]));
Chris@42 361 Tw = BYTW(&(W[TWVL * 16]), Tv);
Chris@42 362 Tx = LD(&(x[WS(rs, 19)]), ms, &(x[WS(rs, 1)]));
Chris@42 363 Ty = BYTW(&(W[TWVL * 36]), Tx);
Chris@42 364 Tz = VSUB(Tw, Ty);
Chris@42 365 T1l = VADD(Tw, Ty);
Chris@42 366 }
Chris@42 367 {
Chris@42 368 V TB, TD, TA, TC;
Chris@42 369 TA = LD(&(x[WS(rs, 1)]), ms, &(x[WS(rs, 1)]));
Chris@42 370 TB = BYTW(&(W[0]), TA);
Chris@42 371 TC = LD(&(x[WS(rs, 11)]), ms, &(x[WS(rs, 1)]));
Chris@42 372 TD = BYTW(&(W[TWVL * 20]), TC);
Chris@42 373 TE = VSUB(TB, TD);
Chris@42 374 T1o = VADD(TB, TD);
Chris@42 375 }
Chris@42 376 {
Chris@42 377 V Tm, To, Tl, Tn;
Chris@42 378 Tl = LD(&(x[WS(rs, 12)]), ms, &(x[0]));
Chris@42 379 Tm = BYTW(&(W[TWVL * 22]), Tl);
Chris@42 380 Tn = LD(&(x[WS(rs, 2)]), ms, &(x[0]));
Chris@42 381 To = BYTW(&(W[TWVL * 2]), Tn);
Chris@42 382 Tp = VSUB(Tm, To);
Chris@42 383 T1u = VADD(Tm, To);
Chris@42 384 }
Chris@42 385 TF = VSUB(Tz, TE);
Chris@42 386 T14 = VSUB(T9, Te);
Chris@42 387 T15 = VSUB(Tk, Tp);
Chris@42 388 TQ = VSUB(TK, TP);
Chris@42 389 Tf = VADD(T9, Te);
Chris@42 390 Tq = VADD(Tk, Tp);
Chris@42 391 Tr = VADD(Tf, Tq);
Chris@42 392 T1N = VADD(T1r, T1s);
Chris@42 393 T1O = VADD(T1u, T1v);
Chris@42 394 T1P = VADD(T1N, T1O);
Chris@42 395 T1t = VSUB(T1r, T1s);
Chris@42 396 T1w = VSUB(T1u, T1v);
Chris@42 397 T1D = VADD(T1t, T1w);
Chris@42 398 TT = VADD(Tz, TE);
Chris@42 399 TU = VADD(TK, TP);
Chris@42 400 T11 = VADD(TT, TU);
Chris@42 401 T1K = VADD(T1k, T1l);
Chris@42 402 T1L = VADD(T1n, T1o);
Chris@42 403 T1M = VADD(T1K, T1L);
Chris@42 404 T1m = VSUB(T1k, T1l);
Chris@42 405 T1p = VSUB(T1n, T1o);
Chris@42 406 T1C = VADD(T1m, T1p);
Chris@42 407 }
Chris@42 408 T1i = VADD(T4, Tr);
Chris@42 409 T1j = VBYI(VADD(T10, T11));
Chris@42 410 ST(&(x[WS(rs, 15)]), VSUB(T1i, T1j), ms, &(x[WS(rs, 1)]));
Chris@42 411 ST(&(x[WS(rs, 5)]), VADD(T1i, T1j), ms, &(x[WS(rs, 1)]));
Chris@42 412 {
Chris@42 413 V T1Q, T1S, T1T, T1X, T1Z, T1V, T1W, T1Y, T1U;
Chris@42 414 T1Q = VMUL(LDK(KP559016994), VSUB(T1M, T1P));
Chris@42 415 T1S = VADD(T1M, T1P);
Chris@42 416 T1T = VFNMS(LDK(KP250000000), T1S, T1R);
Chris@42 417 T1V = VSUB(T1K, T1L);
Chris@42 418 T1W = VSUB(T1N, T1O);
Chris@42 419 T1X = VBYI(VFMA(LDK(KP951056516), T1V, VMUL(LDK(KP587785252), T1W)));
Chris@42 420 T1Z = VBYI(VFNMS(LDK(KP951056516), T1W, VMUL(LDK(KP587785252), T1V)));
Chris@42 421 ST(&(x[0]), VADD(T1R, T1S), ms, &(x[0]));
Chris@42 422 T1Y = VSUB(T1T, T1Q);
Chris@42 423 ST(&(x[WS(rs, 8)]), VSUB(T1Y, T1Z), ms, &(x[0]));
Chris@42 424 ST(&(x[WS(rs, 12)]), VADD(T1Z, T1Y), ms, &(x[0]));
Chris@42 425 T1U = VADD(T1Q, T1T);
Chris@42 426 ST(&(x[WS(rs, 4)]), VSUB(T1U, T1X), ms, &(x[0]));
Chris@42 427 ST(&(x[WS(rs, 16)]), VADD(T1X, T1U), ms, &(x[0]));
Chris@42 428 }
Chris@42 429 {
Chris@42 430 V T1G, T1E, T1F, T1y, T1I, T1q, T1x, T1J, T1H;
Chris@42 431 T1G = VMUL(LDK(KP559016994), VSUB(T1C, T1D));
Chris@42 432 T1E = VADD(T1C, T1D);
Chris@42 433 T1F = VFNMS(LDK(KP250000000), T1E, T1B);
Chris@42 434 T1q = VSUB(T1m, T1p);
Chris@42 435 T1x = VSUB(T1t, T1w);
Chris@42 436 T1y = VBYI(VFNMS(LDK(KP951056516), T1x, VMUL(LDK(KP587785252), T1q)));
Chris@42 437 T1I = VBYI(VFMA(LDK(KP951056516), T1q, VMUL(LDK(KP587785252), T1x)));
Chris@42 438 ST(&(x[WS(rs, 10)]), VADD(T1B, T1E), ms, &(x[0]));
Chris@42 439 T1J = VADD(T1G, T1F);
Chris@42 440 ST(&(x[WS(rs, 6)]), VADD(T1I, T1J), ms, &(x[0]));
Chris@42 441 ST(&(x[WS(rs, 14)]), VSUB(T1J, T1I), ms, &(x[0]));
Chris@42 442 T1H = VSUB(T1F, T1G);
Chris@42 443 ST(&(x[WS(rs, 2)]), VADD(T1y, T1H), ms, &(x[0]));
Chris@42 444 ST(&(x[WS(rs, 18)]), VSUB(T1H, T1y), ms, &(x[0]));
Chris@42 445 }
Chris@42 446 {
Chris@42 447 V TR, T16, T1d, T1b, T13, T1e, Tu, T1a;
Chris@42 448 TR = VFNMS(LDK(KP951056516), TQ, VMUL(LDK(KP587785252), TF));
Chris@42 449 T16 = VFNMS(LDK(KP951056516), T15, VMUL(LDK(KP587785252), T14));
Chris@42 450 T1d = VFMA(LDK(KP951056516), T14, VMUL(LDK(KP587785252), T15));
Chris@42 451 T1b = VFMA(LDK(KP951056516), TF, VMUL(LDK(KP587785252), TQ));
Chris@42 452 {
Chris@42 453 V TV, T12, Ts, Tt;
Chris@42 454 TV = VMUL(LDK(KP559016994), VSUB(TT, TU));
Chris@42 455 T12 = VFNMS(LDK(KP250000000), T11, T10);
Chris@42 456 T13 = VSUB(TV, T12);
Chris@42 457 T1e = VADD(TV, T12);
Chris@42 458 Ts = VFNMS(LDK(KP250000000), Tr, T4);
Chris@42 459 Tt = VMUL(LDK(KP559016994), VSUB(Tf, Tq));
Chris@42 460 Tu = VSUB(Ts, Tt);
Chris@42 461 T1a = VADD(Tt, Ts);
Chris@42 462 }
Chris@42 463 {
Chris@42 464 V TS, T17, T1g, T1h;
Chris@42 465 TS = VSUB(Tu, TR);
Chris@42 466 T17 = VBYI(VSUB(T13, T16));
Chris@42 467 ST(&(x[WS(rs, 17)]), VSUB(TS, T17), ms, &(x[WS(rs, 1)]));
Chris@42 468 ST(&(x[WS(rs, 3)]), VADD(TS, T17), ms, &(x[WS(rs, 1)]));
Chris@42 469 T1g = VADD(T1a, T1b);
Chris@42 470 T1h = VBYI(VSUB(T1e, T1d));
Chris@42 471 ST(&(x[WS(rs, 11)]), VSUB(T1g, T1h), ms, &(x[WS(rs, 1)]));
Chris@42 472 ST(&(x[WS(rs, 9)]), VADD(T1g, T1h), ms, &(x[WS(rs, 1)]));
Chris@42 473 }
Chris@42 474 {
Chris@42 475 V T18, T19, T1c, T1f;
Chris@42 476 T18 = VADD(Tu, TR);
Chris@42 477 T19 = VBYI(VADD(T16, T13));
Chris@42 478 ST(&(x[WS(rs, 13)]), VSUB(T18, T19), ms, &(x[WS(rs, 1)]));
Chris@42 479 ST(&(x[WS(rs, 7)]), VADD(T18, T19), ms, &(x[WS(rs, 1)]));
Chris@42 480 T1c = VSUB(T1a, T1b);
Chris@42 481 T1f = VBYI(VADD(T1d, T1e));
Chris@42 482 ST(&(x[WS(rs, 19)]), VSUB(T1c, T1f), ms, &(x[WS(rs, 1)]));
Chris@42 483 ST(&(x[WS(rs, 1)]), VADD(T1c, T1f), ms, &(x[WS(rs, 1)]));
Chris@42 484 }
Chris@42 485 }
Chris@42 486 }
Chris@42 487 }
Chris@42 488 VLEAVE();
Chris@42 489 }
Chris@42 490
Chris@42 491 static const tw_instr twinstr[] = {
Chris@42 492 VTW(0, 1),
Chris@42 493 VTW(0, 2),
Chris@42 494 VTW(0, 3),
Chris@42 495 VTW(0, 4),
Chris@42 496 VTW(0, 5),
Chris@42 497 VTW(0, 6),
Chris@42 498 VTW(0, 7),
Chris@42 499 VTW(0, 8),
Chris@42 500 VTW(0, 9),
Chris@42 501 VTW(0, 10),
Chris@42 502 VTW(0, 11),
Chris@42 503 VTW(0, 12),
Chris@42 504 VTW(0, 13),
Chris@42 505 VTW(0, 14),
Chris@42 506 VTW(0, 15),
Chris@42 507 VTW(0, 16),
Chris@42 508 VTW(0, 17),
Chris@42 509 VTW(0, 18),
Chris@42 510 VTW(0, 19),
Chris@42 511 {TW_NEXT, VL, 0}
Chris@42 512 };
Chris@42 513
Chris@42 514 static const ct_desc desc = { 20, XSIMD_STRING("t1bv_20"), twinstr, &GENUS, {111, 50, 12, 0}, 0, 0, 0 };
Chris@42 515
Chris@42 516 void XSIMD(codelet_t1bv_20) (planner *p) {
Chris@42 517 X(kdft_dit_register) (p, t1bv_20, &desc);
Chris@42 518 }
Chris@42 519 #endif /* HAVE_FMA */