annotate src/fftw-3.3.5/dft/simd/common/t1fv_20.c @ 83:ae30d91d2ffe

Replace these with versions built using an older toolset (so as to avoid ABI incompatibilities when linking on Ubuntu 14.04 for packaging purposes)
author Chris Cannam
date Fri, 07 Feb 2020 11:51:13 +0000
parents 2cd0e3b3e1fd
/*
 * Copyright (c) 2003, 2007-14 Matteo Frigo
 * Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 *
 */

/* This file was automatically generated --- DO NOT EDIT */
/* Generated on Sat Jul 30 16:43:07 EDT 2016 */

#include "codelet-dft.h"

#ifdef HAVE_FMA

/* Generated by: ../../../genfft/gen_twiddle_c.native -fma -reorder-insns -schedule-for-pipeline -simd -compact -variables 4 -pipeline-latency 8 -n 20 -name t1fv_20 -include t1f.h */

/*
 * This function contains 123 FP additions, 88 FP multiplications,
 * (or, 77 additions, 42 multiplications, 46 fused multiply/add),
 * 68 stack variables, 4 constants, and 40 memory accesses
 */
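/* Editorial note: the two tallies above are consistent, since each fused
   multiply/add is counted once as an addition and once as a multiplication:
   77 + 46 = 123 FP additions and 42 + 46 = 88 FP multiplications. */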
#include "t1f.h"

static void t1fv_20(R *ri, R *ii, const R *W, stride rs, INT mb, INT me, INT ms)
{
     DVK(KP951056516, +0.951056516295153572116439333379382143405698634);
     DVK(KP559016994, +0.559016994374947424102293417182819058860154590);
     DVK(KP250000000, +0.250000000000000000000000000000000000000000000);
     DVK(KP618033988, +0.618033988749894848204586834365638117720309180);
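     /* Editorial note: these are the standard radix-5 twiddle constants,
        given to 45 digits: KP951056516 = sin(2*pi/5), KP559016994 = sqrt(5)/4,
        KP250000000 = 1/4, and KP618033988 = (sqrt(5)-1)/2, which also equals
        sin(pi/5)/sin(2*pi/5). */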
     {
          INT m;
          R *x;
          x = ri;
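          /* Editorial note: this is the vectorized loop over the transforms
             handled by the codelet: m advances by VL (the SIMD vector length)
             per iteration, the data pointer x by VL * ms, and the twiddle
             pointer W by TWVL * 38 -- presumably the 19 complex twiddle
             factors of the size-20 transform, two reals each. */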
          for (m = mb, W = W + (mb * ((TWVL / VL) * 38)); m < me; m = m + VL, x = x + (VL * ms), W = W + (TWVL * 38), MAKE_VOLATILE_STRIDE(20, rs)) {
               V T4, Tx, T1m, T1K, T1y, Tk, Tf, T16, T10, TT, T1O, T1w, T1L, T1p, T1M;
               V T1s, TZ, TI, T1x, Tp;
               {
                    V T1, Tv, T2, Tt;
                    T1 = LD(&(x[0]), ms, &(x[0]));
                    Tv = LD(&(x[WS(rs, 15)]), ms, &(x[WS(rs, 1)]));
                    T2 = LD(&(x[WS(rs, 10)]), ms, &(x[0]));
                    Tt = LD(&(x[WS(rs, 5)]), ms, &(x[WS(rs, 1)]));
                    {
                         V T9, T1n, TN, T1v, TS, Te, T1q, T1u, TE, TG, Tm, T1o, TC, Tn, T1r;
                         V TH, To;
                         {
                              V TP, TR, Ta, Tc;
                              {
                                   V T5, T7, TJ, TL, T1k, T1l;
                                   T5 = LD(&(x[WS(rs, 4)]), ms, &(x[0]));
                                   T7 = LD(&(x[WS(rs, 14)]), ms, &(x[0]));
                                   TJ = LD(&(x[WS(rs, 13)]), ms, &(x[WS(rs, 1)]));
                                   TL = LD(&(x[WS(rs, 3)]), ms, &(x[WS(rs, 1)]));
                                   {
                                        V Tw, T3, Tu, T6, T8, TK, TM, TO, TQ;
                                        TO = LD(&(x[WS(rs, 17)]), ms, &(x[WS(rs, 1)]));
                                        Tw = BYTWJ(&(W[TWVL * 28]), Tv);
                                        T3 = BYTWJ(&(W[TWVL * 18]), T2);
                                        Tu = BYTWJ(&(W[TWVL * 8]), Tt);
                                        T6 = BYTWJ(&(W[TWVL * 6]), T5);
                                        T8 = BYTWJ(&(W[TWVL * 26]), T7);
                                        TK = BYTWJ(&(W[TWVL * 24]), TJ);
                                        TM = BYTWJ(&(W[TWVL * 4]), TL);
                                        TP = BYTWJ(&(W[TWVL * 32]), TO);
                                        TQ = LD(&(x[WS(rs, 7)]), ms, &(x[WS(rs, 1)]));
                                        T4 = VSUB(T1, T3);
                                        T1k = VADD(T1, T3);
                                        Tx = VSUB(Tu, Tw);
                                        T1l = VADD(Tu, Tw);
                                        T9 = VSUB(T6, T8);
                                        T1n = VADD(T6, T8);
                                        TN = VSUB(TK, TM);
                                        T1v = VADD(TK, TM);
                                        TR = BYTWJ(&(W[TWVL * 12]), TQ);
                                   }
                                   Ta = LD(&(x[WS(rs, 16)]), ms, &(x[0]));
                                   T1m = VSUB(T1k, T1l);
                                   T1K = VADD(T1k, T1l);
                                   Tc = LD(&(x[WS(rs, 6)]), ms, &(x[0]));
                              }
                              {
                                   V Tb, TA, Td, Th, Tj, Tz, Tg, Ti, Ty;
                                   Tg = LD(&(x[WS(rs, 8)]), ms, &(x[0]));
                                   Ti = LD(&(x[WS(rs, 18)]), ms, &(x[0]));
                                   Ty = LD(&(x[WS(rs, 9)]), ms, &(x[WS(rs, 1)]));
                                   TS = VSUB(TP, TR);
                                   T1y = VADD(TP, TR);
                                   Tb = BYTWJ(&(W[TWVL * 30]), Ta);
                                   TA = LD(&(x[WS(rs, 19)]), ms, &(x[WS(rs, 1)]));
                                   Td = BYTWJ(&(W[TWVL * 10]), Tc);
                                   Th = BYTWJ(&(W[TWVL * 14]), Tg);
                                   Tj = BYTWJ(&(W[TWVL * 34]), Ti);
                                   Tz = BYTWJ(&(W[TWVL * 16]), Ty);
                                   {
                                        V TD, TF, TB, Tl;
                                        TD = LD(&(x[WS(rs, 1)]), ms, &(x[WS(rs, 1)]));
                                        TF = LD(&(x[WS(rs, 11)]), ms, &(x[WS(rs, 1)]));
                                        Tl = LD(&(x[WS(rs, 12)]), ms, &(x[0]));
                                        TB = BYTWJ(&(W[TWVL * 36]), TA);
                                        Te = VSUB(Tb, Td);
                                        T1q = VADD(Tb, Td);
                                        Tk = VSUB(Th, Tj);
                                        T1u = VADD(Th, Tj);
                                        TE = BYTWJ(&(W[0]), TD);
                                        TG = BYTWJ(&(W[TWVL * 20]), TF);
                                        Tm = BYTWJ(&(W[TWVL * 22]), Tl);
                                        T1o = VADD(Tz, TB);
                                        TC = VSUB(Tz, TB);
                                        Tn = LD(&(x[WS(rs, 2)]), ms, &(x[0]));
                                   }
                              }
                         }
                         Tf = VADD(T9, Te);
                         T16 = VSUB(T9, Te);
                         T10 = VSUB(TS, TN);
                         TT = VADD(TN, TS);
                         T1r = VADD(TE, TG);
                         TH = VSUB(TE, TG);
                         T1O = VADD(T1u, T1v);
                         T1w = VSUB(T1u, T1v);
                         To = BYTWJ(&(W[TWVL * 2]), Tn);
                         T1L = VADD(T1n, T1o);
                         T1p = VSUB(T1n, T1o);
                         T1M = VADD(T1q, T1r);
                         T1s = VSUB(T1q, T1r);
                         TZ = VSUB(TH, TC);
                         TI = VADD(TC, TH);
                         T1x = VADD(Tm, To);
                         Tp = VSUB(Tm, To);
                    }
               }
               {
                    V T1V, T1N, T14, T1d, T11, T1G, T1t, T1z, T1P, Tq, T17, T13, TV, TU;
                    T1V = VSUB(T1L, T1M);
                    T1N = VADD(T1L, T1M);
                    T14 = VSUB(TT, TI);
                    TU = VADD(TI, TT);
                    T1d = VFNMS(LDK(KP618033988), TZ, T10);
                    T11 = VFMA(LDK(KP618033988), T10, TZ);
                    T1G = VSUB(T1p, T1s);
                    T1t = VADD(T1p, T1s);
                    T1z = VSUB(T1x, T1y);
                    T1P = VADD(T1x, T1y);
                    Tq = VADD(Tk, Tp);
                    T17 = VSUB(Tk, Tp);
                    T13 = VFNMS(LDK(KP250000000), TU, Tx);
                    TV = VADD(Tx, TU);
                    {
                         V T1J, T1H, T1D, T1Z, T1X, T1T, T1h, T1j, T1b, T19, T1C, T1S, T1c, TY, T1F;
                         V T1A;
                         T1F = VSUB(T1w, T1z);
                         T1A = VADD(T1w, T1z);
                         {
                              V T1W, T1Q, TX, Tr;
                              T1W = VSUB(T1O, T1P);
                              T1Q = VADD(T1O, T1P);
                              TX = VSUB(Tf, Tq);
                              Tr = VADD(Tf, Tq);
                              {
                                   V T1g, T18, T1f, T15;
                                   T1g = VFNMS(LDK(KP618033988), T16, T17);
                                   T18 = VFMA(LDK(KP618033988), T17, T16);
                                   T1f = VFMA(LDK(KP559016994), T14, T13);
                                   T15 = VFNMS(LDK(KP559016994), T14, T13);
                                   T1J = VMUL(LDK(KP951056516), VFMA(LDK(KP618033988), T1F, T1G));
                                   T1H = VMUL(LDK(KP951056516), VFNMS(LDK(KP618033988), T1G, T1F));
                                   {
                                        V T1B, T1R, TW, Ts;
                                        T1B = VADD(T1t, T1A);
                                        T1D = VSUB(T1t, T1A);
                                        T1Z = VMUL(LDK(KP951056516), VFNMS(LDK(KP618033988), T1V, T1W));
                                        T1X = VMUL(LDK(KP951056516), VFMA(LDK(KP618033988), T1W, T1V));
                                        T1R = VADD(T1N, T1Q);
                                        T1T = VSUB(T1N, T1Q);
                                        TW = VFNMS(LDK(KP250000000), Tr, T4);
                                        Ts = VADD(T4, Tr);
                                        T1h = VFNMS(LDK(KP951056516), T1g, T1f);
                                        T1j = VFMA(LDK(KP951056516), T1g, T1f);
                                        T1b = VFNMS(LDK(KP951056516), T18, T15);
                                        T19 = VFMA(LDK(KP951056516), T18, T15);
                                        ST(&(x[WS(rs, 10)]), VADD(T1m, T1B), ms, &(x[0]));
                                        T1C = VFNMS(LDK(KP250000000), T1B, T1m);
                                        ST(&(x[0]), VADD(T1K, T1R), ms, &(x[0]));
                                        T1S = VFNMS(LDK(KP250000000), T1R, T1K);
                                        T1c = VFNMS(LDK(KP559016994), TX, TW);
                                        TY = VFMA(LDK(KP559016994), TX, TW);
                                        ST(&(x[WS(rs, 15)]), VFMAI(TV, Ts), ms, &(x[WS(rs, 1)]));
                                        ST(&(x[WS(rs, 5)]), VFNMSI(TV, Ts), ms, &(x[WS(rs, 1)]));
                                   }
                              }
                         }
                         {
                              V T1E, T1I, T1U, T1Y;
                              T1E = VFNMS(LDK(KP559016994), T1D, T1C);
                              T1I = VFMA(LDK(KP559016994), T1D, T1C);
                              T1U = VFMA(LDK(KP559016994), T1T, T1S);
                              T1Y = VFNMS(LDK(KP559016994), T1T, T1S);
                              {
                                   V T1e, T1i, T1a, T12;
                                   T1e = VFNMS(LDK(KP951056516), T1d, T1c);
                                   T1i = VFMA(LDK(KP951056516), T1d, T1c);
                                   T1a = VFNMS(LDK(KP951056516), T11, TY);
                                   T12 = VFMA(LDK(KP951056516), T11, TY);
                                   ST(&(x[WS(rs, 18)]), VFNMSI(T1H, T1E), ms, &(x[0]));
                                   ST(&(x[WS(rs, 2)]), VFMAI(T1H, T1E), ms, &(x[0]));
                                   ST(&(x[WS(rs, 14)]), VFMAI(T1J, T1I), ms, &(x[0]));
                                   ST(&(x[WS(rs, 6)]), VFNMSI(T1J, T1I), ms, &(x[0]));
                                   ST(&(x[WS(rs, 16)]), VFNMSI(T1X, T1U), ms, &(x[0]));
                                   ST(&(x[WS(rs, 4)]), VFMAI(T1X, T1U), ms, &(x[0]));
                                   ST(&(x[WS(rs, 12)]), VFMAI(T1Z, T1Y), ms, &(x[0]));
                                   ST(&(x[WS(rs, 8)]), VFNMSI(T1Z, T1Y), ms, &(x[0]));
                                   ST(&(x[WS(rs, 3)]), VFMAI(T1h, T1e), ms, &(x[WS(rs, 1)]));
                                   ST(&(x[WS(rs, 17)]), VFNMSI(T1h, T1e), ms, &(x[WS(rs, 1)]));
                                   ST(&(x[WS(rs, 7)]), VFMAI(T1j, T1i), ms, &(x[WS(rs, 1)]));
                                   ST(&(x[WS(rs, 13)]), VFNMSI(T1j, T1i), ms, &(x[WS(rs, 1)]));
                                   ST(&(x[WS(rs, 11)]), VFMAI(T1b, T1a), ms, &(x[WS(rs, 1)]));
                                   ST(&(x[WS(rs, 9)]), VFNMSI(T1b, T1a), ms, &(x[WS(rs, 1)]));
                                   ST(&(x[WS(rs, 19)]), VFMAI(T19, T12), ms, &(x[WS(rs, 1)]));
                                   ST(&(x[WS(rs, 1)]), VFNMSI(T19, T12), ms, &(x[WS(rs, 1)]));
                              }
                         }
                    }
               }
          }
     }
     VLEAVE();
}

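/* Editorial note: the table below lists one nontrivial twiddle factor for
   each index 1..19 of the size-20 transform (index 0 needs none, its twiddle
   being 1), matching the per-iteration stride of TWVL * 38 in the loop
   above. */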
static const tw_instr twinstr[] = {
     VTW(0, 1),
     VTW(0, 2),
     VTW(0, 3),
     VTW(0, 4),
     VTW(0, 5),
     VTW(0, 6),
     VTW(0, 7),
     VTW(0, 8),
     VTW(0, 9),
     VTW(0, 10),
     VTW(0, 11),
     VTW(0, 12),
     VTW(0, 13),
     VTW(0, 14),
     VTW(0, 15),
     VTW(0, 16),
     VTW(0, 17),
     VTW(0, 18),
     VTW(0, 19),
     {TW_NEXT, VL, 0}
};

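/* Editorial note: the {77, 42, 46, 0} initializer appears to be the
   descriptor's operation-count field, mirroring the add/mul/fma tallies
   quoted in the comment at the top of this branch. */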
static const ct_desc desc = { 20, XSIMD_STRING("t1fv_20"), twinstr, &GENUS, {77, 42, 46, 0}, 0, 0, 0 };

void XSIMD(codelet_t1fv_20) (planner *p) {
     X(kdft_dit_register) (p, t1fv_20, &desc);
}
#else /* HAVE_FMA */

/* Generated by: ../../../genfft/gen_twiddle_c.native -simd -compact -variables 4 -pipeline-latency 8 -n 20 -name t1fv_20 -include t1f.h */

/*
 * This function contains 123 FP additions, 62 FP multiplications,
 * (or, 111 additions, 50 multiplications, 12 fused multiply/add),
 * 54 stack variables, 4 constants, and 40 memory accesses
 */
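/* Editorial note: as in the FMA branch, the tallies are consistent:
   111 + 12 = 123 FP additions and 50 + 12 = 62 FP multiplications. */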
#include "t1f.h"

static void t1fv_20(R *ri, R *ii, const R *W, stride rs, INT mb, INT me, INT ms)
{
     DVK(KP587785252, +0.587785252292473129168705954639072768597652438);
     DVK(KP951056516, +0.951056516295153572116439333379382143405698634);
     DVK(KP250000000, +0.250000000000000000000000000000000000000000000);
     DVK(KP559016994, +0.559016994374947424102293417182819058860154590);
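     /* Editorial note: KP587785252 = sin(pi/5); the other three constants
        are as described in the HAVE_FMA branch above. */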
     {
          INT m;
          R *x;
          x = ri;
          for (m = mb, W = W + (mb * ((TWVL / VL) * 38)); m < me; m = m + VL, x = x + (VL * ms), W = W + (TWVL * 38), MAKE_VOLATILE_STRIDE(20, rs)) {
               V T4, Tx, T1B, T1U, TZ, T16, T17, T10, Tf, Tq, Tr, T1N, T1O, T1S, T1t;
               V T1w, T1C, TI, TT, TU, T1K, T1L, T1R, T1m, T1p, T1D, Ts, TV;
               {
                    V T1, Tw, T3, Tu, Tv, T2, Tt, T1z, T1A;
                    T1 = LD(&(x[0]), ms, &(x[0]));
                    Tv = LD(&(x[WS(rs, 15)]), ms, &(x[WS(rs, 1)]));
                    Tw = BYTWJ(&(W[TWVL * 28]), Tv);
                    T2 = LD(&(x[WS(rs, 10)]), ms, &(x[0]));
                    T3 = BYTWJ(&(W[TWVL * 18]), T2);
                    Tt = LD(&(x[WS(rs, 5)]), ms, &(x[WS(rs, 1)]));
                    Tu = BYTWJ(&(W[TWVL * 8]), Tt);
                    T4 = VSUB(T1, T3);
                    Tx = VSUB(Tu, Tw);
                    T1z = VADD(T1, T3);
                    T1A = VADD(Tu, Tw);
                    T1B = VSUB(T1z, T1A);
                    T1U = VADD(T1z, T1A);
               }
               {
                    V T9, T1r, TN, T1l, TS, T1o, Te, T1u, Tk, T1k, TC, T1s, TH, T1v, Tp;
                    V T1n;
                    {
                         V T6, T8, T5, T7;
                         T5 = LD(&(x[WS(rs, 4)]), ms, &(x[0]));
                         T6 = BYTWJ(&(W[TWVL * 6]), T5);
                         T7 = LD(&(x[WS(rs, 14)]), ms, &(x[0]));
                         T8 = BYTWJ(&(W[TWVL * 26]), T7);
                         T9 = VSUB(T6, T8);
                         T1r = VADD(T6, T8);
                    }
                    {
                         V TK, TM, TJ, TL;
                         TJ = LD(&(x[WS(rs, 13)]), ms, &(x[WS(rs, 1)]));
                         TK = BYTWJ(&(W[TWVL * 24]), TJ);
                         TL = LD(&(x[WS(rs, 3)]), ms, &(x[WS(rs, 1)]));
                         TM = BYTWJ(&(W[TWVL * 4]), TL);
                         TN = VSUB(TK, TM);
                         T1l = VADD(TK, TM);
                    }
                    {
                         V TP, TR, TO, TQ;
                         TO = LD(&(x[WS(rs, 17)]), ms, &(x[WS(rs, 1)]));
                         TP = BYTWJ(&(W[TWVL * 32]), TO);
                         TQ = LD(&(x[WS(rs, 7)]), ms, &(x[WS(rs, 1)]));
                         TR = BYTWJ(&(W[TWVL * 12]), TQ);
                         TS = VSUB(TP, TR);
                         T1o = VADD(TP, TR);
                    }
                    {
                         V Tb, Td, Ta, Tc;
                         Ta = LD(&(x[WS(rs, 16)]), ms, &(x[0]));
                         Tb = BYTWJ(&(W[TWVL * 30]), Ta);
                         Tc = LD(&(x[WS(rs, 6)]), ms, &(x[0]));
                         Td = BYTWJ(&(W[TWVL * 10]), Tc);
                         Te = VSUB(Tb, Td);
                         T1u = VADD(Tb, Td);
                    }
                    {
                         V Th, Tj, Tg, Ti;
                         Tg = LD(&(x[WS(rs, 8)]), ms, &(x[0]));
                         Th = BYTWJ(&(W[TWVL * 14]), Tg);
                         Ti = LD(&(x[WS(rs, 18)]), ms, &(x[0]));
                         Tj = BYTWJ(&(W[TWVL * 34]), Ti);
                         Tk = VSUB(Th, Tj);
                         T1k = VADD(Th, Tj);
                    }
                    {
                         V Tz, TB, Ty, TA;
                         Ty = LD(&(x[WS(rs, 9)]), ms, &(x[WS(rs, 1)]));
                         Tz = BYTWJ(&(W[TWVL * 16]), Ty);
                         TA = LD(&(x[WS(rs, 19)]), ms, &(x[WS(rs, 1)]));
                         TB = BYTWJ(&(W[TWVL * 36]), TA);
                         TC = VSUB(Tz, TB);
                         T1s = VADD(Tz, TB);
                    }
                    {
                         V TE, TG, TD, TF;
                         TD = LD(&(x[WS(rs, 1)]), ms, &(x[WS(rs, 1)]));
                         TE = BYTWJ(&(W[0]), TD);
                         TF = LD(&(x[WS(rs, 11)]), ms, &(x[WS(rs, 1)]));
                         TG = BYTWJ(&(W[TWVL * 20]), TF);
                         TH = VSUB(TE, TG);
                         T1v = VADD(TE, TG);
                    }
                    {
                         V Tm, To, Tl, Tn;
                         Tl = LD(&(x[WS(rs, 12)]), ms, &(x[0]));
                         Tm = BYTWJ(&(W[TWVL * 22]), Tl);
                         Tn = LD(&(x[WS(rs, 2)]), ms, &(x[0]));
                         To = BYTWJ(&(W[TWVL * 2]), Tn);
                         Tp = VSUB(Tm, To);
                         T1n = VADD(Tm, To);
                    }
                    TZ = VSUB(TH, TC);
                    T16 = VSUB(T9, Te);
                    T17 = VSUB(Tk, Tp);
                    T10 = VSUB(TS, TN);
                    Tf = VADD(T9, Te);
                    Tq = VADD(Tk, Tp);
                    Tr = VADD(Tf, Tq);
                    T1N = VADD(T1k, T1l);
                    T1O = VADD(T1n, T1o);
                    T1S = VADD(T1N, T1O);
                    T1t = VSUB(T1r, T1s);
                    T1w = VSUB(T1u, T1v);
                    T1C = VADD(T1t, T1w);
                    TI = VADD(TC, TH);
                    TT = VADD(TN, TS);
                    TU = VADD(TI, TT);
                    T1K = VADD(T1r, T1s);
                    T1L = VADD(T1u, T1v);
                    T1R = VADD(T1K, T1L);
                    T1m = VSUB(T1k, T1l);
                    T1p = VSUB(T1n, T1o);
                    T1D = VADD(T1m, T1p);
               }
               Ts = VADD(T4, Tr);
               TV = VBYI(VADD(Tx, TU));
               ST(&(x[WS(rs, 5)]), VSUB(Ts, TV), ms, &(x[WS(rs, 1)]));
               ST(&(x[WS(rs, 15)]), VADD(Ts, TV), ms, &(x[WS(rs, 1)]));
               {
                    V T1T, T1V, T1W, T1Q, T1Z, T1M, T1P, T1Y, T1X;
                    T1T = VMUL(LDK(KP559016994), VSUB(T1R, T1S));
                    T1V = VADD(T1R, T1S);
                    T1W = VFNMS(LDK(KP250000000), T1V, T1U);
                    T1M = VSUB(T1K, T1L);
                    T1P = VSUB(T1N, T1O);
                    T1Q = VBYI(VFMA(LDK(KP951056516), T1M, VMUL(LDK(KP587785252), T1P)));
                    T1Z = VBYI(VFNMS(LDK(KP587785252), T1M, VMUL(LDK(KP951056516), T1P)));
                    ST(&(x[0]), VADD(T1U, T1V), ms, &(x[0]));
                    T1Y = VSUB(T1W, T1T);
                    ST(&(x[WS(rs, 8)]), VSUB(T1Y, T1Z), ms, &(x[0]));
                    ST(&(x[WS(rs, 12)]), VADD(T1Z, T1Y), ms, &(x[0]));
                    T1X = VADD(T1T, T1W);
                    ST(&(x[WS(rs, 4)]), VADD(T1Q, T1X), ms, &(x[0]));
                    ST(&(x[WS(rs, 16)]), VSUB(T1X, T1Q), ms, &(x[0]));
               }
               {
                    V T1G, T1E, T1F, T1y, T1J, T1q, T1x, T1I, T1H;
                    T1G = VMUL(LDK(KP559016994), VSUB(T1C, T1D));
                    T1E = VADD(T1C, T1D);
                    T1F = VFNMS(LDK(KP250000000), T1E, T1B);
                    T1q = VSUB(T1m, T1p);
                    T1x = VSUB(T1t, T1w);
                    T1y = VBYI(VFNMS(LDK(KP587785252), T1x, VMUL(LDK(KP951056516), T1q)));
                    T1J = VBYI(VFMA(LDK(KP951056516), T1x, VMUL(LDK(KP587785252), T1q)));
                    ST(&(x[WS(rs, 10)]), VADD(T1B, T1E), ms, &(x[0]));
                    T1I = VADD(T1G, T1F);
                    ST(&(x[WS(rs, 6)]), VSUB(T1I, T1J), ms, &(x[0]));
                    ST(&(x[WS(rs, 14)]), VADD(T1J, T1I), ms, &(x[0]));
                    T1H = VSUB(T1F, T1G);
                    ST(&(x[WS(rs, 2)]), VADD(T1y, T1H), ms, &(x[0]));
                    ST(&(x[WS(rs, 18)]), VSUB(T1H, T1y), ms, &(x[0]));
               }
               {
                    V T11, T18, T1g, T1d, T15, T1f, TY, T1c;
                    T11 = VFMA(LDK(KP951056516), TZ, VMUL(LDK(KP587785252), T10));
                    T18 = VFMA(LDK(KP951056516), T16, VMUL(LDK(KP587785252), T17));
                    T1g = VFNMS(LDK(KP587785252), T16, VMUL(LDK(KP951056516), T17));
                    T1d = VFNMS(LDK(KP587785252), TZ, VMUL(LDK(KP951056516), T10));
                    {
                         V T13, T14, TW, TX;
                         T13 = VFMS(LDK(KP250000000), TU, Tx);
                         T14 = VMUL(LDK(KP559016994), VSUB(TT, TI));
                         T15 = VADD(T13, T14);
                         T1f = VSUB(T14, T13);
                         TW = VMUL(LDK(KP559016994), VSUB(Tf, Tq));
                         TX = VFNMS(LDK(KP250000000), Tr, T4);
                         TY = VADD(TW, TX);
                         T1c = VSUB(TX, TW);
                    }
                    {
                         V T12, T19, T1i, T1j;
                         T12 = VADD(TY, T11);
                         T19 = VBYI(VSUB(T15, T18));
                         ST(&(x[WS(rs, 19)]), VSUB(T12, T19), ms, &(x[WS(rs, 1)]));
                         ST(&(x[WS(rs, 1)]), VADD(T12, T19), ms, &(x[WS(rs, 1)]));
                         T1i = VADD(T1c, T1d);
                         T1j = VBYI(VADD(T1g, T1f));
                         ST(&(x[WS(rs, 13)]), VSUB(T1i, T1j), ms, &(x[WS(rs, 1)]));
                         ST(&(x[WS(rs, 7)]), VADD(T1i, T1j), ms, &(x[WS(rs, 1)]));
                    }
                    {
                         V T1a, T1b, T1e, T1h;
                         T1a = VSUB(TY, T11);
                         T1b = VBYI(VADD(T18, T15));
                         ST(&(x[WS(rs, 11)]), VSUB(T1a, T1b), ms, &(x[WS(rs, 1)]));
                         ST(&(x[WS(rs, 9)]), VADD(T1a, T1b), ms, &(x[WS(rs, 1)]));
                         T1e = VSUB(T1c, T1d);
                         T1h = VBYI(VSUB(T1f, T1g));
                         ST(&(x[WS(rs, 17)]), VSUB(T1e, T1h), ms, &(x[WS(rs, 1)]));
                         ST(&(x[WS(rs, 3)]), VADD(T1e, T1h), ms, &(x[WS(rs, 1)]));
                    }
               }
          }
     }
     VLEAVE();
}

static const tw_instr twinstr[] = {
     VTW(0, 1),
     VTW(0, 2),
     VTW(0, 3),
     VTW(0, 4),
     VTW(0, 5),
     VTW(0, 6),
     VTW(0, 7),
     VTW(0, 8),
     VTW(0, 9),
     VTW(0, 10),
     VTW(0, 11),
     VTW(0, 12),
     VTW(0, 13),
     VTW(0, 14),
     VTW(0, 15),
     VTW(0, 16),
     VTW(0, 17),
     VTW(0, 18),
     VTW(0, 19),
     {TW_NEXT, VL, 0}
};

static const ct_desc desc = { 20, XSIMD_STRING("t1fv_20"), twinstr, &GENUS, {111, 50, 12, 0}, 0, 0, 0 };

void XSIMD(codelet_t1fv_20) (planner *p) {
     X(kdft_dit_register) (p, t1fv_20, &desc);
}
#endif /* HAVE_FMA */