annotate src/fftw-3.3.8/dft/simd/common/t3fv_20.c @ 83:ae30d91d2ffe

Replace these with versions built using an older toolset (so as to avoid ABI incompatibilities when linking on Ubuntu 14.04 for packaging purposes)
author Chris Cannam
date Fri, 07 Feb 2020 11:51:13 +0000
parents d0c2a83c1364
children
rev   line source
Chris@82 1 /*
Chris@82 2 * Copyright (c) 2003, 2007-14 Matteo Frigo
Chris@82 3 * Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
Chris@82 4 *
Chris@82 5 * This program is free software; you can redistribute it and/or modify
Chris@82 6 * it under the terms of the GNU General Public License as published by
Chris@82 7 * the Free Software Foundation; either version 2 of the License, or
Chris@82 8 * (at your option) any later version.
Chris@82 9 *
Chris@82 10 * This program is distributed in the hope that it will be useful,
Chris@82 11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
Chris@82 12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
Chris@82 13 * GNU General Public License for more details.
Chris@82 14 *
Chris@82 15 * You should have received a copy of the GNU General Public License
Chris@82 16 * along with this program; if not, write to the Free Software
Chris@82 17 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
Chris@82 18 *
Chris@82 19 */
Chris@82 20
Chris@82 21 /* This file was automatically generated --- DO NOT EDIT */
Chris@82 22 /* Generated on Thu May 24 08:05:54 EDT 2018 */
Chris@82 23
Chris@82 24 #include "dft/codelet-dft.h"
Chris@82 25
Chris@82 26 #if defined(ARCH_PREFERS_FMA) || defined(ISA_EXTENSION_PREFERS_FMA)
Chris@82 27
Chris@82 28 /* Generated by: ../../../genfft/gen_twiddle_c.native -fma -simd -compact -variables 4 -pipeline-latency 8 -twiddle-log3 -precompute-twiddles -no-generate-bytw -n 20 -name t3fv_20 -include dft/simd/t3f.h */
Chris@82 29
Chris@82 30 /*
Chris@82 31 * This function contains 138 FP additions, 118 FP multiplications,
Chris@82 32 * (or, 92 additions, 72 multiplications, 46 fused multiply/add),
Chris@82 33 * 73 stack variables, 4 constants, and 40 memory accesses
Chris@82 34 */
Chris@82 35 #include "dft/simd/t3f.h"
Chris@82 36
/*
 * 20-point DIT (decimation-in-time) twiddle codelet, FMA variant.
 * Machine-generated by genfft (see the "Generated by" comment above) --
 * do NOT hand-edit the dataflow.
 *
 * ri/ii: base pointers of the transform data; only ri is addressed here --
 *        presumably the LD/ST macros from dft/simd/t3f.h take care of the
 *        interleaved imaginary part (NOTE(review): confirm in t3f.h).
 * W:     twiddle-factor array; the loop advances it by TWVL * 8 per
 *        iteration.  Only four factors are stored per transform (see the
 *        -twiddle-log3 -precompute-twiddles generator flags); the rest are
 *        derived below via VZMUL/VZMULJ.
 * rs:    stride between the 20 samples of one transform.
 * mb/me: half-open range [mb, me) of transform indices handled here; each
 *        loop iteration processes VL transforms (SIMD vector width).
 * ms:    stride between successive transforms.
 */
static void t3fv_20(R *ri, R *ii, const R *W, stride rs, INT mb, INT me, INT ms)
{
     DVK(KP559016994, +0.559016994374947424102293417182819058860154590);
     DVK(KP618033988, +0.618033988749894848204586834365638117720309180);
     DVK(KP951056516, +0.951056516295153572116439333379382143405698634);
     DVK(KP250000000, +0.250000000000000000000000000000000000000000000);
     {
	  INT m;
	  R *x;
	  x = ri;
	  for (m = mb, W = W + (mb * ((TWVL / VL) * 8)); m < me; m = m + VL, x = x + (VL * ms), W = W + (TWVL * 8), MAKE_VOLATILE_STRIDE(20, rs)) {
	       V T2, T8, T9, TA, T3, Tc, T4, TZ, T18, Tl, Tq, Tx, TU, Td, Te;
	       V T15, Ti, Tt, TJ;
	       /* Derive the full twiddle set from the four stored factors
		  W[0], W[2*TWVL], W[4*TWVL], W[6*TWVL] via complex
		  multiply (VZMUL) and conjugate multiply (VZMULJ). */
	       T2 = LDW(&(W[0]));
	       T8 = LDW(&(W[TWVL * 2]));
	       T9 = VZMUL(T2, T8);
	       TA = VZMULJ(T2, T8);
	       T3 = LDW(&(W[TWVL * 4]));
	       Tc = VZMULJ(T9, T3);
	       T4 = VZMUL(T2, T3);
	       TZ = VZMUL(T9, T3);
	       T18 = VZMULJ(TA, T3);
	       Tl = VZMULJ(T8, T3);
	       Tq = VZMULJ(T2, T3);
	       Tx = VZMUL(T8, T3);
	       TU = VZMUL(TA, T3);
	       Td = LDW(&(W[TWVL * 6]));
	       Te = VZMULJ(Tc, Td);
	       T15 = VZMULJ(TA, Td);
	       Ti = VZMULJ(T8, Td);
	       Tt = VZMULJ(T2, Td);
	       TJ = VZMULJ(T9, Td);
	       {
		    V T7, TM, T1F, T23, T1i, T1p, T1q, T1j, Tp, TE, TF, T27, T28, T29, T1P;
		    V T1S, T1T, TY, T1c, T1d, T24, T25, T26, T1I, T1L, T1M, TG, T1e;
		    /* Inputs 0, 5, 10, 15: load, twiddle, and form the
		       pairwise sums/differences used by every output. */
		    {
			 V T1, TL, T6, TI, TK, T5, TH, T1D, T1E;
			 T1 = LD(&(x[0]), ms, &(x[0]));
			 TK = LD(&(x[WS(rs, 15)]), ms, &(x[WS(rs, 1)]));
			 TL = VZMULJ(TJ, TK);
			 T5 = LD(&(x[WS(rs, 10)]), ms, &(x[0]));
			 T6 = VZMULJ(T4, T5);
			 TH = LD(&(x[WS(rs, 5)]), ms, &(x[WS(rs, 1)]));
			 TI = VZMULJ(Tc, TH);
			 T7 = VSUB(T1, T6);
			 TM = VSUB(TI, TL);
			 T1D = VADD(T1, T6);
			 T1E = VADD(TI, TL);
			 T1F = VSUB(T1D, T1E);
			 T23 = VADD(T1D, T1E);
		    }
		    /* Remaining 16 inputs, loaded and twiddled in pairs
		       (k, k+10), then accumulated into the intermediate
		       sums/differences combined below. */
		    {
			 V Th, T1G, T14, T1O, T1b, T1R, To, T1J, Tw, T1N, TR, T1H, TX, T1K, TD;
			 V T1Q;
			 {
			      V Tb, Tg, Ta, Tf;
			      Ta = LD(&(x[WS(rs, 4)]), ms, &(x[0]));
			      Tb = VZMULJ(T9, Ta);
			      Tf = LD(&(x[WS(rs, 14)]), ms, &(x[0]));
			      Tg = VZMULJ(Te, Tf);
			      Th = VSUB(Tb, Tg);
			      T1G = VADD(Tb, Tg);
			 }
			 {
			      V T11, T13, T10, T12;
			      T10 = LD(&(x[WS(rs, 13)]), ms, &(x[WS(rs, 1)]));
			      T11 = VZMULJ(TZ, T10);
			      T12 = LD(&(x[WS(rs, 3)]), ms, &(x[WS(rs, 1)]));
			      T13 = VZMULJ(T8, T12);
			      T14 = VSUB(T11, T13);
			      T1O = VADD(T11, T13);
			 }
			 {
			      V T17, T1a, T16, T19;
			      T16 = LD(&(x[WS(rs, 17)]), ms, &(x[WS(rs, 1)]));
			      T17 = VZMULJ(T15, T16);
			      T19 = LD(&(x[WS(rs, 7)]), ms, &(x[WS(rs, 1)]));
			      T1a = VZMULJ(T18, T19);
			      T1b = VSUB(T17, T1a);
			      T1R = VADD(T17, T1a);
			 }
			 {
			      V Tk, Tn, Tj, Tm;
			      Tj = LD(&(x[WS(rs, 16)]), ms, &(x[0]));
			      Tk = VZMULJ(Ti, Tj);
			      Tm = LD(&(x[WS(rs, 6)]), ms, &(x[0]));
			      Tn = VZMULJ(Tl, Tm);
			      To = VSUB(Tk, Tn);
			      T1J = VADD(Tk, Tn);
			 }
			 {
			      V Ts, Tv, Tr, Tu;
			      Tr = LD(&(x[WS(rs, 8)]), ms, &(x[0]));
			      Ts = VZMULJ(Tq, Tr);
			      Tu = LD(&(x[WS(rs, 18)]), ms, &(x[0]));
			      Tv = VZMULJ(Tt, Tu);
			      Tw = VSUB(Ts, Tv);
			      T1N = VADD(Ts, Tv);
			 }
			 {
			      V TO, TQ, TN, TP;
			      TN = LD(&(x[WS(rs, 9)]), ms, &(x[WS(rs, 1)]));
			      TO = VZMULJ(T3, TN);
			      TP = LD(&(x[WS(rs, 19)]), ms, &(x[WS(rs, 1)]));
			      TQ = VZMULJ(Td, TP);
			      TR = VSUB(TO, TQ);
			      T1H = VADD(TO, TQ);
			 }
			 {
			      V TT, TW, TS, TV;
			      TS = LD(&(x[WS(rs, 1)]), ms, &(x[WS(rs, 1)]));
			      TT = VZMULJ(T2, TS);
			      TV = LD(&(x[WS(rs, 11)]), ms, &(x[WS(rs, 1)]));
			      TW = VZMULJ(TU, TV);
			      TX = VSUB(TT, TW);
			      T1K = VADD(TT, TW);
			 }
			 {
			      V Tz, TC, Ty, TB;
			      Ty = LD(&(x[WS(rs, 12)]), ms, &(x[0]));
			      Tz = VZMULJ(Tx, Ty);
			      TB = LD(&(x[WS(rs, 2)]), ms, &(x[0]));
			      TC = VZMULJ(TA, TB);
			      TD = VSUB(Tz, TC);
			      T1Q = VADD(Tz, TC);
			 }
			 T1i = VSUB(TX, TR);
			 T1p = VSUB(Th, To);
			 T1q = VSUB(Tw, TD);
			 T1j = VSUB(T1b, T14);
			 Tp = VADD(Th, To);
			 TE = VADD(Tw, TD);
			 TF = VADD(Tp, TE);
			 T27 = VADD(T1N, T1O);
			 T28 = VADD(T1Q, T1R);
			 T29 = VADD(T27, T28);
			 T1P = VSUB(T1N, T1O);
			 T1S = VSUB(T1Q, T1R);
			 T1T = VADD(T1P, T1S);
			 TY = VADD(TR, TX);
			 T1c = VADD(T14, T1b);
			 T1d = VADD(TY, T1c);
			 T24 = VADD(T1G, T1H);
			 T25 = VADD(T1J, T1K);
			 T26 = VADD(T24, T25);
			 T1I = VSUB(T1G, T1H);
			 T1L = VSUB(T1J, T1K);
			 T1M = VADD(T1I, T1L);
		    }
		    /* Outputs 5 and 15. */
		    TG = VADD(T7, TF);
		    T1e = VADD(TM, T1d);
		    ST(&(x[WS(rs, 5)]), VFNMSI(T1e, TG), ms, &(x[WS(rs, 1)]));
		    ST(&(x[WS(rs, 15)]), VFMAI(T1e, TG), ms, &(x[WS(rs, 1)]));
		    /* Outputs 0, 4, 8, 12, 16. */
		    {
			 V T2c, T2a, T2b, T2g, T2i, T2e, T2f, T2h, T2d;
			 T2c = VSUB(T26, T29);
			 T2a = VADD(T26, T29);
			 T2b = VFNMS(LDK(KP250000000), T2a, T23);
			 T2e = VSUB(T24, T25);
			 T2f = VSUB(T27, T28);
			 T2g = VMUL(LDK(KP951056516), VFMA(LDK(KP618033988), T2f, T2e));
			 T2i = VMUL(LDK(KP951056516), VFNMS(LDK(KP618033988), T2e, T2f));
			 ST(&(x[0]), VADD(T23, T2a), ms, &(x[0]));
			 T2h = VFNMS(LDK(KP559016994), T2c, T2b);
			 ST(&(x[WS(rs, 8)]), VFNMSI(T2i, T2h), ms, &(x[0]));
			 ST(&(x[WS(rs, 12)]), VFMAI(T2i, T2h), ms, &(x[0]));
			 T2d = VFMA(LDK(KP559016994), T2c, T2b);
			 ST(&(x[WS(rs, 4)]), VFMAI(T2g, T2d), ms, &(x[0]));
			 ST(&(x[WS(rs, 16)]), VFNMSI(T2g, T2d), ms, &(x[0]));
		    }
		    /* Outputs 2, 6, 10, 14, 18. */
		    {
			 V T1W, T1U, T1V, T20, T22, T1Y, T1Z, T21, T1X;
			 T1W = VSUB(T1M, T1T);
			 T1U = VADD(T1M, T1T);
			 T1V = VFNMS(LDK(KP250000000), T1U, T1F);
			 T1Y = VSUB(T1P, T1S);
			 T1Z = VSUB(T1I, T1L);
			 T20 = VMUL(LDK(KP951056516), VFNMS(LDK(KP618033988), T1Z, T1Y));
			 T22 = VMUL(LDK(KP951056516), VFMA(LDK(KP618033988), T1Y, T1Z));
			 ST(&(x[WS(rs, 10)]), VADD(T1F, T1U), ms, &(x[0]));
			 T21 = VFMA(LDK(KP559016994), T1W, T1V);
			 ST(&(x[WS(rs, 6)]), VFNMSI(T22, T21), ms, &(x[0]));
			 ST(&(x[WS(rs, 14)]), VFMAI(T22, T21), ms, &(x[0]));
			 T1X = VFNMS(LDK(KP559016994), T1W, T1V);
			 ST(&(x[WS(rs, 2)]), VFMAI(T20, T1X), ms, &(x[0]));
			 ST(&(x[WS(rs, 18)]), VFNMSI(T20, T1X), ms, &(x[0]));
		    }
		    /* Remaining odd outputs: 1, 3, 7, 9, 11, 13, 17, 19. */
		    {
			 V T1k, T1r, T1z, T1w, T1o, T1y, T1h, T1v;
			 T1k = VFMA(LDK(KP618033988), T1j, T1i);
			 T1r = VFMA(LDK(KP618033988), T1q, T1p);
			 T1z = VFNMS(LDK(KP618033988), T1p, T1q);
			 T1w = VFNMS(LDK(KP618033988), T1i, T1j);
			 {
			      V T1m, T1n, T1f, T1g;
			      T1m = VFNMS(LDK(KP250000000), T1d, TM);
			      T1n = VSUB(T1c, TY);
			      T1o = VFNMS(LDK(KP559016994), T1n, T1m);
			      T1y = VFMA(LDK(KP559016994), T1n, T1m);
			      T1f = VFNMS(LDK(KP250000000), TF, T7);
			      T1g = VSUB(Tp, TE);
			      T1h = VFMA(LDK(KP559016994), T1g, T1f);
			      T1v = VFNMS(LDK(KP559016994), T1g, T1f);
			 }
			 {
			      V T1l, T1s, T1B, T1C;
			      T1l = VFMA(LDK(KP951056516), T1k, T1h);
			      T1s = VFMA(LDK(KP951056516), T1r, T1o);
			      ST(&(x[WS(rs, 1)]), VFNMSI(T1s, T1l), ms, &(x[WS(rs, 1)]));
			      ST(&(x[WS(rs, 19)]), VFMAI(T1s, T1l), ms, &(x[WS(rs, 1)]));
			      T1B = VFMA(LDK(KP951056516), T1w, T1v);
			      T1C = VFMA(LDK(KP951056516), T1z, T1y);
			      ST(&(x[WS(rs, 13)]), VFNMSI(T1C, T1B), ms, &(x[WS(rs, 1)]));
			      ST(&(x[WS(rs, 7)]), VFMAI(T1C, T1B), ms, &(x[WS(rs, 1)]));
			 }
			 {
			      V T1t, T1u, T1x, T1A;
			      T1t = VFNMS(LDK(KP951056516), T1k, T1h);
			      T1u = VFNMS(LDK(KP951056516), T1r, T1o);
			      ST(&(x[WS(rs, 9)]), VFNMSI(T1u, T1t), ms, &(x[WS(rs, 1)]));
			      ST(&(x[WS(rs, 11)]), VFMAI(T1u, T1t), ms, &(x[WS(rs, 1)]));
			      T1x = VFNMS(LDK(KP951056516), T1w, T1v);
			      T1A = VFNMS(LDK(KP951056516), T1z, T1y);
			      ST(&(x[WS(rs, 17)]), VFNMSI(T1A, T1x), ms, &(x[WS(rs, 1)]));
			      ST(&(x[WS(rs, 3)]), VFMAI(T1A, T1x), ms, &(x[WS(rs, 1)]));
			 }
		    }
	       }
	  }
     }
     VLEAVE();
}
Chris@82 269
/* Twiddle-instruction table handed to the planner: precompute twiddle
   factors for indices 1, 3, 9 and 19 only (the -twiddle-log3
   -precompute-twiddles scheme); the codelet derives the remaining
   factors itself via VZMUL/VZMULJ at the top of its loop. */
static const tw_instr twinstr[] = {
     VTW(0, 1),
     VTW(0, 3),
     VTW(0, 9),
     VTW(0, 19),
     {TW_NEXT, VL, 0}
};
Chris@82 277
/* Codelet self-description: radix 20, public name, twiddle table, and the
   operation counts {adds, muls, fmas, other} matching the generator's
   comment above (92 additions, 72 multiplications, 46 FMAs). */
static const ct_desc desc = { 20, XSIMD_STRING("t3fv_20"), twinstr, &GENUS, {92, 72, 46, 0}, 0, 0, 0 };
Chris@82 279
/* Registration entry point: installs t3fv_20 with the planner as a
   DIT (decimation-in-time) twiddle codelet described by `desc`. */
void XSIMD(codelet_t3fv_20) (planner *p) {
     X(kdft_dit_register) (p, t3fv_20, &desc);
}
Chris@82 283 #else
Chris@82 284
Chris@82 285 /* Generated by: ../../../genfft/gen_twiddle_c.native -simd -compact -variables 4 -pipeline-latency 8 -twiddle-log3 -precompute-twiddles -no-generate-bytw -n 20 -name t3fv_20 -include dft/simd/t3f.h */
Chris@82 286
Chris@82 287 /*
Chris@82 288 * This function contains 138 FP additions, 92 FP multiplications,
Chris@82 289 * (or, 126 additions, 80 multiplications, 12 fused multiply/add),
Chris@82 290 * 73 stack variables, 4 constants, and 40 memory accesses
Chris@82 291 */
Chris@82 292 #include "dft/simd/t3f.h"
Chris@82 293
/*
 * 20-point DIT (decimation-in-time) twiddle codelet, non-FMA variant
 * (compiled when neither ARCH_PREFERS_FMA nor ISA_EXTENSION_PREFERS_FMA
 * is set).  Machine-generated by genfft -- do NOT hand-edit the dataflow.
 *
 * ri/ii: base pointers of the transform data; only ri is addressed here --
 *        presumably the LD/ST macros from dft/simd/t3f.h take care of the
 *        interleaved imaginary part (NOTE(review): confirm in t3f.h).
 * W:     twiddle-factor array; the loop advances it by TWVL * 8 per
 *        iteration.  Only four factors are stored per transform (see the
 *        -twiddle-log3 -precompute-twiddles generator flags); the rest are
 *        derived below via VZMUL/VZMULJ.
 * rs:    stride between the 20 samples of one transform.
 * mb/me: half-open range [mb, me) of transform indices handled here; each
 *        loop iteration processes VL transforms (SIMD vector width).
 * ms:    stride between successive transforms.
 */
static void t3fv_20(R *ri, R *ii, const R *W, stride rs, INT mb, INT me, INT ms)
{
     DVK(KP587785252, +0.587785252292473129168705954639072768597652438);
     DVK(KP951056516, +0.951056516295153572116439333379382143405698634);
     DVK(KP250000000, +0.250000000000000000000000000000000000000000000);
     DVK(KP559016994, +0.559016994374947424102293417182819058860154590);
     {
	  INT m;
	  R *x;
	  x = ri;
	  for (m = mb, W = W + (mb * ((TWVL / VL) * 8)); m < me; m = m + VL, x = x + (VL * ms), W = W + (TWVL * 8), MAKE_VOLATILE_STRIDE(20, rs)) {
	       V T2, T8, T9, TA, T3, Tc, T4, TZ, T18, Tl, Tq, Tx, TU, Td, Te;
	       V T15, Ti, Tt, TJ;
	       /* Derive the full twiddle set from the four stored factors
		  W[0], W[2*TWVL], W[4*TWVL], W[6*TWVL] via complex
		  multiply (VZMUL) and conjugate multiply (VZMULJ). */
	       T2 = LDW(&(W[0]));
	       T8 = LDW(&(W[TWVL * 2]));
	       T9 = VZMUL(T2, T8);
	       TA = VZMULJ(T2, T8);
	       T3 = LDW(&(W[TWVL * 4]));
	       Tc = VZMULJ(T9, T3);
	       T4 = VZMUL(T2, T3);
	       TZ = VZMUL(T9, T3);
	       T18 = VZMULJ(TA, T3);
	       Tl = VZMULJ(T8, T3);
	       Tq = VZMULJ(T2, T3);
	       Tx = VZMUL(T8, T3);
	       TU = VZMUL(TA, T3);
	       Td = LDW(&(W[TWVL * 6]));
	       Te = VZMULJ(Tc, Td);
	       T15 = VZMULJ(TA, Td);
	       Ti = VZMULJ(T8, Td);
	       Tt = VZMULJ(T2, Td);
	       TJ = VZMULJ(T9, Td);
	       {
		    V T7, TM, T1U, T2d, T1i, T1p, T1q, T1j, Tp, TE, TF, T26, T27, T2b, T1M;
		    V T1P, T1V, TY, T1c, T1d, T23, T24, T2a, T1F, T1I, T1W, TG, T1e;
		    /* Inputs 0, 5, 10, 15: load, twiddle, and form the
		       pairwise sums/differences used by every output. */
		    {
			 V T1, TL, T6, TI, TK, T5, TH, T1S, T1T;
			 T1 = LD(&(x[0]), ms, &(x[0]));
			 TK = LD(&(x[WS(rs, 15)]), ms, &(x[WS(rs, 1)]));
			 TL = VZMULJ(TJ, TK);
			 T5 = LD(&(x[WS(rs, 10)]), ms, &(x[0]));
			 T6 = VZMULJ(T4, T5);
			 TH = LD(&(x[WS(rs, 5)]), ms, &(x[WS(rs, 1)]));
			 TI = VZMULJ(Tc, TH);
			 T7 = VSUB(T1, T6);
			 TM = VSUB(TI, TL);
			 T1S = VADD(T1, T6);
			 T1T = VADD(TI, TL);
			 T1U = VSUB(T1S, T1T);
			 T2d = VADD(T1S, T1T);
		    }
		    /* Remaining 16 inputs, loaded and twiddled in pairs
		       (k, k+10), then accumulated into the intermediate
		       sums/differences combined below. */
		    {
			 V Th, T1K, T14, T1E, T1b, T1H, To, T1N, Tw, T1D, TR, T1L, TX, T1O, TD;
			 V T1G;
			 {
			      V Tb, Tg, Ta, Tf;
			      Ta = LD(&(x[WS(rs, 4)]), ms, &(x[0]));
			      Tb = VZMULJ(T9, Ta);
			      Tf = LD(&(x[WS(rs, 14)]), ms, &(x[0]));
			      Tg = VZMULJ(Te, Tf);
			      Th = VSUB(Tb, Tg);
			      T1K = VADD(Tb, Tg);
			 }
			 {
			      V T11, T13, T10, T12;
			      T10 = LD(&(x[WS(rs, 13)]), ms, &(x[WS(rs, 1)]));
			      T11 = VZMULJ(TZ, T10);
			      T12 = LD(&(x[WS(rs, 3)]), ms, &(x[WS(rs, 1)]));
			      T13 = VZMULJ(T8, T12);
			      T14 = VSUB(T11, T13);
			      T1E = VADD(T11, T13);
			 }
			 {
			      V T17, T1a, T16, T19;
			      T16 = LD(&(x[WS(rs, 17)]), ms, &(x[WS(rs, 1)]));
			      T17 = VZMULJ(T15, T16);
			      T19 = LD(&(x[WS(rs, 7)]), ms, &(x[WS(rs, 1)]));
			      T1a = VZMULJ(T18, T19);
			      T1b = VSUB(T17, T1a);
			      T1H = VADD(T17, T1a);
			 }
			 {
			      V Tk, Tn, Tj, Tm;
			      Tj = LD(&(x[WS(rs, 16)]), ms, &(x[0]));
			      Tk = VZMULJ(Ti, Tj);
			      Tm = LD(&(x[WS(rs, 6)]), ms, &(x[0]));
			      Tn = VZMULJ(Tl, Tm);
			      To = VSUB(Tk, Tn);
			      T1N = VADD(Tk, Tn);
			 }
			 {
			      V Ts, Tv, Tr, Tu;
			      Tr = LD(&(x[WS(rs, 8)]), ms, &(x[0]));
			      Ts = VZMULJ(Tq, Tr);
			      Tu = LD(&(x[WS(rs, 18)]), ms, &(x[0]));
			      Tv = VZMULJ(Tt, Tu);
			      Tw = VSUB(Ts, Tv);
			      T1D = VADD(Ts, Tv);
			 }
			 {
			      V TO, TQ, TN, TP;
			      TN = LD(&(x[WS(rs, 9)]), ms, &(x[WS(rs, 1)]));
			      TO = VZMULJ(T3, TN);
			      TP = LD(&(x[WS(rs, 19)]), ms, &(x[WS(rs, 1)]));
			      TQ = VZMULJ(Td, TP);
			      TR = VSUB(TO, TQ);
			      T1L = VADD(TO, TQ);
			 }
			 {
			      V TT, TW, TS, TV;
			      TS = LD(&(x[WS(rs, 1)]), ms, &(x[WS(rs, 1)]));
			      TT = VZMULJ(T2, TS);
			      TV = LD(&(x[WS(rs, 11)]), ms, &(x[WS(rs, 1)]));
			      TW = VZMULJ(TU, TV);
			      TX = VSUB(TT, TW);
			      T1O = VADD(TT, TW);
			 }
			 {
			      V Tz, TC, Ty, TB;
			      Ty = LD(&(x[WS(rs, 12)]), ms, &(x[0]));
			      Tz = VZMULJ(Tx, Ty);
			      TB = LD(&(x[WS(rs, 2)]), ms, &(x[0]));
			      TC = VZMULJ(TA, TB);
			      TD = VSUB(Tz, TC);
			      T1G = VADD(Tz, TC);
			 }
			 T1i = VSUB(TX, TR);
			 T1p = VSUB(Th, To);
			 T1q = VSUB(Tw, TD);
			 T1j = VSUB(T1b, T14);
			 Tp = VADD(Th, To);
			 TE = VADD(Tw, TD);
			 TF = VADD(Tp, TE);
			 T26 = VADD(T1D, T1E);
			 T27 = VADD(T1G, T1H);
			 T2b = VADD(T26, T27);
			 T1M = VSUB(T1K, T1L);
			 T1P = VSUB(T1N, T1O);
			 T1V = VADD(T1M, T1P);
			 TY = VADD(TR, TX);
			 T1c = VADD(T14, T1b);
			 T1d = VADD(TY, T1c);
			 T23 = VADD(T1K, T1L);
			 T24 = VADD(T1N, T1O);
			 T2a = VADD(T23, T24);
			 T1F = VSUB(T1D, T1E);
			 T1I = VSUB(T1G, T1H);
			 T1W = VADD(T1F, T1I);
		    }
		    /* Outputs 5 and 15. */
		    TG = VADD(T7, TF);
		    T1e = VBYI(VADD(TM, T1d));
		    ST(&(x[WS(rs, 5)]), VSUB(TG, T1e), ms, &(x[WS(rs, 1)]));
		    ST(&(x[WS(rs, 15)]), VADD(TG, T1e), ms, &(x[WS(rs, 1)]));
		    /* Outputs 0, 4, 8, 12, 16. */
		    {
			 V T2c, T2e, T2f, T29, T2i, T25, T28, T2h, T2g;
			 T2c = VMUL(LDK(KP559016994), VSUB(T2a, T2b));
			 T2e = VADD(T2a, T2b);
			 T2f = VFNMS(LDK(KP250000000), T2e, T2d);
			 T25 = VSUB(T23, T24);
			 T28 = VSUB(T26, T27);
			 T29 = VBYI(VFMA(LDK(KP951056516), T25, VMUL(LDK(KP587785252), T28)));
			 T2i = VBYI(VFNMS(LDK(KP587785252), T25, VMUL(LDK(KP951056516), T28)));
			 ST(&(x[0]), VADD(T2d, T2e), ms, &(x[0]));
			 T2h = VSUB(T2f, T2c);
			 ST(&(x[WS(rs, 8)]), VSUB(T2h, T2i), ms, &(x[0]));
			 ST(&(x[WS(rs, 12)]), VADD(T2i, T2h), ms, &(x[0]));
			 T2g = VADD(T2c, T2f);
			 ST(&(x[WS(rs, 4)]), VADD(T29, T2g), ms, &(x[0]));
			 ST(&(x[WS(rs, 16)]), VSUB(T2g, T29), ms, &(x[0]));
		    }
		    /* Outputs 2, 6, 10, 14, 18. */
		    {
			 V T1Z, T1X, T1Y, T1R, T22, T1J, T1Q, T21, T20;
			 T1Z = VMUL(LDK(KP559016994), VSUB(T1V, T1W));
			 T1X = VADD(T1V, T1W);
			 T1Y = VFNMS(LDK(KP250000000), T1X, T1U);
			 T1J = VSUB(T1F, T1I);
			 T1Q = VSUB(T1M, T1P);
			 T1R = VBYI(VFNMS(LDK(KP587785252), T1Q, VMUL(LDK(KP951056516), T1J)));
			 T22 = VBYI(VFMA(LDK(KP951056516), T1Q, VMUL(LDK(KP587785252), T1J)));
			 ST(&(x[WS(rs, 10)]), VADD(T1U, T1X), ms, &(x[0]));
			 T21 = VADD(T1Z, T1Y);
			 ST(&(x[WS(rs, 6)]), VSUB(T21, T22), ms, &(x[0]));
			 ST(&(x[WS(rs, 14)]), VADD(T22, T21), ms, &(x[0]));
			 T20 = VSUB(T1Y, T1Z);
			 ST(&(x[WS(rs, 2)]), VADD(T1R, T20), ms, &(x[0]));
			 ST(&(x[WS(rs, 18)]), VSUB(T20, T1R), ms, &(x[0]));
		    }
		    /* Remaining odd outputs: 1, 3, 7, 9, 11, 13, 17, 19. */
		    {
			 V T1k, T1r, T1z, T1w, T1o, T1y, T1h, T1v;
			 T1k = VFMA(LDK(KP951056516), T1i, VMUL(LDK(KP587785252), T1j));
			 T1r = VFMA(LDK(KP951056516), T1p, VMUL(LDK(KP587785252), T1q));
			 T1z = VFNMS(LDK(KP587785252), T1p, VMUL(LDK(KP951056516), T1q));
			 T1w = VFNMS(LDK(KP587785252), T1i, VMUL(LDK(KP951056516), T1j));
			 {
			      V T1m, T1n, T1f, T1g;
			      T1m = VFMS(LDK(KP250000000), T1d, TM);
			      T1n = VMUL(LDK(KP559016994), VSUB(T1c, TY));
			      T1o = VADD(T1m, T1n);
			      T1y = VSUB(T1n, T1m);
			      T1f = VMUL(LDK(KP559016994), VSUB(Tp, TE));
			      T1g = VFNMS(LDK(KP250000000), TF, T7);
			      T1h = VADD(T1f, T1g);
			      T1v = VSUB(T1g, T1f);
			 }
			 {
			      V T1l, T1s, T1B, T1C;
			      T1l = VADD(T1h, T1k);
			      T1s = VBYI(VSUB(T1o, T1r));
			      ST(&(x[WS(rs, 19)]), VSUB(T1l, T1s), ms, &(x[WS(rs, 1)]));
			      ST(&(x[WS(rs, 1)]), VADD(T1l, T1s), ms, &(x[WS(rs, 1)]));
			      T1B = VADD(T1v, T1w);
			      T1C = VBYI(VADD(T1z, T1y));
			      ST(&(x[WS(rs, 13)]), VSUB(T1B, T1C), ms, &(x[WS(rs, 1)]));
			      ST(&(x[WS(rs, 7)]), VADD(T1B, T1C), ms, &(x[WS(rs, 1)]));
			 }
			 {
			      V T1t, T1u, T1x, T1A;
			      T1t = VSUB(T1h, T1k);
			      T1u = VBYI(VADD(T1r, T1o));
			      ST(&(x[WS(rs, 11)]), VSUB(T1t, T1u), ms, &(x[WS(rs, 1)]));
			      ST(&(x[WS(rs, 9)]), VADD(T1t, T1u), ms, &(x[WS(rs, 1)]));
			      T1x = VSUB(T1v, T1w);
			      T1A = VBYI(VSUB(T1y, T1z));
			      ST(&(x[WS(rs, 17)]), VSUB(T1x, T1A), ms, &(x[WS(rs, 1)]));
			      ST(&(x[WS(rs, 3)]), VADD(T1x, T1A), ms, &(x[WS(rs, 1)]));
			 }
		    }
	       }
	  }
     }
     VLEAVE();
}
Chris@82 526
/* Twiddle-instruction table handed to the planner: precompute twiddle
   factors for indices 1, 3, 9 and 19 only (the -twiddle-log3
   -precompute-twiddles scheme); the codelet derives the remaining
   factors itself via VZMUL/VZMULJ at the top of its loop. */
static const tw_instr twinstr[] = {
     VTW(0, 1),
     VTW(0, 3),
     VTW(0, 9),
     VTW(0, 19),
     {TW_NEXT, VL, 0}
};
Chris@82 534
/* Codelet self-description: radix 20, public name, twiddle table, and the
   operation counts {adds, muls, fmas, other} matching the generator's
   comment above (126 additions, 80 multiplications, 12 FMAs). */
static const ct_desc desc = { 20, XSIMD_STRING("t3fv_20"), twinstr, &GENUS, {126, 80, 12, 0}, 0, 0, 0 };
Chris@82 536
/* Registration entry point: installs t3fv_20 with the planner as a
   DIT (decimation-in-time) twiddle codelet described by `desc`. */
void XSIMD(codelet_t3fv_20) (planner *p) {
     X(kdft_dit_register) (p, t3fv_20, &desc);
}
Chris@82 540 #endif