annotate src/fftw-3.3.8/dft/simd/common/t3bv_20.c @ 83:ae30d91d2ffe

Replace these with versions built using an older toolset (so as to avoid ABI incompatibilities when linking on Ubuntu 14.04 for packaging purposes)
author Chris Cannam
date Fri, 07 Feb 2020 11:51:13 +0000
parents d0c2a83c1364
children
rev   line source
Chris@82 1 /*
Chris@82 2 * Copyright (c) 2003, 2007-14 Matteo Frigo
Chris@82 3 * Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
Chris@82 4 *
Chris@82 5 * This program is free software; you can redistribute it and/or modify
Chris@82 6 * it under the terms of the GNU General Public License as published by
Chris@82 7 * the Free Software Foundation; either version 2 of the License, or
Chris@82 8 * (at your option) any later version.
Chris@82 9 *
Chris@82 10 * This program is distributed in the hope that it will be useful,
Chris@82 11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
Chris@82 12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
Chris@82 13 * GNU General Public License for more details.
Chris@82 14 *
Chris@82 15 * You should have received a copy of the GNU General Public License
Chris@82 16 * along with this program; if not, write to the Free Software
Chris@82 17 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
Chris@82 18 *
Chris@82 19 */
Chris@82 20
Chris@82 21 /* This file was automatically generated --- DO NOT EDIT */
Chris@82 22 /* Generated on Thu May 24 08:06:09 EDT 2018 */
Chris@82 23
Chris@82 24 #include "dft/codelet-dft.h"
Chris@82 25
Chris@82 26 #if defined(ARCH_PREFERS_FMA) || defined(ISA_EXTENSION_PREFERS_FMA)
Chris@82 27
Chris@82 28 /* Generated by: ../../../genfft/gen_twiddle_c.native -fma -simd -compact -variables 4 -pipeline-latency 8 -twiddle-log3 -precompute-twiddles -no-generate-bytw -n 20 -name t3bv_20 -include dft/simd/t3b.h -sign 1 */
Chris@82 29
Chris@82 30 /*
Chris@82 31 * This function contains 138 FP additions, 118 FP multiplications,
Chris@82 32 * (or, 92 additions, 72 multiplications, 46 fused multiply/add),
Chris@82 33 * 73 stack variables, 4 constants, and 40 memory accesses
Chris@82 34 */
Chris@82 35 #include "dft/simd/t3b.h"
Chris@82 36
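/*
 * The -twiddle-log3 -precompute-twiddles options noted above mean that only
 * four twiddle factors per transform are taken from W: the powers for
 * k = 1, 3, 9 and 19 listed in twinstr at the end of this branch, loaded
 * below as W[0], W[TWVL*2], W[TWVL*4] and W[TWVL*6].  The other rotations
 * needed by the size-20 butterflies are then rebuilt at the top of the
 * m-loop as VZMUL/VZMULJ products of these four (the assignments to T9,
 * TA, Tc, TV and friends).
 */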
Chris@82 37 static void t3bv_20(R *ri, R *ii, const R *W, stride rs, INT mb, INT me, INT ms)
Chris@82 38 {
Chris@82 39 DVK(KP559016994, +0.559016994374947424102293417182819058860154590);
Chris@82 40 DVK(KP618033988, +0.618033988749894848204586834365638117720309180);
Chris@82 41 DVK(KP951056516, +0.951056516295153572116439333379382143405698634);
Chris@82 42 DVK(KP250000000, +0.250000000000000000000000000000000000000000000);
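/*
 * Numerically, KP951056516 = sin(2*pi/5), KP618033988 = (sqrt(5)-1)/2,
 * KP559016994 = sqrt(5)/4 and KP250000000 = 1/4, i.e. the constants that
 * show up in the length-5 sub-transforms of this radix-20 step.
 */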
Chris@82 43 {
Chris@82 44 INT m;
Chris@82 45 R *x;
Chris@82 46 x = ii;
Chris@82 47 for (m = mb, W = W + (mb * ((TWVL / VL) * 8)); m < me; m = m + VL, x = x + (VL * ms), W = W + (TWVL * 8), MAKE_VOLATILE_STRIDE(20, rs)) {
Chris@82 48 V T2, T8, T9, TA, T3, Tc, T4, TV, T14, Tl, Tq, Tx, TQ, Td, Te;
Chris@82 49 V T1d, Ti, Tt, T11;
Chris@82 50 T2 = LDW(&(W[0]));
Chris@82 51 T8 = LDW(&(W[TWVL * 2]));
Chris@82 52 T9 = VZMUL(T2, T8);
Chris@82 53 TA = VZMULJ(T2, T8);
Chris@82 54 T3 = LDW(&(W[TWVL * 4]));
Chris@82 55 Tc = VZMULJ(T9, T3);
Chris@82 56 T4 = VZMUL(T2, T3);
Chris@82 57 TV = VZMUL(T9, T3);
Chris@82 58 T14 = VZMULJ(TA, T3);
Chris@82 59 Tl = VZMULJ(T8, T3);
Chris@82 60 Tq = VZMULJ(T2, T3);
Chris@82 61 Tx = VZMUL(T8, T3);
Chris@82 62 TQ = VZMUL(TA, T3);
Chris@82 63 Td = LDW(&(W[TWVL * 6]));
Chris@82 64 Te = VZMULJ(Tc, Td);
Chris@82 65 T1d = VZMULJ(T9, Td);
Chris@82 66 Ti = VZMULJ(T8, Td);
Chris@82 67 Tt = VZMULJ(T2, Td);
Chris@82 68 T11 = VZMULJ(TA, Td);
Chris@82 69 {
Chris@82 70 V T7, T1g, T1F, T23, TU, T1n, T1o, T18, Tp, TE, TF, T27, T28, T29, T1P;
Chris@82 71 V T1S, T1T, T1h, T1i, T1j, T24, T25, T26, T1I, T1L, T1M, T1B, T1C;
Chris@82 72 {
Chris@82 73 V T1, T1f, T6, T1c, T1e, T5, T1b, T1D, T1E;
Chris@82 74 T1 = LD(&(x[0]), ms, &(x[0]));
Chris@82 75 T1e = LD(&(x[WS(rs, 15)]), ms, &(x[WS(rs, 1)]));
Chris@82 76 T1f = VZMUL(T1d, T1e);
Chris@82 77 T5 = LD(&(x[WS(rs, 10)]), ms, &(x[0]));
Chris@82 78 T6 = VZMUL(T4, T5);
Chris@82 79 T1b = LD(&(x[WS(rs, 5)]), ms, &(x[WS(rs, 1)]));
Chris@82 80 T1c = VZMUL(Tc, T1b);
Chris@82 81 T7 = VSUB(T1, T6);
Chris@82 82 T1g = VSUB(T1c, T1f);
Chris@82 83 T1D = VADD(T1, T6);
Chris@82 84 T1E = VADD(T1c, T1f);
Chris@82 85 T1F = VSUB(T1D, T1E);
Chris@82 86 T23 = VADD(T1D, T1E);
Chris@82 87 }
Chris@82 88 {
Chris@82 89 V Th, T1G, T10, T1O, T17, T1R, To, T1J, Tw, T1N, TN, T1H, TT, T1K, TD;
Chris@82 90 V T1Q;
Chris@82 91 {
Chris@82 92 V Tb, Tg, Ta, Tf;
Chris@82 93 Ta = LD(&(x[WS(rs, 4)]), ms, &(x[0]));
Chris@82 94 Tb = VZMUL(T9, Ta);
Chris@82 95 Tf = LD(&(x[WS(rs, 14)]), ms, &(x[0]));
Chris@82 96 Tg = VZMUL(Te, Tf);
Chris@82 97 Th = VSUB(Tb, Tg);
Chris@82 98 T1G = VADD(Tb, Tg);
Chris@82 99 }
Chris@82 100 {
Chris@82 101 V TX, TZ, TW, TY;
Chris@82 102 TW = LD(&(x[WS(rs, 13)]), ms, &(x[WS(rs, 1)]));
Chris@82 103 TX = VZMUL(TV, TW);
Chris@82 104 TY = LD(&(x[WS(rs, 3)]), ms, &(x[WS(rs, 1)]));
Chris@82 105 TZ = VZMUL(T8, TY);
Chris@82 106 T10 = VSUB(TX, TZ);
Chris@82 107 T1O = VADD(TX, TZ);
Chris@82 108 }
Chris@82 109 {
Chris@82 110 V T13, T16, T12, T15;
Chris@82 111 T12 = LD(&(x[WS(rs, 17)]), ms, &(x[WS(rs, 1)]));
Chris@82 112 T13 = VZMUL(T11, T12);
Chris@82 113 T15 = LD(&(x[WS(rs, 7)]), ms, &(x[WS(rs, 1)]));
Chris@82 114 T16 = VZMUL(T14, T15);
Chris@82 115 T17 = VSUB(T13, T16);
Chris@82 116 T1R = VADD(T13, T16);
Chris@82 117 }
Chris@82 118 {
Chris@82 119 V Tk, Tn, Tj, Tm;
Chris@82 120 Tj = LD(&(x[WS(rs, 16)]), ms, &(x[0]));
Chris@82 121 Tk = VZMUL(Ti, Tj);
Chris@82 122 Tm = LD(&(x[WS(rs, 6)]), ms, &(x[0]));
Chris@82 123 Tn = VZMUL(Tl, Tm);
Chris@82 124 To = VSUB(Tk, Tn);
Chris@82 125 T1J = VADD(Tk, Tn);
Chris@82 126 }
Chris@82 127 {
Chris@82 128 V Ts, Tv, Tr, Tu;
Chris@82 129 Tr = LD(&(x[WS(rs, 8)]), ms, &(x[0]));
Chris@82 130 Ts = VZMUL(Tq, Tr);
Chris@82 131 Tu = LD(&(x[WS(rs, 18)]), ms, &(x[0]));
Chris@82 132 Tv = VZMUL(Tt, Tu);
Chris@82 133 Tw = VSUB(Ts, Tv);
Chris@82 134 T1N = VADD(Ts, Tv);
Chris@82 135 }
Chris@82 136 {
Chris@82 137 V TK, TM, TJ, TL;
Chris@82 138 TJ = LD(&(x[WS(rs, 9)]), ms, &(x[WS(rs, 1)]));
Chris@82 139 TK = VZMUL(T3, TJ);
Chris@82 140 TL = LD(&(x[WS(rs, 19)]), ms, &(x[WS(rs, 1)]));
Chris@82 141 TM = VZMUL(Td, TL);
Chris@82 142 TN = VSUB(TK, TM);
Chris@82 143 T1H = VADD(TK, TM);
Chris@82 144 }
Chris@82 145 {
Chris@82 146 V TP, TS, TO, TR;
Chris@82 147 TO = LD(&(x[WS(rs, 1)]), ms, &(x[WS(rs, 1)]));
Chris@82 148 TP = VZMUL(T2, TO);
Chris@82 149 TR = LD(&(x[WS(rs, 11)]), ms, &(x[WS(rs, 1)]));
Chris@82 150 TS = VZMUL(TQ, TR);
Chris@82 151 TT = VSUB(TP, TS);
Chris@82 152 T1K = VADD(TP, TS);
Chris@82 153 }
Chris@82 154 {
Chris@82 155 V Tz, TC, Ty, TB;
Chris@82 156 Ty = LD(&(x[WS(rs, 12)]), ms, &(x[0]));
Chris@82 157 Tz = VZMUL(Tx, Ty);
Chris@82 158 TB = LD(&(x[WS(rs, 2)]), ms, &(x[0]));
Chris@82 159 TC = VZMUL(TA, TB);
Chris@82 160 TD = VSUB(Tz, TC);
Chris@82 161 T1Q = VADD(Tz, TC);
Chris@82 162 }
Chris@82 163 TU = VSUB(TN, TT);
Chris@82 164 T1n = VSUB(Th, To);
Chris@82 165 T1o = VSUB(Tw, TD);
Chris@82 166 T18 = VSUB(T10, T17);
Chris@82 167 Tp = VADD(Th, To);
Chris@82 168 TE = VADD(Tw, TD);
Chris@82 169 TF = VADD(Tp, TE);
Chris@82 170 T27 = VADD(T1N, T1O);
Chris@82 171 T28 = VADD(T1Q, T1R);
Chris@82 172 T29 = VADD(T27, T28);
Chris@82 173 T1P = VSUB(T1N, T1O);
Chris@82 174 T1S = VSUB(T1Q, T1R);
Chris@82 175 T1T = VADD(T1P, T1S);
Chris@82 176 T1h = VADD(TN, TT);
Chris@82 177 T1i = VADD(T10, T17);
Chris@82 178 T1j = VADD(T1h, T1i);
Chris@82 179 T24 = VADD(T1G, T1H);
Chris@82 180 T25 = VADD(T1J, T1K);
Chris@82 181 T26 = VADD(T24, T25);
Chris@82 182 T1I = VSUB(T1G, T1H);
Chris@82 183 T1L = VSUB(T1J, T1K);
Chris@82 184 T1M = VADD(T1I, T1L);
Chris@82 185 }
Chris@82 186 T1B = VADD(T7, TF);
Chris@82 187 T1C = VADD(T1g, T1j);
Chris@82 188 ST(&(x[WS(rs, 15)]), VFNMSI(T1C, T1B), ms, &(x[WS(rs, 1)]));
Chris@82 189 ST(&(x[WS(rs, 5)]), VFMAI(T1C, T1B), ms, &(x[WS(rs, 1)]));
Chris@82 190 {
Chris@82 191 V T2c, T2a, T2b, T2g, T2i, T2e, T2f, T2h, T2d;
Chris@82 192 T2c = VSUB(T26, T29);
Chris@82 193 T2a = VADD(T26, T29);
Chris@82 194 T2b = VFNMS(LDK(KP250000000), T2a, T23);
Chris@82 195 T2e = VSUB(T24, T25);
Chris@82 196 T2f = VSUB(T27, T28);
Chris@82 197 T2g = VMUL(LDK(KP951056516), VFMA(LDK(KP618033988), T2f, T2e));
Chris@82 198 T2i = VMUL(LDK(KP951056516), VFNMS(LDK(KP618033988), T2e, T2f));
Chris@82 199 ST(&(x[0]), VADD(T23, T2a), ms, &(x[0]));
Chris@82 200 T2h = VFNMS(LDK(KP559016994), T2c, T2b);
Chris@82 201 ST(&(x[WS(rs, 8)]), VFMAI(T2i, T2h), ms, &(x[0]));
Chris@82 202 ST(&(x[WS(rs, 12)]), VFNMSI(T2i, T2h), ms, &(x[0]));
Chris@82 203 T2d = VFMA(LDK(KP559016994), T2c, T2b);
Chris@82 204 ST(&(x[WS(rs, 4)]), VFNMSI(T2g, T2d), ms, &(x[0]));
Chris@82 205 ST(&(x[WS(rs, 16)]), VFMAI(T2g, T2d), ms, &(x[0]));
Chris@82 206 }
Chris@82 207 {
Chris@82 208 V T1W, T1U, T1V, T20, T22, T1Y, T1Z, T21, T1X;
Chris@82 209 T1W = VSUB(T1M, T1T);
Chris@82 210 T1U = VADD(T1M, T1T);
Chris@82 211 T1V = VFNMS(LDK(KP250000000), T1U, T1F);
Chris@82 212 T1Y = VSUB(T1P, T1S);
Chris@82 213 T1Z = VSUB(T1I, T1L);
Chris@82 214 T20 = VMUL(LDK(KP951056516), VFNMS(LDK(KP618033988), T1Z, T1Y));
Chris@82 215 T22 = VMUL(LDK(KP951056516), VFMA(LDK(KP618033988), T1Y, T1Z));
Chris@82 216 ST(&(x[WS(rs, 10)]), VADD(T1F, T1U), ms, &(x[0]));
Chris@82 217 T21 = VFMA(LDK(KP559016994), T1W, T1V);
Chris@82 218 ST(&(x[WS(rs, 6)]), VFMAI(T22, T21), ms, &(x[0]));
Chris@82 219 ST(&(x[WS(rs, 14)]), VFNMSI(T22, T21), ms, &(x[0]));
Chris@82 220 T1X = VFNMS(LDK(KP559016994), T1W, T1V);
Chris@82 221 ST(&(x[WS(rs, 2)]), VFNMSI(T20, T1X), ms, &(x[0]));
Chris@82 222 ST(&(x[WS(rs, 18)]), VFMAI(T20, T1X), ms, &(x[0]));
Chris@82 223 }
Chris@82 224 {
Chris@82 225 V T19, T1p, T1x, T1u, T1m, T1w, TI, T1t;
Chris@82 226 T19 = VFMA(LDK(KP618033988), T18, TU);
Chris@82 227 T1p = VFMA(LDK(KP618033988), T1o, T1n);
Chris@82 228 T1x = VFNMS(LDK(KP618033988), T1n, T1o);
Chris@82 229 T1u = VFNMS(LDK(KP618033988), TU, T18);
Chris@82 230 {
Chris@82 231 V T1k, T1l, TG, TH;
Chris@82 232 T1k = VFNMS(LDK(KP250000000), T1j, T1g);
Chris@82 233 T1l = VSUB(T1h, T1i);
Chris@82 234 T1m = VFMA(LDK(KP559016994), T1l, T1k);
Chris@82 235 T1w = VFNMS(LDK(KP559016994), T1l, T1k);
Chris@82 236 TG = VFNMS(LDK(KP250000000), TF, T7);
Chris@82 237 TH = VSUB(Tp, TE);
Chris@82 238 TI = VFMA(LDK(KP559016994), TH, TG);
Chris@82 239 T1t = VFNMS(LDK(KP559016994), TH, TG);
Chris@82 240 }
Chris@82 241 {
Chris@82 242 V T1a, T1q, T1z, T1A;
Chris@82 243 T1a = VFNMS(LDK(KP951056516), T19, TI);
Chris@82 244 T1q = VFMA(LDK(KP951056516), T1p, T1m);
Chris@82 245 ST(&(x[WS(rs, 19)]), VFNMSI(T1q, T1a), ms, &(x[WS(rs, 1)]));
Chris@82 246 ST(&(x[WS(rs, 1)]), VFMAI(T1q, T1a), ms, &(x[WS(rs, 1)]));
Chris@82 247 T1z = VFNMS(LDK(KP951056516), T1u, T1t);
Chris@82 248 T1A = VFMA(LDK(KP951056516), T1x, T1w);
Chris@82 249 ST(&(x[WS(rs, 7)]), VFNMSI(T1A, T1z), ms, &(x[WS(rs, 1)]));
Chris@82 250 ST(&(x[WS(rs, 13)]), VFMAI(T1A, T1z), ms, &(x[WS(rs, 1)]));
Chris@82 251 }
Chris@82 252 {
Chris@82 253 V T1r, T1s, T1v, T1y;
Chris@82 254 T1r = VFMA(LDK(KP951056516), T19, TI);
Chris@82 255 T1s = VFNMS(LDK(KP951056516), T1p, T1m);
Chris@82 256 ST(&(x[WS(rs, 11)]), VFNMSI(T1s, T1r), ms, &(x[WS(rs, 1)]));
Chris@82 257 ST(&(x[WS(rs, 9)]), VFMAI(T1s, T1r), ms, &(x[WS(rs, 1)]));
Chris@82 258 T1v = VFMA(LDK(KP951056516), T1u, T1t);
Chris@82 259 T1y = VFNMS(LDK(KP951056516), T1x, T1w);
Chris@82 260 ST(&(x[WS(rs, 3)]), VFNMSI(T1y, T1v), ms, &(x[WS(rs, 1)]));
Chris@82 261 ST(&(x[WS(rs, 17)]), VFMAI(T1y, T1v), ms, &(x[WS(rs, 1)]));
Chris@82 262 }
Chris@82 263 }
Chris@82 264 }
Chris@82 265 }
Chris@82 266 }
Chris@82 267 VLEAVE();
Chris@82 268 }
Chris@82 269
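/*
 * The twiddle descriptor below asks the planner to precompute the powers
 * k = 1, 3, 9 and 19 for each transform, matching the four LDW() loads at
 * the top of the main loop; the trailing {TW_NEXT, VL, 0} entry presumably
 * steps the table on to the next batch of VL transforms.
 */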
Chris@82 270 static const tw_instr twinstr[] = {
Chris@82 271 VTW(0, 1),
Chris@82 272 VTW(0, 3),
Chris@82 273 VTW(0, 9),
Chris@82 274 VTW(0, 19),
Chris@82 275 {TW_NEXT, VL, 0}
Chris@82 276 };
Chris@82 277
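/*
 * The {92, 72, 46, 0} operation counts in the descriptor repeat the
 * addition/multiplication/FMA totals quoted in the header comment, and are
 * presumably what the planner's cost estimator consults for this codelet.
 */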
Chris@82 278 static const ct_desc desc = { 20, XSIMD_STRING("t3bv_20"), twinstr, &GENUS, {92, 72, 46, 0}, 0, 0, 0 };
Chris@82 279
Chris@82 280 void XSIMD(codelet_t3bv_20) (planner *p) {
Chris@82 281 X(kdft_dit_register) (p, t3bv_20, &desc);
Chris@82 282 }
Chris@82 283 #else
Chris@82 284
Chris@82 285 /* Generated by: ../../../genfft/gen_twiddle_c.native -simd -compact -variables 4 -pipeline-latency 8 -twiddle-log3 -precompute-twiddles -no-generate-bytw -n 20 -name t3bv_20 -include dft/simd/t3b.h -sign 1 */
Chris@82 286
Chris@82 287 /*
Chris@82 288 * This function contains 138 FP additions, 92 FP multiplications,
Chris@82 289 * (or, 126 additions, 80 multiplications, 12 fused multiply/add),
Chris@82 290 * 73 stack variables, 4 constants, and 40 memory accesses
Chris@82 291 */
Chris@82 292 #include "dft/simd/t3b.h"
Chris@82 293
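/*
 * This branch is compiled when neither ARCH_PREFERS_FMA nor
 * ISA_EXTENSION_PREFERS_FMA is defined.  It implements the same size-20
 * twiddle codelet with the same four precomputed twiddles, but with most
 * fused multiply-adds expanded into separate VMUL and add/subtract steps
 * (12 FMAs here versus 46 in the branch above).
 */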
Chris@82 294 static void t3bv_20(R *ri, R *ii, const R *W, stride rs, INT mb, INT me, INT ms)
Chris@82 295 {
Chris@82 296 DVK(KP587785252, +0.587785252292473129168705954639072768597652438);
Chris@82 297 DVK(KP951056516, +0.951056516295153572116439333379382143405698634);
Chris@82 298 DVK(KP250000000, +0.250000000000000000000000000000000000000000000);
Chris@82 299 DVK(KP559016994, +0.559016994374947424102293417182819058860154590);
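/*
 * As in the FMA branch: KP951056516 = sin(2*pi/5), KP587785252 = sin(pi/5),
 * KP559016994 = sqrt(5)/4 and KP250000000 = 1/4.
 */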
Chris@82 300 {
Chris@82 301 INT m;
Chris@82 302 R *x;
Chris@82 303 x = ii;
Chris@82 304 for (m = mb, W = W + (mb * ((TWVL / VL) * 8)); m < me; m = m + VL, x = x + (VL * ms), W = W + (TWVL * 8), MAKE_VOLATILE_STRIDE(20, rs)) {
Chris@82 305 V T2, T8, T9, TA, T3, Tc, T4, TV, T14, Tl, Tq, Tx, TQ, Td, Te;
Chris@82 306 V T1g, Ti, Tt, T11;
Chris@82 307 T2 = LDW(&(W[0]));
Chris@82 308 T8 = LDW(&(W[TWVL * 2]));
Chris@82 309 T9 = VZMUL(T2, T8);
Chris@82 310 TA = VZMULJ(T2, T8);
Chris@82 311 T3 = LDW(&(W[TWVL * 4]));
Chris@82 312 Tc = VZMULJ(T9, T3);
Chris@82 313 T4 = VZMUL(T2, T3);
Chris@82 314 TV = VZMUL(T9, T3);
Chris@82 315 T14 = VZMULJ(TA, T3);
Chris@82 316 Tl = VZMULJ(T8, T3);
Chris@82 317 Tq = VZMULJ(T2, T3);
Chris@82 318 Tx = VZMUL(T8, T3);
Chris@82 319 TQ = VZMUL(TA, T3);
Chris@82 320 Td = LDW(&(W[TWVL * 6]));
Chris@82 321 Te = VZMULJ(Tc, Td);
Chris@82 322 T1g = VZMULJ(T9, Td);
Chris@82 323 Ti = VZMULJ(T8, Td);
Chris@82 324 Tt = VZMULJ(T2, Td);
Chris@82 325 T11 = VZMULJ(TA, Td);
Chris@82 326 {
Chris@82 327 V T7, T1j, T1U, T2a, TU, T1n, T1o, T18, Tp, TE, TF, T26, T27, T28, T1M;
Chris@82 328 V T1P, T1W, T1b, T1c, T1k, T23, T24, T25, T1F, T1I, T1V, T1B, T1C;
Chris@82 329 {
Chris@82 330 V T1, T1i, T6, T1f, T1h, T5, T1e, T1S, T1T;
Chris@82 331 T1 = LD(&(x[0]), ms, &(x[0]));
Chris@82 332 T1h = LD(&(x[WS(rs, 15)]), ms, &(x[WS(rs, 1)]));
Chris@82 333 T1i = VZMUL(T1g, T1h);
Chris@82 334 T5 = LD(&(x[WS(rs, 10)]), ms, &(x[0]));
Chris@82 335 T6 = VZMUL(T4, T5);
Chris@82 336 T1e = LD(&(x[WS(rs, 5)]), ms, &(x[WS(rs, 1)]));
Chris@82 337 T1f = VZMUL(Tc, T1e);
Chris@82 338 T7 = VSUB(T1, T6);
Chris@82 339 T1j = VSUB(T1f, T1i);
Chris@82 340 T1S = VADD(T1, T6);
Chris@82 341 T1T = VADD(T1f, T1i);
Chris@82 342 T1U = VSUB(T1S, T1T);
Chris@82 343 T2a = VADD(T1S, T1T);
Chris@82 344 }
Chris@82 345 {
Chris@82 346 V Th, T1D, T10, T1L, T17, T1O, To, T1G, Tw, T1K, TN, T1E, TT, T1H, TD;
Chris@82 347 V T1N;
Chris@82 348 {
Chris@82 349 V Tb, Tg, Ta, Tf;
Chris@82 350 Ta = LD(&(x[WS(rs, 4)]), ms, &(x[0]));
Chris@82 351 Tb = VZMUL(T9, Ta);
Chris@82 352 Tf = LD(&(x[WS(rs, 14)]), ms, &(x[0]));
Chris@82 353 Tg = VZMUL(Te, Tf);
Chris@82 354 Th = VSUB(Tb, Tg);
Chris@82 355 T1D = VADD(Tb, Tg);
Chris@82 356 }
Chris@82 357 {
Chris@82 358 V TX, TZ, TW, TY;
Chris@82 359 TW = LD(&(x[WS(rs, 13)]), ms, &(x[WS(rs, 1)]));
Chris@82 360 TX = VZMUL(TV, TW);
Chris@82 361 TY = LD(&(x[WS(rs, 3)]), ms, &(x[WS(rs, 1)]));
Chris@82 362 TZ = VZMUL(T8, TY);
Chris@82 363 T10 = VSUB(TX, TZ);
Chris@82 364 T1L = VADD(TX, TZ);
Chris@82 365 }
Chris@82 366 {
Chris@82 367 V T13, T16, T12, T15;
Chris@82 368 T12 = LD(&(x[WS(rs, 17)]), ms, &(x[WS(rs, 1)]));
Chris@82 369 T13 = VZMUL(T11, T12);
Chris@82 370 T15 = LD(&(x[WS(rs, 7)]), ms, &(x[WS(rs, 1)]));
Chris@82 371 T16 = VZMUL(T14, T15);
Chris@82 372 T17 = VSUB(T13, T16);
Chris@82 373 T1O = VADD(T13, T16);
Chris@82 374 }
Chris@82 375 {
Chris@82 376 V Tk, Tn, Tj, Tm;
Chris@82 377 Tj = LD(&(x[WS(rs, 16)]), ms, &(x[0]));
Chris@82 378 Tk = VZMUL(Ti, Tj);
Chris@82 379 Tm = LD(&(x[WS(rs, 6)]), ms, &(x[0]));
Chris@82 380 Tn = VZMUL(Tl, Tm);
Chris@82 381 To = VSUB(Tk, Tn);
Chris@82 382 T1G = VADD(Tk, Tn);
Chris@82 383 }
Chris@82 384 {
Chris@82 385 V Ts, Tv, Tr, Tu;
Chris@82 386 Tr = LD(&(x[WS(rs, 8)]), ms, &(x[0]));
Chris@82 387 Ts = VZMUL(Tq, Tr);
Chris@82 388 Tu = LD(&(x[WS(rs, 18)]), ms, &(x[0]));
Chris@82 389 Tv = VZMUL(Tt, Tu);
Chris@82 390 Tw = VSUB(Ts, Tv);
Chris@82 391 T1K = VADD(Ts, Tv);
Chris@82 392 }
Chris@82 393 {
Chris@82 394 V TK, TM, TJ, TL;
Chris@82 395 TJ = LD(&(x[WS(rs, 9)]), ms, &(x[WS(rs, 1)]));
Chris@82 396 TK = VZMUL(T3, TJ);
Chris@82 397 TL = LD(&(x[WS(rs, 19)]), ms, &(x[WS(rs, 1)]));
Chris@82 398 TM = VZMUL(Td, TL);
Chris@82 399 TN = VSUB(TK, TM);
Chris@82 400 T1E = VADD(TK, TM);
Chris@82 401 }
Chris@82 402 {
Chris@82 403 V TP, TS, TO, TR;
Chris@82 404 TO = LD(&(x[WS(rs, 1)]), ms, &(x[WS(rs, 1)]));
Chris@82 405 TP = VZMUL(T2, TO);
Chris@82 406 TR = LD(&(x[WS(rs, 11)]), ms, &(x[WS(rs, 1)]));
Chris@82 407 TS = VZMUL(TQ, TR);
Chris@82 408 TT = VSUB(TP, TS);
Chris@82 409 T1H = VADD(TP, TS);
Chris@82 410 }
Chris@82 411 {
Chris@82 412 V Tz, TC, Ty, TB;
Chris@82 413 Ty = LD(&(x[WS(rs, 12)]), ms, &(x[0]));
Chris@82 414 Tz = VZMUL(Tx, Ty);
Chris@82 415 TB = LD(&(x[WS(rs, 2)]), ms, &(x[0]));
Chris@82 416 TC = VZMUL(TA, TB);
Chris@82 417 TD = VSUB(Tz, TC);
Chris@82 418 T1N = VADD(Tz, TC);
Chris@82 419 }
Chris@82 420 TU = VSUB(TN, TT);
Chris@82 421 T1n = VSUB(Th, To);
Chris@82 422 T1o = VSUB(Tw, TD);
Chris@82 423 T18 = VSUB(T10, T17);
Chris@82 424 Tp = VADD(Th, To);
Chris@82 425 TE = VADD(Tw, TD);
Chris@82 426 TF = VADD(Tp, TE);
Chris@82 427 T26 = VADD(T1K, T1L);
Chris@82 428 T27 = VADD(T1N, T1O);
Chris@82 429 T28 = VADD(T26, T27);
Chris@82 430 T1M = VSUB(T1K, T1L);
Chris@82 431 T1P = VSUB(T1N, T1O);
Chris@82 432 T1W = VADD(T1M, T1P);
Chris@82 433 T1b = VADD(TN, TT);
Chris@82 434 T1c = VADD(T10, T17);
Chris@82 435 T1k = VADD(T1b, T1c);
Chris@82 436 T23 = VADD(T1D, T1E);
Chris@82 437 T24 = VADD(T1G, T1H);
Chris@82 438 T25 = VADD(T23, T24);
Chris@82 439 T1F = VSUB(T1D, T1E);
Chris@82 440 T1I = VSUB(T1G, T1H);
Chris@82 441 T1V = VADD(T1F, T1I);
Chris@82 442 }
Chris@82 443 T1B = VADD(T7, TF);
Chris@82 444 T1C = VBYI(VADD(T1j, T1k));
Chris@82 445 ST(&(x[WS(rs, 15)]), VSUB(T1B, T1C), ms, &(x[WS(rs, 1)]));
Chris@82 446 ST(&(x[WS(rs, 5)]), VADD(T1B, T1C), ms, &(x[WS(rs, 1)]));
Chris@82 447 {
Chris@82 448 V T29, T2b, T2c, T2g, T2i, T2e, T2f, T2h, T2d;
Chris@82 449 T29 = VMUL(LDK(KP559016994), VSUB(T25, T28));
Chris@82 450 T2b = VADD(T25, T28);
Chris@82 451 T2c = VFNMS(LDK(KP250000000), T2b, T2a);
Chris@82 452 T2e = VSUB(T23, T24);
Chris@82 453 T2f = VSUB(T26, T27);
Chris@82 454 T2g = VBYI(VFMA(LDK(KP951056516), T2e, VMUL(LDK(KP587785252), T2f)));
Chris@82 455 T2i = VBYI(VFNMS(LDK(KP951056516), T2f, VMUL(LDK(KP587785252), T2e)));
Chris@82 456 ST(&(x[0]), VADD(T2a, T2b), ms, &(x[0]));
Chris@82 457 T2h = VSUB(T2c, T29);
Chris@82 458 ST(&(x[WS(rs, 8)]), VSUB(T2h, T2i), ms, &(x[0]));
Chris@82 459 ST(&(x[WS(rs, 12)]), VADD(T2i, T2h), ms, &(x[0]));
Chris@82 460 T2d = VADD(T29, T2c);
Chris@82 461 ST(&(x[WS(rs, 4)]), VSUB(T2d, T2g), ms, &(x[0]));
Chris@82 462 ST(&(x[WS(rs, 16)]), VADD(T2g, T2d), ms, &(x[0]));
Chris@82 463 }
Chris@82 464 {
Chris@82 465 V T1Z, T1X, T1Y, T1R, T21, T1J, T1Q, T22, T20;
Chris@82 466 T1Z = VMUL(LDK(KP559016994), VSUB(T1V, T1W));
Chris@82 467 T1X = VADD(T1V, T1W);
Chris@82 468 T1Y = VFNMS(LDK(KP250000000), T1X, T1U);
Chris@82 469 T1J = VSUB(T1F, T1I);
Chris@82 470 T1Q = VSUB(T1M, T1P);
Chris@82 471 T1R = VBYI(VFNMS(LDK(KP951056516), T1Q, VMUL(LDK(KP587785252), T1J)));
Chris@82 472 T21 = VBYI(VFMA(LDK(KP951056516), T1J, VMUL(LDK(KP587785252), T1Q)));
Chris@82 473 ST(&(x[WS(rs, 10)]), VADD(T1U, T1X), ms, &(x[0]));
Chris@82 474 T22 = VADD(T1Z, T1Y);
Chris@82 475 ST(&(x[WS(rs, 6)]), VADD(T21, T22), ms, &(x[0]));
Chris@82 476 ST(&(x[WS(rs, 14)]), VSUB(T22, T21), ms, &(x[0]));
Chris@82 477 T20 = VSUB(T1Y, T1Z);
Chris@82 478 ST(&(x[WS(rs, 2)]), VADD(T1R, T20), ms, &(x[0]));
Chris@82 479 ST(&(x[WS(rs, 18)]), VSUB(T20, T1R), ms, &(x[0]));
Chris@82 480 }
Chris@82 481 {
Chris@82 482 V T19, T1p, T1w, T1u, T1m, T1x, TI, T1t;
Chris@82 483 T19 = VFNMS(LDK(KP951056516), T18, VMUL(LDK(KP587785252), TU));
Chris@82 484 T1p = VFNMS(LDK(KP951056516), T1o, VMUL(LDK(KP587785252), T1n));
Chris@82 485 T1w = VFMA(LDK(KP951056516), T1n, VMUL(LDK(KP587785252), T1o));
Chris@82 486 T1u = VFMA(LDK(KP951056516), TU, VMUL(LDK(KP587785252), T18));
Chris@82 487 {
Chris@82 488 V T1d, T1l, TG, TH;
Chris@82 489 T1d = VMUL(LDK(KP559016994), VSUB(T1b, T1c));
Chris@82 490 T1l = VFNMS(LDK(KP250000000), T1k, T1j);
Chris@82 491 T1m = VSUB(T1d, T1l);
Chris@82 492 T1x = VADD(T1d, T1l);
Chris@82 493 TG = VFNMS(LDK(KP250000000), TF, T7);
Chris@82 494 TH = VMUL(LDK(KP559016994), VSUB(Tp, TE));
Chris@82 495 TI = VSUB(TG, TH);
Chris@82 496 T1t = VADD(TH, TG);
Chris@82 497 }
Chris@82 498 {
Chris@82 499 V T1a, T1q, T1z, T1A;
Chris@82 500 T1a = VSUB(TI, T19);
Chris@82 501 T1q = VBYI(VSUB(T1m, T1p));
Chris@82 502 ST(&(x[WS(rs, 17)]), VSUB(T1a, T1q), ms, &(x[WS(rs, 1)]));
Chris@82 503 ST(&(x[WS(rs, 3)]), VADD(T1a, T1q), ms, &(x[WS(rs, 1)]));
Chris@82 504 T1z = VADD(T1t, T1u);
Chris@82 505 T1A = VBYI(VSUB(T1x, T1w));
Chris@82 506 ST(&(x[WS(rs, 11)]), VSUB(T1z, T1A), ms, &(x[WS(rs, 1)]));
Chris@82 507 ST(&(x[WS(rs, 9)]), VADD(T1z, T1A), ms, &(x[WS(rs, 1)]));
Chris@82 508 }
Chris@82 509 {
Chris@82 510 V T1r, T1s, T1v, T1y;
Chris@82 511 T1r = VADD(TI, T19);
Chris@82 512 T1s = VBYI(VADD(T1p, T1m));
Chris@82 513 ST(&(x[WS(rs, 13)]), VSUB(T1r, T1s), ms, &(x[WS(rs, 1)]));
Chris@82 514 ST(&(x[WS(rs, 7)]), VADD(T1r, T1s), ms, &(x[WS(rs, 1)]));
Chris@82 515 T1v = VSUB(T1t, T1u);
Chris@82 516 T1y = VBYI(VADD(T1w, T1x));
Chris@82 517 ST(&(x[WS(rs, 19)]), VSUB(T1v, T1y), ms, &(x[WS(rs, 1)]));
Chris@82 518 ST(&(x[WS(rs, 1)]), VADD(T1v, T1y), ms, &(x[WS(rs, 1)]));
Chris@82 519 }
Chris@82 520 }
Chris@82 521 }
Chris@82 522 }
Chris@82 523 }
Chris@82 524 VLEAVE();
Chris@82 525 }
Chris@82 526
Chris@82 527 static const tw_instr twinstr[] = {
Chris@82 528 VTW(0, 1),
Chris@82 529 VTW(0, 3),
Chris@82 530 VTW(0, 9),
Chris@82 531 VTW(0, 19),
Chris@82 532 {TW_NEXT, VL, 0}
Chris@82 533 };
Chris@82 534
Chris@82 535 static const ct_desc desc = { 20, XSIMD_STRING("t3bv_20"), twinstr, &GENUS, {126, 80, 12, 0}, 0, 0, 0 };
Chris@82 536
Chris@82 537 void XSIMD(codelet_t3bv_20) (planner *p) {
Chris@82 538 X(kdft_dit_register) (p, t3bv_20, &desc);
Chris@82 539 }
Chris@82 540 #endif