src/fftw-3.3.3/dft/simd/common/t3fv_20.c @ 83:ae30d91d2ffe

Replace these with versions built using an older toolset (so as to avoid ABI incompatibilities when linking on Ubuntu 14.04 for packaging purposes)
author Chris Cannam
date Fri, 07 Feb 2020 11:51:13 +0000
parents 37bf6b4a2645
/*
 * Copyright (c) 2003, 2007-11 Matteo Frigo
 * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 *
 */

/* This file was automatically generated --- DO NOT EDIT */
/* Generated on Sun Nov 25 07:38:55 EST 2012 */

#include "codelet-dft.h"

#ifdef HAVE_FMA

/* Generated by: ../../../genfft/gen_twiddle_c.native -fma -reorder-insns -schedule-for-pipeline -simd -compact -variables 4 -pipeline-latency 8 -twiddle-log3 -precompute-twiddles -no-generate-bytw -n 20 -name t3fv_20 -include t3f.h */

/*
 * This function contains 138 FP additions, 118 FP multiplications,
 * (or, 92 additions, 72 multiplications, 46 fused multiply/add),
 * 90 stack variables, 4 constants, and 40 memory accesses
 */
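/*
 * The codelet below computes 20-point forward DFTs as a decimation-in-time
 * twiddle step: each pass of the loop loads 20 complex SIMD vectors from x,
 * multiplies all but x[0] by conjugated twiddle factors derived from W
 * (VZMULJ), applies the size-20 transform, and stores the results in place.
 * Each iteration handles VL transforms at once, advancing x by VL * ms and
 * W by TWVL * 8 (presumably two TWVL units for each of the four stored
 * twiddle factors).
 */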
#include "t3f.h"

static void t3fv_20(R *ri, R *ii, const R *W, stride rs, INT mb, INT me, INT ms)
{
     DVK(KP559016994, +0.559016994374947424102293417182819058860154590);
     DVK(KP951056516, +0.951056516295153572116439333379382143405698634);
     DVK(KP618033988, +0.618033988749894848204586834365638117720309180);
     DVK(KP250000000, +0.250000000000000000000000000000000000000000000);
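     /* These four constants are the standard radix-5 values, presumably:
      * KP559016994 = sqrt(5)/4, KP951056516 = sin(2*pi/5), KP618033988 =
      * sin(pi/5)/sin(2*pi/5) = (sqrt(5)-1)/2, and KP250000000 = 1/4. */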
     {
          INT m;
          R *x;
          x = ri;
          for (m = mb, W = W + (mb * ((TWVL / VL) * 8)); m < me; m = m + VL, x = x + (VL * ms), W = W + (TWVL * 8), MAKE_VOLATILE_STRIDE(20, rs)) {
               V T1k, T1w, T1r, T1z, T1o, T1y, T1v, T1h;
               {
                    V T2, T8, T3, Td;
                    T2 = LDW(&(W[0]));
                    T8 = LDW(&(W[TWVL * 2]));
                    T3 = LDW(&(W[TWVL * 4]));
                    Td = LDW(&(W[TWVL * 6]));
                    {
                         V T7, TM, T1F, T23, T1p, Tp, T1j, T27, T1P, T1I, T1i, T1L, T28, T1S, T1q;
                         V TE, T1n, T1d, T26, T2e;
                         {
                              V T1, TK, T5, TH;
                              T1 = LD(&(x[0]), ms, &(x[0]));
                              TK = LD(&(x[WS(rs, 15)]), ms, &(x[WS(rs, 1)]));
                              T5 = LD(&(x[WS(rs, 10)]), ms, &(x[0]));
                              TH = LD(&(x[WS(rs, 5)]), ms, &(x[WS(rs, 1)]));
                              {
                                   V TA, Tx, TU, T1O, T14, Th, T1G, T1R, T1b, T1J, To, Ts, TV, Tv, TO;
                                   V TQ, TT, Ty, TB;
                                   {
                                        V Tq, Tt, T17, T1a, Tk, Tn;
                                        {
                                             V Tl, Ti, T15, T18, TZ, Tc, T6, Tb, Tf, T10, T12, TL;
                                             {
                                                  V TJ, Ta, T9, T4;
                                                  Ta = LD(&(x[WS(rs, 4)]), ms, &(x[0]));
                                                  TA = VZMULJ(T2, T8);
                                                  T9 = VZMUL(T2, T8);
                                                  Tx = VZMUL(T8, T3);
                                                  Tl = VZMULJ(T8, T3);
                                                  T4 = VZMUL(T2, T3);
                                                  Tq = VZMULJ(T2, T3);
                                                  Tt = VZMULJ(T2, Td);
                                                  Ti = VZMULJ(T8, Td);
                                                  T15 = VZMULJ(TA, Td);
                                                  T18 = VZMULJ(TA, T3);
                                                  TU = VZMUL(TA, T3);
                                                  TJ = VZMULJ(T9, Td);
                                                  TZ = VZMUL(T9, T3);
                                                  Tc = VZMULJ(T9, T3);
                                                  T6 = VZMULJ(T4, T5);
                                                  Tb = VZMULJ(T9, Ta);
                                                  Tf = LD(&(x[WS(rs, 14)]), ms, &(x[0]));
                                                  T10 = LD(&(x[WS(rs, 13)]), ms, &(x[WS(rs, 1)]));
                                                  T12 = LD(&(x[WS(rs, 3)]), ms, &(x[WS(rs, 1)]));
                                                  TL = VZMULJ(TJ, TK);
                                             }
                                             {
                                                  V T1D, T11, T13, T19, T1E, Tg, T16, TI, Te, Tj, Tm;
                                                  T16 = LD(&(x[WS(rs, 17)]), ms, &(x[WS(rs, 1)]));
                                                  TI = VZMULJ(Tc, TH);
                                                  Te = VZMULJ(Tc, Td);
                                                  T7 = VSUB(T1, T6);
                                                  T1D = VADD(T1, T6);
                                                  T11 = VZMULJ(TZ, T10);
                                                  T13 = VZMULJ(T8, T12);
                                                  T19 = LD(&(x[WS(rs, 7)]), ms, &(x[WS(rs, 1)]));
                                                  T17 = VZMULJ(T15, T16);
                                                  TM = VSUB(TI, TL);
                                                  T1E = VADD(TI, TL);
                                                  Tg = VZMULJ(Te, Tf);
                                                  Tj = LD(&(x[WS(rs, 16)]), ms, &(x[0]));
                                                  Tm = LD(&(x[WS(rs, 6)]), ms, &(x[0]));
                                                  T1O = VADD(T11, T13);
                                                  T14 = VSUB(T11, T13);
                                                  T1a = VZMULJ(T18, T19);
                                                  T1F = VSUB(T1D, T1E);
                                                  T23 = VADD(T1D, T1E);
                                                  Th = VSUB(Tb, Tg);
                                                  T1G = VADD(Tb, Tg);
                                                  Tk = VZMULJ(Ti, Tj);
                                                  Tn = VZMULJ(Tl, Tm);
                                             }
                                        }
                                        {
                                             V Tr, Tu, TN, TP, TS;
                                             Tr = LD(&(x[WS(rs, 8)]), ms, &(x[0]));
                                             T1R = VADD(T17, T1a);
                                             T1b = VSUB(T17, T1a);
                                             Tu = LD(&(x[WS(rs, 18)]), ms, &(x[0]));
                                             TN = LD(&(x[WS(rs, 9)]), ms, &(x[WS(rs, 1)]));
                                             TP = LD(&(x[WS(rs, 19)]), ms, &(x[WS(rs, 1)]));
                                             TS = LD(&(x[WS(rs, 1)]), ms, &(x[WS(rs, 1)]));
                                             T1J = VADD(Tk, Tn);
                                             To = VSUB(Tk, Tn);
                                             Ts = VZMULJ(Tq, Tr);
                                             TV = LD(&(x[WS(rs, 11)]), ms, &(x[WS(rs, 1)]));
                                             Tv = VZMULJ(Tt, Tu);
                                             TO = VZMULJ(T3, TN);
                                             TQ = VZMULJ(Td, TP);
                                             TT = VZMULJ(T2, TS);
                                             Ty = LD(&(x[WS(rs, 12)]), ms, &(x[0]));
                                             TB = LD(&(x[WS(rs, 2)]), ms, &(x[0]));
                                        }
                                   }
                                   {
                                        V T1N, Tw, T1H, TR, Tz, TC, T1c, TX, T1K, TW;
                                        T1p = VSUB(Th, To);
                                        Tp = VADD(Th, To);
                                        TW = VZMULJ(TU, TV);
                                        T1N = VADD(Ts, Tv);
                                        Tw = VSUB(Ts, Tv);
                                        T1H = VADD(TO, TQ);
                                        TR = VSUB(TO, TQ);
                                        Tz = VZMULJ(Tx, Ty);
                                        TC = VZMULJ(TA, TB);
                                        T1j = VSUB(T1b, T14);
                                        T1c = VADD(T14, T1b);
                                        TX = VSUB(TT, TW);
                                        T1K = VADD(TT, TW);
                                        T27 = VADD(T1N, T1O);
                                        T1P = VSUB(T1N, T1O);
                                        {
                                             V TD, T1Q, T24, TY, T25;
                                             TD = VSUB(Tz, TC);
                                             T1Q = VADD(Tz, TC);
                                             T1I = VSUB(T1G, T1H);
                                             T24 = VADD(T1G, T1H);
                                             TY = VADD(TR, TX);
                                             T1i = VSUB(TX, TR);
                                             T25 = VADD(T1J, T1K);
                                             T1L = VSUB(T1J, T1K);
                                             T28 = VADD(T1Q, T1R);
                                             T1S = VSUB(T1Q, T1R);
                                             T1q = VSUB(Tw, TD);
                                             TE = VADD(Tw, TD);
                                             T1n = VSUB(T1c, TY);
                                             T1d = VADD(TY, T1c);
                                             T26 = VADD(T24, T25);
                                             T2e = VSUB(T24, T25);
                                        }
                                   }
                              }
                         }
                         {
                              V T1M, T1Z, T1Y, T1T, T29, T2f, T1g, TF, T1m, T1e;
                              T1M = VADD(T1I, T1L);
                              T1Z = VSUB(T1I, T1L);
                              T1Y = VSUB(T1P, T1S);
                              T1T = VADD(T1P, T1S);
                              T29 = VADD(T27, T28);
                              T2f = VSUB(T27, T28);
                              T1g = VSUB(Tp, TE);
                              TF = VADD(Tp, TE);
                              T1m = VFNMS(LDK(KP250000000), T1d, TM);
                              T1e = VADD(TM, T1d);
                              {
                                   V T1W, T2c, T1f, T2i, T2g, T22, T20, T1V, T2b, T1U, T2a, TG;
                                   T1k = VFMA(LDK(KP618033988), T1j, T1i);
                                   T1w = VFNMS(LDK(KP618033988), T1i, T1j);
                                   T1W = VSUB(T1M, T1T);
                                   T1U = VADD(T1M, T1T);
                                   T2c = VSUB(T26, T29);
                                   T2a = VADD(T26, T29);
                                   T1f = VFNMS(LDK(KP250000000), TF, T7);
                                   TG = VADD(T7, TF);
                                   T2i = VMUL(LDK(KP951056516), VFNMS(LDK(KP618033988), T2e, T2f));
                                   T2g = VMUL(LDK(KP951056516), VFMA(LDK(KP618033988), T2f, T2e));
                                   T22 = VMUL(LDK(KP951056516), VFMA(LDK(KP618033988), T1Y, T1Z));
                                   T20 = VMUL(LDK(KP951056516), VFNMS(LDK(KP618033988), T1Z, T1Y));
                                   ST(&(x[WS(rs, 10)]), VADD(T1F, T1U), ms, &(x[0]));
                                   T1V = VFNMS(LDK(KP250000000), T1U, T1F);
                                   ST(&(x[0]), VADD(T23, T2a), ms, &(x[0]));
                                   T2b = VFNMS(LDK(KP250000000), T2a, T23);
                                   ST(&(x[WS(rs, 15)]), VFMAI(T1e, TG), ms, &(x[WS(rs, 1)]));
                                   ST(&(x[WS(rs, 5)]), VFNMSI(T1e, TG), ms, &(x[WS(rs, 1)]));
                                   T1r = VFMA(LDK(KP618033988), T1q, T1p);
                                   T1z = VFNMS(LDK(KP618033988), T1p, T1q);
                                   {
                                        V T21, T1X, T2h, T2d;
                                        T21 = VFMA(LDK(KP559016994), T1W, T1V);
                                        T1X = VFNMS(LDK(KP559016994), T1W, T1V);
                                        T2h = VFNMS(LDK(KP559016994), T2c, T2b);
                                        T2d = VFMA(LDK(KP559016994), T2c, T2b);
                                        ST(&(x[WS(rs, 18)]), VFNMSI(T20, T1X), ms, &(x[0]));
                                        ST(&(x[WS(rs, 2)]), VFMAI(T20, T1X), ms, &(x[0]));
                                        ST(&(x[WS(rs, 14)]), VFMAI(T22, T21), ms, &(x[0]));
                                        ST(&(x[WS(rs, 6)]), VFNMSI(T22, T21), ms, &(x[0]));
                                        ST(&(x[WS(rs, 16)]), VFNMSI(T2g, T2d), ms, &(x[0]));
                                        ST(&(x[WS(rs, 4)]), VFMAI(T2g, T2d), ms, &(x[0]));
                                        ST(&(x[WS(rs, 12)]), VFMAI(T2i, T2h), ms, &(x[0]));
                                        ST(&(x[WS(rs, 8)]), VFNMSI(T2i, T2h), ms, &(x[0]));
                                        T1o = VFNMS(LDK(KP559016994), T1n, T1m);
                                        T1y = VFMA(LDK(KP559016994), T1n, T1m);
                                        T1v = VFNMS(LDK(KP559016994), T1g, T1f);
                                        T1h = VFMA(LDK(KP559016994), T1g, T1f);
                                   }
                              }
                         }
                    }
               }
               {
                    V T1C, T1A, T1s, T1u, T1l, T1t, T1B, T1x;
                    T1C = VFMA(LDK(KP951056516), T1z, T1y);
                    T1A = VFNMS(LDK(KP951056516), T1z, T1y);
                    T1s = VFMA(LDK(KP951056516), T1r, T1o);
                    T1u = VFNMS(LDK(KP951056516), T1r, T1o);
                    T1l = VFMA(LDK(KP951056516), T1k, T1h);
                    T1t = VFNMS(LDK(KP951056516), T1k, T1h);
                    T1B = VFMA(LDK(KP951056516), T1w, T1v);
                    T1x = VFNMS(LDK(KP951056516), T1w, T1v);
                    ST(&(x[WS(rs, 11)]), VFMAI(T1u, T1t), ms, &(x[WS(rs, 1)]));
                    ST(&(x[WS(rs, 9)]), VFNMSI(T1u, T1t), ms, &(x[WS(rs, 1)]));
                    ST(&(x[WS(rs, 19)]), VFMAI(T1s, T1l), ms, &(x[WS(rs, 1)]));
                    ST(&(x[WS(rs, 1)]), VFNMSI(T1s, T1l), ms, &(x[WS(rs, 1)]));
                    ST(&(x[WS(rs, 3)]), VFMAI(T1A, T1x), ms, &(x[WS(rs, 1)]));
                    ST(&(x[WS(rs, 17)]), VFNMSI(T1A, T1x), ms, &(x[WS(rs, 1)]));
                    ST(&(x[WS(rs, 7)]), VFMAI(T1C, T1B), ms, &(x[WS(rs, 1)]));
                    ST(&(x[WS(rs, 13)]), VFNMSI(T1C, T1B), ms, &(x[WS(rs, 1)]));
               }
          }
     }
     VLEAVE();
}

static const tw_instr twinstr[] = {
     VTW(0, 1),
     VTW(0, 3),
     VTW(0, 9),
     VTW(0, 19),
     {TW_NEXT, VL, 0}
};
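
/*
 * Only four twiddle powers per transform are stored in W (w^1, w^3, w^9
 * and w^19, per the VTW entries above), presumably the effect of the
 * -twiddle-log3 -precompute-twiddles flags in the genfft command line;
 * every other power of w needed by the radix-20 pass is reconstructed
 * inside the codelet from products of these four.
 */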

static const ct_desc desc = { 20, XSIMD_STRING("t3fv_20"), twinstr, &GENUS, {92, 72, 46, 0}, 0, 0, 0 };

void XSIMD(codelet_t3fv_20) (planner *p) {
     X(kdft_dit_register) (p, t3fv_20, &desc);
}
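
/*
 * X(kdft_dit_register) presumably adds this codelet to the planner's table
 * of decimation-in-time steps of radix 20; the {92, 72, 46, 0} field of
 * desc repeats the operation counts from the header comment, which the
 * planner can weigh when costing candidate plans.
 */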
#else /* HAVE_FMA */

/* Generated by: ../../../genfft/gen_twiddle_c.native -simd -compact -variables 4 -pipeline-latency 8 -twiddle-log3 -precompute-twiddles -no-generate-bytw -n 20 -name t3fv_20 -include t3f.h */

/*
 * This function contains 138 FP additions, 92 FP multiplications,
 * (or, 126 additions, 80 multiplications, 12 fused multiply/add),
 * 73 stack variables, 4 constants, and 40 memory accesses
 */
#include "t3f.h"

static void t3fv_20(R *ri, R *ii, const R *W, stride rs, INT mb, INT me, INT ms)
{
     DVK(KP587785252, +0.587785252292473129168705954639072768597652438);
     DVK(KP951056516, +0.951056516295153572116439333379382143405698634);
     DVK(KP250000000, +0.250000000000000000000000000000000000000000000);
     DVK(KP559016994, +0.559016994374947424102293417182819058860154590);
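     /* The same radix-5 constants as in the FMA branch, except that the
      * non-FMA code presumably uses sin(pi/5) directly (KP587785252)
      * rather than folding it into the FMA-friendly ratio KP618033988 =
      * sin(pi/5)/sin(2*pi/5). */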
     {
          INT m;
          R *x;
          x = ri;
          for (m = mb, W = W + (mb * ((TWVL / VL) * 8)); m < me; m = m + VL, x = x + (VL * ms), W = W + (TWVL * 8), MAKE_VOLATILE_STRIDE(20, rs)) {
               V T2, T8, T9, TA, T3, Tc, T4, TZ, T18, Tl, Tq, Tx, TU, Td, Te;
               V T15, Ti, Tt, TJ;
               T2 = LDW(&(W[0]));
               T8 = LDW(&(W[TWVL * 2]));
               T9 = VZMUL(T2, T8);
               TA = VZMULJ(T2, T8);
               T3 = LDW(&(W[TWVL * 4]));
               Tc = VZMULJ(T9, T3);
               T4 = VZMUL(T2, T3);
               TZ = VZMUL(T9, T3);
               T18 = VZMULJ(TA, T3);
               Tl = VZMULJ(T8, T3);
               Tq = VZMULJ(T2, T3);
               Tx = VZMUL(T8, T3);
               TU = VZMUL(TA, T3);
               Td = LDW(&(W[TWVL * 6]));
               Te = VZMULJ(Tc, Td);
               T15 = VZMULJ(TA, Td);
               Ti = VZMULJ(T8, Td);
               Tt = VZMULJ(T2, Td);
               TJ = VZMULJ(T9, Td);
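               /* Only w^1, w^3, w^9 and w^19 are loaded from W (T2, T8, T3,
                * Td above); the other sixteen twiddle factors are derived as
                * products, assuming the usual FFTW SIMD conventions
                * VZMUL(a, b) = a * b and VZMULJ(a, b) = conj(a) * b, e.g.:
                *   T9 = VZMUL(T2, T8)   = w^1 * w^3        = w^4
                *   TA = VZMULJ(T2, T8)  = conj(w^1) * w^3  = w^2
                *   Tc = VZMULJ(T9, T3)  = conj(w^4) * w^9  = w^5
                *   Te = VZMULJ(Tc, Td)  = conj(w^5) * w^19 = w^14
                */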
               {
                    V T7, TM, T1U, T2d, T1i, T1p, T1q, T1j, Tp, TE, TF, T26, T27, T2b, T1M;
                    V T1P, T1V, TY, T1c, T1d, T23, T24, T2a, T1F, T1I, T1W, TG, T1e;
                    {
                         V T1, TL, T6, TI, TK, T5, TH, T1S, T1T;
                         T1 = LD(&(x[0]), ms, &(x[0]));
                         TK = LD(&(x[WS(rs, 15)]), ms, &(x[WS(rs, 1)]));
                         TL = VZMULJ(TJ, TK);
                         T5 = LD(&(x[WS(rs, 10)]), ms, &(x[0]));
                         T6 = VZMULJ(T4, T5);
                         TH = LD(&(x[WS(rs, 5)]), ms, &(x[WS(rs, 1)]));
                         TI = VZMULJ(Tc, TH);
                         T7 = VSUB(T1, T6);
                         TM = VSUB(TI, TL);
                         T1S = VADD(T1, T6);
                         T1T = VADD(TI, TL);
                         T1U = VSUB(T1S, T1T);
                         T2d = VADD(T1S, T1T);
                    }
                    {
                         V Th, T1K, T14, T1E, T1b, T1H, To, T1N, Tw, T1D, TR, T1L, TX, T1O, TD;
                         V T1G;
                         {
                              V Tb, Tg, Ta, Tf;
                              Ta = LD(&(x[WS(rs, 4)]), ms, &(x[0]));
                              Tb = VZMULJ(T9, Ta);
                              Tf = LD(&(x[WS(rs, 14)]), ms, &(x[0]));
                              Tg = VZMULJ(Te, Tf);
                              Th = VSUB(Tb, Tg);
                              T1K = VADD(Tb, Tg);
                         }
                         {
                              V T11, T13, T10, T12;
                              T10 = LD(&(x[WS(rs, 13)]), ms, &(x[WS(rs, 1)]));
                              T11 = VZMULJ(TZ, T10);
                              T12 = LD(&(x[WS(rs, 3)]), ms, &(x[WS(rs, 1)]));
                              T13 = VZMULJ(T8, T12);
                              T14 = VSUB(T11, T13);
                              T1E = VADD(T11, T13);
                         }
                         {
                              V T17, T1a, T16, T19;
                              T16 = LD(&(x[WS(rs, 17)]), ms, &(x[WS(rs, 1)]));
                              T17 = VZMULJ(T15, T16);
                              T19 = LD(&(x[WS(rs, 7)]), ms, &(x[WS(rs, 1)]));
                              T1a = VZMULJ(T18, T19);
                              T1b = VSUB(T17, T1a);
                              T1H = VADD(T17, T1a);
                         }
                         {
                              V Tk, Tn, Tj, Tm;
                              Tj = LD(&(x[WS(rs, 16)]), ms, &(x[0]));
                              Tk = VZMULJ(Ti, Tj);
                              Tm = LD(&(x[WS(rs, 6)]), ms, &(x[0]));
                              Tn = VZMULJ(Tl, Tm);
                              To = VSUB(Tk, Tn);
                              T1N = VADD(Tk, Tn);
                         }
                         {
                              V Ts, Tv, Tr, Tu;
                              Tr = LD(&(x[WS(rs, 8)]), ms, &(x[0]));
                              Ts = VZMULJ(Tq, Tr);
                              Tu = LD(&(x[WS(rs, 18)]), ms, &(x[0]));
                              Tv = VZMULJ(Tt, Tu);
                              Tw = VSUB(Ts, Tv);
                              T1D = VADD(Ts, Tv);
                         }
                         {
                              V TO, TQ, TN, TP;
                              TN = LD(&(x[WS(rs, 9)]), ms, &(x[WS(rs, 1)]));
                              TO = VZMULJ(T3, TN);
                              TP = LD(&(x[WS(rs, 19)]), ms, &(x[WS(rs, 1)]));
                              TQ = VZMULJ(Td, TP);
                              TR = VSUB(TO, TQ);
                              T1L = VADD(TO, TQ);
                         }
                         {
                              V TT, TW, TS, TV;
                              TS = LD(&(x[WS(rs, 1)]), ms, &(x[WS(rs, 1)]));
                              TT = VZMULJ(T2, TS);
                              TV = LD(&(x[WS(rs, 11)]), ms, &(x[WS(rs, 1)]));
                              TW = VZMULJ(TU, TV);
                              TX = VSUB(TT, TW);
                              T1O = VADD(TT, TW);
                         }
                         {
                              V Tz, TC, Ty, TB;
                              Ty = LD(&(x[WS(rs, 12)]), ms, &(x[0]));
                              Tz = VZMULJ(Tx, Ty);
                              TB = LD(&(x[WS(rs, 2)]), ms, &(x[0]));
                              TC = VZMULJ(TA, TB);
                              TD = VSUB(Tz, TC);
                              T1G = VADD(Tz, TC);
                         }
                         T1i = VSUB(TX, TR);
                         T1p = VSUB(Th, To);
                         T1q = VSUB(Tw, TD);
                         T1j = VSUB(T1b, T14);
                         Tp = VADD(Th, To);
                         TE = VADD(Tw, TD);
                         TF = VADD(Tp, TE);
                         T26 = VADD(T1D, T1E);
                         T27 = VADD(T1G, T1H);
                         T2b = VADD(T26, T27);
                         T1M = VSUB(T1K, T1L);
                         T1P = VSUB(T1N, T1O);
                         T1V = VADD(T1M, T1P);
                         TY = VADD(TR, TX);
                         T1c = VADD(T14, T1b);
                         T1d = VADD(TY, T1c);
                         T23 = VADD(T1K, T1L);
                         T24 = VADD(T1N, T1O);
                         T2a = VADD(T23, T24);
                         T1F = VSUB(T1D, T1E);
                         T1I = VSUB(T1G, T1H);
                         T1W = VADD(T1F, T1I);
                    }
                    TG = VADD(T7, TF);
                    T1e = VBYI(VADD(TM, T1d));
                    ST(&(x[WS(rs, 5)]), VSUB(TG, T1e), ms, &(x[WS(rs, 1)]));
                    ST(&(x[WS(rs, 15)]), VADD(TG, T1e), ms, &(x[WS(rs, 1)]));
                    {
                         V T2c, T2e, T2f, T29, T2i, T25, T28, T2h, T2g;
                         T2c = VMUL(LDK(KP559016994), VSUB(T2a, T2b));
                         T2e = VADD(T2a, T2b);
                         T2f = VFNMS(LDK(KP250000000), T2e, T2d);
                         T25 = VSUB(T23, T24);
                         T28 = VSUB(T26, T27);
                         T29 = VBYI(VFMA(LDK(KP951056516), T25, VMUL(LDK(KP587785252), T28)));
                         T2i = VBYI(VFNMS(LDK(KP587785252), T25, VMUL(LDK(KP951056516), T28)));
                         ST(&(x[0]), VADD(T2d, T2e), ms, &(x[0]));
                         T2h = VSUB(T2f, T2c);
                         ST(&(x[WS(rs, 8)]), VSUB(T2h, T2i), ms, &(x[0]));
                         ST(&(x[WS(rs, 12)]), VADD(T2i, T2h), ms, &(x[0]));
                         T2g = VADD(T2c, T2f);
                         ST(&(x[WS(rs, 4)]), VADD(T29, T2g), ms, &(x[0]));
                         ST(&(x[WS(rs, 16)]), VSUB(T2g, T29), ms, &(x[0]));
                    }
                    {
                         V T1Z, T1X, T1Y, T1R, T22, T1J, T1Q, T21, T20;
                         T1Z = VMUL(LDK(KP559016994), VSUB(T1V, T1W));
                         T1X = VADD(T1V, T1W);
                         T1Y = VFNMS(LDK(KP250000000), T1X, T1U);
                         T1J = VSUB(T1F, T1I);
                         T1Q = VSUB(T1M, T1P);
                         T1R = VBYI(VFNMS(LDK(KP587785252), T1Q, VMUL(LDK(KP951056516), T1J)));
                         T22 = VBYI(VFMA(LDK(KP951056516), T1Q, VMUL(LDK(KP587785252), T1J)));
                         ST(&(x[WS(rs, 10)]), VADD(T1U, T1X), ms, &(x[0]));
                         T21 = VADD(T1Z, T1Y);
                         ST(&(x[WS(rs, 6)]), VSUB(T21, T22), ms, &(x[0]));
                         ST(&(x[WS(rs, 14)]), VADD(T22, T21), ms, &(x[0]));
                         T20 = VSUB(T1Y, T1Z);
                         ST(&(x[WS(rs, 2)]), VADD(T1R, T20), ms, &(x[0]));
                         ST(&(x[WS(rs, 18)]), VSUB(T20, T1R), ms, &(x[0]));
                    }
                    {
                         V T1k, T1r, T1z, T1w, T1o, T1y, T1h, T1v;
                         T1k = VFMA(LDK(KP951056516), T1i, VMUL(LDK(KP587785252), T1j));
                         T1r = VFMA(LDK(KP951056516), T1p, VMUL(LDK(KP587785252), T1q));
                         T1z = VFNMS(LDK(KP587785252), T1p, VMUL(LDK(KP951056516), T1q));
                         T1w = VFNMS(LDK(KP587785252), T1i, VMUL(LDK(KP951056516), T1j));
                         {
                              V T1m, T1n, T1f, T1g;
                              T1m = VFMS(LDK(KP250000000), T1d, TM);
                              T1n = VMUL(LDK(KP559016994), VSUB(T1c, TY));
                              T1o = VADD(T1m, T1n);
                              T1y = VSUB(T1n, T1m);
                              T1f = VMUL(LDK(KP559016994), VSUB(Tp, TE));
                              T1g = VFNMS(LDK(KP250000000), TF, T7);
                              T1h = VADD(T1f, T1g);
                              T1v = VSUB(T1g, T1f);
                         }
                         {
                              V T1l, T1s, T1B, T1C;
                              T1l = VADD(T1h, T1k);
                              T1s = VBYI(VSUB(T1o, T1r));
                              ST(&(x[WS(rs, 19)]), VSUB(T1l, T1s), ms, &(x[WS(rs, 1)]));
                              ST(&(x[WS(rs, 1)]), VADD(T1l, T1s), ms, &(x[WS(rs, 1)]));
                              T1B = VADD(T1v, T1w);
                              T1C = VBYI(VADD(T1z, T1y));
                              ST(&(x[WS(rs, 13)]), VSUB(T1B, T1C), ms, &(x[WS(rs, 1)]));
                              ST(&(x[WS(rs, 7)]), VADD(T1B, T1C), ms, &(x[WS(rs, 1)]));
                         }
                         {
                              V T1t, T1u, T1x, T1A;
                              T1t = VSUB(T1h, T1k);
                              T1u = VBYI(VADD(T1r, T1o));
                              ST(&(x[WS(rs, 11)]), VSUB(T1t, T1u), ms, &(x[WS(rs, 1)]));
                              ST(&(x[WS(rs, 9)]), VADD(T1t, T1u), ms, &(x[WS(rs, 1)]));
                              T1x = VSUB(T1v, T1w);
                              T1A = VBYI(VSUB(T1y, T1z));
                              ST(&(x[WS(rs, 17)]), VSUB(T1x, T1A), ms, &(x[WS(rs, 1)]));
                              ST(&(x[WS(rs, 3)]), VADD(T1x, T1A), ms, &(x[WS(rs, 1)]));
                         }
                    }
               }
          }
     }
     VLEAVE();
}

static const tw_instr twinstr[] = {
     VTW(0, 1),
     VTW(0, 3),
     VTW(0, 9),
     VTW(0, 19),
     {TW_NEXT, VL, 0}
};

static const ct_desc desc = { 20, XSIMD_STRING("t3fv_20"), twinstr, &GENUS, {126, 80, 12, 0}, 0, 0, 0 };

void XSIMD(codelet_t3fv_20) (planner *p) {
     X(kdft_dit_register) (p, t3fv_20, &desc);
}
#endif /* HAVE_FMA */