annotate src/fftw-3.3.3/dft/simd/common/t3bv_20.c @ 83:ae30d91d2ffe

Replace these with versions built using an older toolset (so as to avoid ABI incompatibilities when linking on Ubuntu 14.04 for packaging purposes)
author Chris Cannam
date Fri, 07 Feb 2020 11:51:13 +0000
parents 37bf6b4a2645
children
rev   line source
Chris@10 1 /*
Chris@10 2 * Copyright (c) 2003, 2007-11 Matteo Frigo
Chris@10 3 * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
Chris@10 4 *
Chris@10 5 * This program is free software; you can redistribute it and/or modify
Chris@10 6 * it under the terms of the GNU General Public License as published by
Chris@10 7 * the Free Software Foundation; either version 2 of the License, or
Chris@10 8 * (at your option) any later version.
Chris@10 9 *
Chris@10 10 * This program is distributed in the hope that it will be useful,
Chris@10 11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
Chris@10 12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
Chris@10 13 * GNU General Public License for more details.
Chris@10 14 *
Chris@10 15 * You should have received a copy of the GNU General Public License
Chris@10 16 * along with this program; if not, write to the Free Software
Chris@10 17 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
Chris@10 18 *
Chris@10 19 */
Chris@10 20
Chris@10 21 /* This file was automatically generated --- DO NOT EDIT */
Chris@10 22 /* Generated on Sun Nov 25 07:39:23 EST 2012 */
Chris@10 23
Chris@10 24 #include "codelet-dft.h"
Chris@10 25
Chris@10 26 #ifdef HAVE_FMA
Chris@10 27
Chris@10 28 /* Generated by: ../../../genfft/gen_twiddle_c.native -fma -reorder-insns -schedule-for-pipeline -simd -compact -variables 4 -pipeline-latency 8 -twiddle-log3 -precompute-twiddles -no-generate-bytw -n 20 -name t3bv_20 -include t3b.h -sign 1 */
Chris@10 29
Chris@10 30 /*
Chris@10 31 * This function contains 138 FP additions, 118 FP multiplications,
Chris@10 32 * (or, 92 additions, 72 multiplications, 46 fused multiply/add),
Chris@10 33 * 90 stack variables, 4 constants, and 40 memory accesses
Chris@10 34 */
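/* (Consistency check on the counts above: folding each of the 46 fused
   multiply/adds back into one addition and one multiplication gives
   92 + 46 = 138 FP additions and 72 + 46 = 118 FP multiplications.) */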
Chris@10 35 #include "t3b.h"
Chris@10 36
Chris@10 37 static void t3bv_20(R *ri, R *ii, const R *W, stride rs, INT mb, INT me, INT ms)
Chris@10 38 {
Chris@10 39 DVK(KP559016994, +0.559016994374947424102293417182819058860154590);
Chris@10 40 DVK(KP951056516, +0.951056516295153572116439333379382143405698634);
Chris@10 41 DVK(KP618033988, +0.618033988749894848204586834365638117720309180);
Chris@10 42 DVK(KP250000000, +0.250000000000000000000000000000000000000000000);
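/* These four constants are the standard radix-5 trigonometric values,
   matching the decimal expansions above:
     KP559016994 = sqrt(5)/4
     KP951056516 = sin(2*pi/5)
     KP618033988 = sin(4*pi/5)/sin(2*pi/5) = (sqrt(5)-1)/2
     KP250000000 = 1/4  */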
Chris@10 43 {
Chris@10 44 INT m;
Chris@10 45 R *x;
Chris@10 46 x = ii;
Chris@10 47 for (m = mb, W = W + (mb * ((TWVL / VL) * 8)); m < me; m = m + VL, x = x + (VL * ms), W = W + (TWVL * 8), MAKE_VOLATILE_STRIDE(20, rs)) {
Chris@10 48 V T19, T1u, T1p, T1x, T1m, T1w, T1t, TI;
Chris@10 49 {
Chris@10 50 V T2, T8, T3, Td;
Chris@10 51 T2 = LDW(&(W[0]));
Chris@10 52 T8 = LDW(&(W[TWVL * 2]));
Chris@10 53 T3 = LDW(&(W[TWVL * 4]));
Chris@10 54 Td = LDW(&(W[TWVL * 6]));
Chris@10 55 {
Chris@10 56 V T7, T1g, T1F, T23, T1n, Tp, T18, T27, T1P, T1I, TU, T1L, T28, T1S, T1o;
Chris@10 57 V TE, T1l, T1j, T26, T2e;
Chris@10 58 {
Chris@10 59 V T1, T1e, T5, T1b;
Chris@10 60 T1 = LD(&(x[0]), ms, &(x[0]));
Chris@10 61 T1e = LD(&(x[WS(rs, 15)]), ms, &(x[WS(rs, 1)]));
Chris@10 62 T5 = LD(&(x[WS(rs, 10)]), ms, &(x[0]));
Chris@10 63 T1b = LD(&(x[WS(rs, 5)]), ms, &(x[WS(rs, 1)]));
Chris@10 64 {
Chris@10 65 V TA, Tx, TQ, T1O, T10, Th, T1G, T1R, T17, T1J, To, Ts, TR, Tv, TK;
Chris@10 66 V TM, TP, Ty, TB;
Chris@10 67 {
Chris@10 68 V Tq, Tt, T13, T16, Tk, Tn;
Chris@10 69 {
Chris@10 70 V Tl, Ti, T11, T14, TV, Tc, T6, Tb, Tf, TW, TY, T1f;
Chris@10 71 {
Chris@10 72 V T1d, Ta, T9, T4;
Chris@10 73 Ta = LD(&(x[WS(rs, 4)]), ms, &(x[0]));
Chris@10 74 TA = VZMULJ(T2, T8);
Chris@10 75 T9 = VZMUL(T2, T8);
Chris@10 76 Tx = VZMUL(T8, T3);
Chris@10 77 Tl = VZMULJ(T8, T3);
Chris@10 78 T4 = VZMUL(T2, T3);
Chris@10 79 Tq = VZMULJ(T2, T3);
Chris@10 80 Tt = VZMULJ(T2, Td);
Chris@10 81 Ti = VZMULJ(T8, Td);
Chris@10 82 T11 = VZMULJ(TA, Td);
Chris@10 83 T14 = VZMULJ(TA, T3);
Chris@10 84 TQ = VZMUL(TA, T3);
Chris@10 85 T1d = VZMULJ(T9, Td);
Chris@10 86 TV = VZMUL(T9, T3);
Chris@10 87 Tc = VZMULJ(T9, T3);
Chris@10 88 T6 = VZMUL(T4, T5);
Chris@10 89 Tb = VZMUL(T9, Ta);
Chris@10 90 Tf = LD(&(x[WS(rs, 14)]), ms, &(x[0]));
Chris@10 91 TW = LD(&(x[WS(rs, 13)]), ms, &(x[WS(rs, 1)]));
Chris@10 92 TY = LD(&(x[WS(rs, 3)]), ms, &(x[WS(rs, 1)]));
Chris@10 93 T1f = VZMUL(T1d, T1e);
Chris@10 94 }
Chris@10 95 {
Chris@10 96 V T1D, TX, TZ, T15, T1E, Tg, T12, T1c, Te, Tj, Tm;
Chris@10 97 T12 = LD(&(x[WS(rs, 17)]), ms, &(x[WS(rs, 1)]));
Chris@10 98 T1c = VZMUL(Tc, T1b);
Chris@10 99 Te = VZMULJ(Tc, Td);
Chris@10 100 T7 = VSUB(T1, T6);
Chris@10 101 T1D = VADD(T1, T6);
Chris@10 102 TX = VZMUL(TV, TW);
Chris@10 103 TZ = VZMUL(T8, TY);
Chris@10 104 T15 = LD(&(x[WS(rs, 7)]), ms, &(x[WS(rs, 1)]));
Chris@10 105 T13 = VZMUL(T11, T12);
Chris@10 106 T1g = VSUB(T1c, T1f);
Chris@10 107 T1E = VADD(T1c, T1f);
Chris@10 108 Tg = VZMUL(Te, Tf);
Chris@10 109 Tj = LD(&(x[WS(rs, 16)]), ms, &(x[0]));
Chris@10 110 Tm = LD(&(x[WS(rs, 6)]), ms, &(x[0]));
Chris@10 111 T1O = VADD(TX, TZ);
Chris@10 112 T10 = VSUB(TX, TZ);
Chris@10 113 T16 = VZMUL(T14, T15);
Chris@10 114 T1F = VSUB(T1D, T1E);
Chris@10 115 T23 = VADD(T1D, T1E);
Chris@10 116 Th = VSUB(Tb, Tg);
Chris@10 117 T1G = VADD(Tb, Tg);
Chris@10 118 Tk = VZMUL(Ti, Tj);
Chris@10 119 Tn = VZMUL(Tl, Tm);
Chris@10 120 }
Chris@10 121 }
Chris@10 122 {
Chris@10 123 V Tr, Tu, TJ, TL, TO;
Chris@10 124 Tr = LD(&(x[WS(rs, 8)]), ms, &(x[0]));
Chris@10 125 T1R = VADD(T13, T16);
Chris@10 126 T17 = VSUB(T13, T16);
Chris@10 127 Tu = LD(&(x[WS(rs, 18)]), ms, &(x[0]));
Chris@10 128 TJ = LD(&(x[WS(rs, 9)]), ms, &(x[WS(rs, 1)]));
Chris@10 129 TL = LD(&(x[WS(rs, 19)]), ms, &(x[WS(rs, 1)]));
Chris@10 130 TO = LD(&(x[WS(rs, 1)]), ms, &(x[WS(rs, 1)]));
Chris@10 131 T1J = VADD(Tk, Tn);
Chris@10 132 To = VSUB(Tk, Tn);
Chris@10 133 Ts = VZMUL(Tq, Tr);
Chris@10 134 TR = LD(&(x[WS(rs, 11)]), ms, &(x[WS(rs, 1)]));
Chris@10 135 Tv = VZMUL(Tt, Tu);
Chris@10 136 TK = VZMUL(T3, TJ);
Chris@10 137 TM = VZMUL(Td, TL);
Chris@10 138 TP = VZMUL(T2, TO);
Chris@10 139 Ty = LD(&(x[WS(rs, 12)]), ms, &(x[0]));
Chris@10 140 TB = LD(&(x[WS(rs, 2)]), ms, &(x[0]));
Chris@10 141 }
Chris@10 142 }
Chris@10 143 {
Chris@10 144 V T1N, Tw, T1H, TN, Tz, TC, T1i, TT, T1K, TS;
Chris@10 145 T1n = VSUB(Th, To);
Chris@10 146 Tp = VADD(Th, To);
Chris@10 147 TS = VZMUL(TQ, TR);
Chris@10 148 T1N = VADD(Ts, Tv);
Chris@10 149 Tw = VSUB(Ts, Tv);
Chris@10 150 T1H = VADD(TK, TM);
Chris@10 151 TN = VSUB(TK, TM);
Chris@10 152 Tz = VZMUL(Tx, Ty);
Chris@10 153 TC = VZMUL(TA, TB);
Chris@10 154 T18 = VSUB(T10, T17);
Chris@10 155 T1i = VADD(T10, T17);
Chris@10 156 TT = VSUB(TP, TS);
Chris@10 157 T1K = VADD(TP, TS);
Chris@10 158 T27 = VADD(T1N, T1O);
Chris@10 159 T1P = VSUB(T1N, T1O);
Chris@10 160 {
Chris@10 161 V TD, T1Q, T24, T1h, T25;
Chris@10 162 TD = VSUB(Tz, TC);
Chris@10 163 T1Q = VADD(Tz, TC);
Chris@10 164 T1I = VSUB(T1G, T1H);
Chris@10 165 T24 = VADD(T1G, T1H);
Chris@10 166 T1h = VADD(TN, TT);
Chris@10 167 TU = VSUB(TN, TT);
Chris@10 168 T25 = VADD(T1J, T1K);
Chris@10 169 T1L = VSUB(T1J, T1K);
Chris@10 170 T28 = VADD(T1Q, T1R);
Chris@10 171 T1S = VSUB(T1Q, T1R);
Chris@10 172 T1o = VSUB(Tw, TD);
Chris@10 173 TE = VADD(Tw, TD);
Chris@10 174 T1l = VSUB(T1h, T1i);
Chris@10 175 T1j = VADD(T1h, T1i);
Chris@10 176 T26 = VADD(T24, T25);
Chris@10 177 T2e = VSUB(T24, T25);
Chris@10 178 }
Chris@10 179 }
Chris@10 180 }
Chris@10 181 }
Chris@10 182 {
Chris@10 183 V T1M, T1Z, T1Y, T1T, T29, T2f, TH, TF, T1k, T1C;
Chris@10 184 T1M = VADD(T1I, T1L);
Chris@10 185 T1Z = VSUB(T1I, T1L);
Chris@10 186 T1Y = VSUB(T1P, T1S);
Chris@10 187 T1T = VADD(T1P, T1S);
Chris@10 188 T29 = VADD(T27, T28);
Chris@10 189 T2f = VSUB(T27, T28);
Chris@10 190 TH = VSUB(Tp, TE);
Chris@10 191 TF = VADD(Tp, TE);
Chris@10 192 T1k = VFNMS(LDK(KP250000000), T1j, T1g);
Chris@10 193 T1C = VADD(T1g, T1j);
Chris@10 194 {
Chris@10 195 V T1W, T2c, TG, T2i, T2g, T22, T20, T1V, T2b, T1U, T2a, T1B;
Chris@10 196 T19 = VFMA(LDK(KP618033988), T18, TU);
Chris@10 197 T1u = VFNMS(LDK(KP618033988), TU, T18);
Chris@10 198 T1W = VSUB(T1M, T1T);
Chris@10 199 T1U = VADD(T1M, T1T);
Chris@10 200 T2c = VSUB(T26, T29);
Chris@10 201 T2a = VADD(T26, T29);
Chris@10 202 TG = VFNMS(LDK(KP250000000), TF, T7);
Chris@10 203 T1B = VADD(T7, TF);
Chris@10 204 T2i = VMUL(LDK(KP951056516), VFNMS(LDK(KP618033988), T2e, T2f));
Chris@10 205 T2g = VMUL(LDK(KP951056516), VFMA(LDK(KP618033988), T2f, T2e));
Chris@10 206 T22 = VMUL(LDK(KP951056516), VFMA(LDK(KP618033988), T1Y, T1Z));
Chris@10 207 T20 = VMUL(LDK(KP951056516), VFNMS(LDK(KP618033988), T1Z, T1Y));
Chris@10 208 ST(&(x[WS(rs, 10)]), VADD(T1F, T1U), ms, &(x[0]));
Chris@10 209 T1V = VFNMS(LDK(KP250000000), T1U, T1F);
Chris@10 210 ST(&(x[0]), VADD(T23, T2a), ms, &(x[0]));
Chris@10 211 T2b = VFNMS(LDK(KP250000000), T2a, T23);
Chris@10 212 ST(&(x[WS(rs, 5)]), VFMAI(T1C, T1B), ms, &(x[WS(rs, 1)]));
Chris@10 213 ST(&(x[WS(rs, 15)]), VFNMSI(T1C, T1B), ms, &(x[WS(rs, 1)]));
Chris@10 214 T1p = VFMA(LDK(KP618033988), T1o, T1n);
Chris@10 215 T1x = VFNMS(LDK(KP618033988), T1n, T1o);
Chris@10 216 {
Chris@10 217 V T21, T1X, T2h, T2d;
Chris@10 218 T21 = VFMA(LDK(KP559016994), T1W, T1V);
Chris@10 219 T1X = VFNMS(LDK(KP559016994), T1W, T1V);
Chris@10 220 T2h = VFNMS(LDK(KP559016994), T2c, T2b);
Chris@10 221 T2d = VFMA(LDK(KP559016994), T2c, T2b);
Chris@10 222 ST(&(x[WS(rs, 18)]), VFMAI(T20, T1X), ms, &(x[0]));
Chris@10 223 ST(&(x[WS(rs, 2)]), VFNMSI(T20, T1X), ms, &(x[0]));
Chris@10 224 ST(&(x[WS(rs, 14)]), VFNMSI(T22, T21), ms, &(x[0]));
Chris@10 225 ST(&(x[WS(rs, 6)]), VFMAI(T22, T21), ms, &(x[0]));
Chris@10 226 ST(&(x[WS(rs, 16)]), VFMAI(T2g, T2d), ms, &(x[0]));
Chris@10 227 ST(&(x[WS(rs, 4)]), VFNMSI(T2g, T2d), ms, &(x[0]));
Chris@10 228 ST(&(x[WS(rs, 12)]), VFNMSI(T2i, T2h), ms, &(x[0]));
Chris@10 229 ST(&(x[WS(rs, 8)]), VFMAI(T2i, T2h), ms, &(x[0]));
Chris@10 230 T1m = VFMA(LDK(KP559016994), T1l, T1k);
Chris@10 231 T1w = VFNMS(LDK(KP559016994), T1l, T1k);
Chris@10 232 T1t = VFNMS(LDK(KP559016994), TH, TG);
Chris@10 233 TI = VFMA(LDK(KP559016994), TH, TG);
Chris@10 234 }
Chris@10 235 }
Chris@10 236 }
Chris@10 237 }
Chris@10 238 }
Chris@10 239 {
Chris@10 240 V T1A, T1y, T1q, T1s, T1a, T1r, T1z, T1v;
Chris@10 241 T1A = VFMA(LDK(KP951056516), T1x, T1w);
Chris@10 242 T1y = VFNMS(LDK(KP951056516), T1x, T1w);
Chris@10 243 T1q = VFMA(LDK(KP951056516), T1p, T1m);
Chris@10 244 T1s = VFNMS(LDK(KP951056516), T1p, T1m);
Chris@10 245 T1a = VFNMS(LDK(KP951056516), T19, TI);
Chris@10 246 T1r = VFMA(LDK(KP951056516), T19, TI);
Chris@10 247 T1z = VFNMS(LDK(KP951056516), T1u, T1t);
Chris@10 248 T1v = VFMA(LDK(KP951056516), T1u, T1t);
Chris@10 249 ST(&(x[WS(rs, 9)]), VFMAI(T1s, T1r), ms, &(x[WS(rs, 1)]));
Chris@10 250 ST(&(x[WS(rs, 11)]), VFNMSI(T1s, T1r), ms, &(x[WS(rs, 1)]));
Chris@10 251 ST(&(x[WS(rs, 1)]), VFMAI(T1q, T1a), ms, &(x[WS(rs, 1)]));
Chris@10 252 ST(&(x[WS(rs, 19)]), VFNMSI(T1q, T1a), ms, &(x[WS(rs, 1)]));
Chris@10 253 ST(&(x[WS(rs, 17)]), VFMAI(T1y, T1v), ms, &(x[WS(rs, 1)]));
Chris@10 254 ST(&(x[WS(rs, 3)]), VFNMSI(T1y, T1v), ms, &(x[WS(rs, 1)]));
Chris@10 255 ST(&(x[WS(rs, 13)]), VFMAI(T1A, T1z), ms, &(x[WS(rs, 1)]));
Chris@10 256 ST(&(x[WS(rs, 7)]), VFNMSI(T1A, T1z), ms, &(x[WS(rs, 1)]));
Chris@10 257 }
Chris@10 258 }
Chris@10 259 }
Chris@10 260 VLEAVE();
Chris@10 261 }
Chris@10 262
Chris@10 263 static const tw_instr twinstr[] = {
Chris@10 264 VTW(0, 1),
Chris@10 265 VTW(0, 3),
Chris@10 266 VTW(0, 9),
Chris@10 267 VTW(0, 19),
Chris@10 268 {TW_NEXT, VL, 0}
Chris@10 269 };
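/* How the "-twiddle-log3 -precompute-twiddles" scheme plays out: per the
   twinstr table above, only the twiddles for exponents 1, 3, 9, and 19
   are stored for each row m; the codelet reconstructs the remaining
   powers with complex products and conjugate products (VZMUL and VZMULJ,
   used throughout the codelet as a*b and conj(a)*b respectively).  A
   minimal scalar sketch of the identity, using C99 complex arithmetic;
   the names below are illustrative, not part of the codelet: */
#if 0 /* illustrative sketch only */
#include <complex.h>
#include <math.h>
static void twiddle_log3_demo(void)
{
    /* primitive 20th root of unity; the +I sign mirrors "-sign 1"
       (backward), though the identity holds for either sign */
    double complex w = cexp(2.0 * M_PI * I / 20.0);
    double complex t1 = cpow(w, 1.0);   /* stored: W[0],        cf. T2 */
    double complex t3 = cpow(w, 3.0);   /* stored: W[TWVL * 2], cf. T8 */
    double complex t2 = conj(t1) * t3;  /* derived: w^(3-1) = w^2, cf. TA */
    double complex t4 = t1 * t3;        /* derived: w^(1+3) = w^4, cf. T9 */
    (void)t2; (void)t4;
}
#endif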
Chris@10 270
Chris@10 271 static const ct_desc desc = { 20, XSIMD_STRING("t3bv_20"), twinstr, &GENUS, {92, 72, 46, 0}, 0, 0, 0 };
Chris@10 272
Chris@10 273 void XSIMD(codelet_t3bv_20) (planner *p) {
Chris@10 274 X(kdft_dit_register) (p, t3bv_20, &desc);
Chris@10 275 }
Chris@10 276 #else /* HAVE_FMA */
Chris@10 277
Chris@10 278 /* Generated by: ../../../genfft/gen_twiddle_c.native -simd -compact -variables 4 -pipeline-latency 8 -twiddle-log3 -precompute-twiddles -no-generate-bytw -n 20 -name t3bv_20 -include t3b.h -sign 1 */
Chris@10 279
Chris@10 280 /*
Chris@10 281 * This function contains 138 FP additions, 92 FP multiplications,
Chris@10 282 * (or, 126 additions, 80 multiplications, 12 fused multiply/add),
Chris@10 283 * 73 stack variables, 4 constants, and 40 memory accesses
Chris@10 284 */
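/* (Consistency check as in the FMA variant: 126 + 12 = 138 FP additions
   and 80 + 12 = 92 FP multiplications.) */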
Chris@10 285 #include "t3b.h"
Chris@10 286
Chris@10 287 static void t3bv_20(R *ri, R *ii, const R *W, stride rs, INT mb, INT me, INT ms)
Chris@10 288 {
Chris@10 289 DVK(KP587785252, +0.587785252292473129168705954639072768597652438);
Chris@10 290 DVK(KP951056516, +0.951056516295153572116439333379382143405698634);
Chris@10 291 DVK(KP250000000, +0.250000000000000000000000000000000000000000000);
Chris@10 292 DVK(KP559016994, +0.559016994374947424102293417182819058860154590);
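/* Same radix-5 values as the FMA variant, except that the non-FMA form
   uses sin(4*pi/5) directly rather than the ratio KP618033988:
     KP587785252 = sin(4*pi/5) = sin(pi/5)
     KP951056516 = sin(2*pi/5)
     KP250000000 = 1/4
     KP559016994 = sqrt(5)/4  */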
Chris@10 293 {
Chris@10 294 INT m;
Chris@10 295 R *x;
Chris@10 296 x = ii;
Chris@10 297 for (m = mb, W = W + (mb * ((TWVL / VL) * 8)); m < me; m = m + VL, x = x + (VL * ms), W = W + (TWVL * 8), MAKE_VOLATILE_STRIDE(20, rs)) {
Chris@10 298 V T2, T8, T9, TA, T3, Tc, T4, TV, T14, Tl, Tq, Tx, TQ, Td, Te;
Chris@10 299 V T1g, Ti, Tt, T11;
Chris@10 300 T2 = LDW(&(W[0]));
Chris@10 301 T8 = LDW(&(W[TWVL * 2]));
Chris@10 302 T9 = VZMUL(T2, T8);
Chris@10 303 TA = VZMULJ(T2, T8);
Chris@10 304 T3 = LDW(&(W[TWVL * 4]));
Chris@10 305 Tc = VZMULJ(T9, T3);
Chris@10 306 T4 = VZMUL(T2, T3);
Chris@10 307 TV = VZMUL(T9, T3);
Chris@10 308 T14 = VZMULJ(TA, T3);
Chris@10 309 Tl = VZMULJ(T8, T3);
Chris@10 310 Tq = VZMULJ(T2, T3);
Chris@10 311 Tx = VZMUL(T8, T3);
Chris@10 312 TQ = VZMUL(TA, T3);
Chris@10 313 Td = LDW(&(W[TWVL * 6]));
Chris@10 314 Te = VZMULJ(Tc, Td);
Chris@10 315 T1g = VZMULJ(T9, Td);
Chris@10 316 Ti = VZMULJ(T8, Td);
Chris@10 317 Tt = VZMULJ(T2, Td);
Chris@10 318 T11 = VZMULJ(TA, Td);
Chris@10 319 {
Chris@10 320 V T7, T1j, T1U, T2a, TU, T1n, T1o, T18, Tp, TE, TF, T26, T27, T28, T1M;
Chris@10 321 V T1P, T1W, T1b, T1c, T1k, T23, T24, T25, T1F, T1I, T1V, T1B, T1C;
Chris@10 322 {
Chris@10 323 V T1, T1i, T6, T1f, T1h, T5, T1e, T1S, T1T;
Chris@10 324 T1 = LD(&(x[0]), ms, &(x[0]));
Chris@10 325 T1h = LD(&(x[WS(rs, 15)]), ms, &(x[WS(rs, 1)]));
Chris@10 326 T1i = VZMUL(T1g, T1h);
Chris@10 327 T5 = LD(&(x[WS(rs, 10)]), ms, &(x[0]));
Chris@10 328 T6 = VZMUL(T4, T5);
Chris@10 329 T1e = LD(&(x[WS(rs, 5)]), ms, &(x[WS(rs, 1)]));
Chris@10 330 T1f = VZMUL(Tc, T1e);
Chris@10 331 T7 = VSUB(T1, T6);
Chris@10 332 T1j = VSUB(T1f, T1i);
Chris@10 333 T1S = VADD(T1, T6);
Chris@10 334 T1T = VADD(T1f, T1i);
Chris@10 335 T1U = VSUB(T1S, T1T);
Chris@10 336 T2a = VADD(T1S, T1T);
Chris@10 337 }
Chris@10 338 {
Chris@10 339 V Th, T1D, T10, T1L, T17, T1O, To, T1G, Tw, T1K, TN, T1E, TT, T1H, TD;
Chris@10 340 V T1N;
Chris@10 341 {
Chris@10 342 V Tb, Tg, Ta, Tf;
Chris@10 343 Ta = LD(&(x[WS(rs, 4)]), ms, &(x[0]));
Chris@10 344 Tb = VZMUL(T9, Ta);
Chris@10 345 Tf = LD(&(x[WS(rs, 14)]), ms, &(x[0]));
Chris@10 346 Tg = VZMUL(Te, Tf);
Chris@10 347 Th = VSUB(Tb, Tg);
Chris@10 348 T1D = VADD(Tb, Tg);
Chris@10 349 }
Chris@10 350 {
Chris@10 351 V TX, TZ, TW, TY;
Chris@10 352 TW = LD(&(x[WS(rs, 13)]), ms, &(x[WS(rs, 1)]));
Chris@10 353 TX = VZMUL(TV, TW);
Chris@10 354 TY = LD(&(x[WS(rs, 3)]), ms, &(x[WS(rs, 1)]));
Chris@10 355 TZ = VZMUL(T8, TY);
Chris@10 356 T10 = VSUB(TX, TZ);
Chris@10 357 T1L = VADD(TX, TZ);
Chris@10 358 }
Chris@10 359 {
Chris@10 360 V T13, T16, T12, T15;
Chris@10 361 T12 = LD(&(x[WS(rs, 17)]), ms, &(x[WS(rs, 1)]));
Chris@10 362 T13 = VZMUL(T11, T12);
Chris@10 363 T15 = LD(&(x[WS(rs, 7)]), ms, &(x[WS(rs, 1)]));
Chris@10 364 T16 = VZMUL(T14, T15);
Chris@10 365 T17 = VSUB(T13, T16);
Chris@10 366 T1O = VADD(T13, T16);
Chris@10 367 }
Chris@10 368 {
Chris@10 369 V Tk, Tn, Tj, Tm;
Chris@10 370 Tj = LD(&(x[WS(rs, 16)]), ms, &(x[0]));
Chris@10 371 Tk = VZMUL(Ti, Tj);
Chris@10 372 Tm = LD(&(x[WS(rs, 6)]), ms, &(x[0]));
Chris@10 373 Tn = VZMUL(Tl, Tm);
Chris@10 374 To = VSUB(Tk, Tn);
Chris@10 375 T1G = VADD(Tk, Tn);
Chris@10 376 }
Chris@10 377 {
Chris@10 378 V Ts, Tv, Tr, Tu;
Chris@10 379 Tr = LD(&(x[WS(rs, 8)]), ms, &(x[0]));
Chris@10 380 Ts = VZMUL(Tq, Tr);
Chris@10 381 Tu = LD(&(x[WS(rs, 18)]), ms, &(x[0]));
Chris@10 382 Tv = VZMUL(Tt, Tu);
Chris@10 383 Tw = VSUB(Ts, Tv);
Chris@10 384 T1K = VADD(Ts, Tv);
Chris@10 385 }
Chris@10 386 {
Chris@10 387 V TK, TM, TJ, TL;
Chris@10 388 TJ = LD(&(x[WS(rs, 9)]), ms, &(x[WS(rs, 1)]));
Chris@10 389 TK = VZMUL(T3, TJ);
Chris@10 390 TL = LD(&(x[WS(rs, 19)]), ms, &(x[WS(rs, 1)]));
Chris@10 391 TM = VZMUL(Td, TL);
Chris@10 392 TN = VSUB(TK, TM);
Chris@10 393 T1E = VADD(TK, TM);
Chris@10 394 }
Chris@10 395 {
Chris@10 396 V TP, TS, TO, TR;
Chris@10 397 TO = LD(&(x[WS(rs, 1)]), ms, &(x[WS(rs, 1)]));
Chris@10 398 TP = VZMUL(T2, TO);
Chris@10 399 TR = LD(&(x[WS(rs, 11)]), ms, &(x[WS(rs, 1)]));
Chris@10 400 TS = VZMUL(TQ, TR);
Chris@10 401 TT = VSUB(TP, TS);
Chris@10 402 T1H = VADD(TP, TS);
Chris@10 403 }
Chris@10 404 {
Chris@10 405 V Tz, TC, Ty, TB;
Chris@10 406 Ty = LD(&(x[WS(rs, 12)]), ms, &(x[0]));
Chris@10 407 Tz = VZMUL(Tx, Ty);
Chris@10 408 TB = LD(&(x[WS(rs, 2)]), ms, &(x[0]));
Chris@10 409 TC = VZMUL(TA, TB);
Chris@10 410 TD = VSUB(Tz, TC);
Chris@10 411 T1N = VADD(Tz, TC);
Chris@10 412 }
Chris@10 413 TU = VSUB(TN, TT);
Chris@10 414 T1n = VSUB(Th, To);
Chris@10 415 T1o = VSUB(Tw, TD);
Chris@10 416 T18 = VSUB(T10, T17);
Chris@10 417 Tp = VADD(Th, To);
Chris@10 418 TE = VADD(Tw, TD);
Chris@10 419 TF = VADD(Tp, TE);
Chris@10 420 T26 = VADD(T1K, T1L);
Chris@10 421 T27 = VADD(T1N, T1O);
Chris@10 422 T28 = VADD(T26, T27);
Chris@10 423 T1M = VSUB(T1K, T1L);
Chris@10 424 T1P = VSUB(T1N, T1O);
Chris@10 425 T1W = VADD(T1M, T1P);
Chris@10 426 T1b = VADD(TN, TT);
Chris@10 427 T1c = VADD(T10, T17);
Chris@10 428 T1k = VADD(T1b, T1c);
Chris@10 429 T23 = VADD(T1D, T1E);
Chris@10 430 T24 = VADD(T1G, T1H);
Chris@10 431 T25 = VADD(T23, T24);
Chris@10 432 T1F = VSUB(T1D, T1E);
Chris@10 433 T1I = VSUB(T1G, T1H);
Chris@10 434 T1V = VADD(T1F, T1I);
Chris@10 435 }
Chris@10 436 T1B = VADD(T7, TF);
Chris@10 437 T1C = VBYI(VADD(T1j, T1k));
Chris@10 438 ST(&(x[WS(rs, 15)]), VSUB(T1B, T1C), ms, &(x[WS(rs, 1)]));
Chris@10 439 ST(&(x[WS(rs, 5)]), VADD(T1B, T1C), ms, &(x[WS(rs, 1)]));
Chris@10 440 {
Chris@10 441 V T29, T2b, T2c, T2g, T2i, T2e, T2f, T2h, T2d;
Chris@10 442 T29 = VMUL(LDK(KP559016994), VSUB(T25, T28));
Chris@10 443 T2b = VADD(T25, T28);
Chris@10 444 T2c = VFNMS(LDK(KP250000000), T2b, T2a);
Chris@10 445 T2e = VSUB(T23, T24);
Chris@10 446 T2f = VSUB(T26, T27);
Chris@10 447 T2g = VBYI(VFMA(LDK(KP951056516), T2e, VMUL(LDK(KP587785252), T2f)));
Chris@10 448 T2i = VBYI(VFNMS(LDK(KP951056516), T2f, VMUL(LDK(KP587785252), T2e)));
Chris@10 449 ST(&(x[0]), VADD(T2a, T2b), ms, &(x[0]));
Chris@10 450 T2h = VSUB(T2c, T29);
Chris@10 451 ST(&(x[WS(rs, 8)]), VSUB(T2h, T2i), ms, &(x[0]));
Chris@10 452 ST(&(x[WS(rs, 12)]), VADD(T2i, T2h), ms, &(x[0]));
Chris@10 453 T2d = VADD(T29, T2c);
Chris@10 454 ST(&(x[WS(rs, 4)]), VSUB(T2d, T2g), ms, &(x[0]));
Chris@10 455 ST(&(x[WS(rs, 16)]), VADD(T2g, T2d), ms, &(x[0]));
Chris@10 456 }
Chris@10 457 {
Chris@10 458 V T1Z, T1X, T1Y, T1R, T21, T1J, T1Q, T22, T20;
Chris@10 459 T1Z = VMUL(LDK(KP559016994), VSUB(T1V, T1W));
Chris@10 460 T1X = VADD(T1V, T1W);
Chris@10 461 T1Y = VFNMS(LDK(KP250000000), T1X, T1U);
Chris@10 462 T1J = VSUB(T1F, T1I);
Chris@10 463 T1Q = VSUB(T1M, T1P);
Chris@10 464 T1R = VBYI(VFNMS(LDK(KP951056516), T1Q, VMUL(LDK(KP587785252), T1J)));
Chris@10 465 T21 = VBYI(VFMA(LDK(KP951056516), T1J, VMUL(LDK(KP587785252), T1Q)));
Chris@10 466 ST(&(x[WS(rs, 10)]), VADD(T1U, T1X), ms, &(x[0]));
Chris@10 467 T22 = VADD(T1Z, T1Y);
Chris@10 468 ST(&(x[WS(rs, 6)]), VADD(T21, T22), ms, &(x[0]));
Chris@10 469 ST(&(x[WS(rs, 14)]), VSUB(T22, T21), ms, &(x[0]));
Chris@10 470 T20 = VSUB(T1Y, T1Z);
Chris@10 471 ST(&(x[WS(rs, 2)]), VADD(T1R, T20), ms, &(x[0]));
Chris@10 472 ST(&(x[WS(rs, 18)]), VSUB(T20, T1R), ms, &(x[0]));
Chris@10 473 }
Chris@10 474 {
Chris@10 475 V T19, T1p, T1w, T1u, T1m, T1x, TI, T1t;
Chris@10 476 T19 = VFNMS(LDK(KP951056516), T18, VMUL(LDK(KP587785252), TU));
Chris@10 477 T1p = VFNMS(LDK(KP951056516), T1o, VMUL(LDK(KP587785252), T1n));
Chris@10 478 T1w = VFMA(LDK(KP951056516), T1n, VMUL(LDK(KP587785252), T1o));
Chris@10 479 T1u = VFMA(LDK(KP951056516), TU, VMUL(LDK(KP587785252), T18));
Chris@10 480 {
Chris@10 481 V T1d, T1l, TG, TH;
Chris@10 482 T1d = VMUL(LDK(KP559016994), VSUB(T1b, T1c));
Chris@10 483 T1l = VFNMS(LDK(KP250000000), T1k, T1j);
Chris@10 484 T1m = VSUB(T1d, T1l);
Chris@10 485 T1x = VADD(T1d, T1l);
Chris@10 486 TG = VFNMS(LDK(KP250000000), TF, T7);
Chris@10 487 TH = VMUL(LDK(KP559016994), VSUB(Tp, TE));
Chris@10 488 TI = VSUB(TG, TH);
Chris@10 489 T1t = VADD(TH, TG);
Chris@10 490 }
Chris@10 491 {
Chris@10 492 V T1a, T1q, T1z, T1A;
Chris@10 493 T1a = VSUB(TI, T19);
Chris@10 494 T1q = VBYI(VSUB(T1m, T1p));
Chris@10 495 ST(&(x[WS(rs, 17)]), VSUB(T1a, T1q), ms, &(x[WS(rs, 1)]));
Chris@10 496 ST(&(x[WS(rs, 3)]), VADD(T1a, T1q), ms, &(x[WS(rs, 1)]));
Chris@10 497 T1z = VADD(T1t, T1u);
Chris@10 498 T1A = VBYI(VSUB(T1x, T1w));
Chris@10 499 ST(&(x[WS(rs, 11)]), VSUB(T1z, T1A), ms, &(x[WS(rs, 1)]));
Chris@10 500 ST(&(x[WS(rs, 9)]), VADD(T1z, T1A), ms, &(x[WS(rs, 1)]));
Chris@10 501 }
Chris@10 502 {
Chris@10 503 V T1r, T1s, T1v, T1y;
Chris@10 504 T1r = VADD(TI, T19);
Chris@10 505 T1s = VBYI(VADD(T1p, T1m));
Chris@10 506 ST(&(x[WS(rs, 13)]), VSUB(T1r, T1s), ms, &(x[WS(rs, 1)]));
Chris@10 507 ST(&(x[WS(rs, 7)]), VADD(T1r, T1s), ms, &(x[WS(rs, 1)]));
Chris@10 508 T1v = VSUB(T1t, T1u);
Chris@10 509 T1y = VBYI(VADD(T1w, T1x));
Chris@10 510 ST(&(x[WS(rs, 19)]), VSUB(T1v, T1y), ms, &(x[WS(rs, 1)]));
Chris@10 511 ST(&(x[WS(rs, 1)]), VADD(T1v, T1y), ms, &(x[WS(rs, 1)]));
Chris@10 512 }
Chris@10 513 }
Chris@10 514 }
Chris@10 515 }
Chris@10 516 }
Chris@10 517 VLEAVE();
Chris@10 518 }
Chris@10 519
Chris@10 520 static const tw_instr twinstr[] = {
Chris@10 521 VTW(0, 1),
Chris@10 522 VTW(0, 3),
Chris@10 523 VTW(0, 9),
Chris@10 524 VTW(0, 19),
Chris@10 525 {TW_NEXT, VL, 0}
Chris@10 526 };
Chris@10 527
Chris@10 528 static const ct_desc desc = { 20, XSIMD_STRING("t3bv_20"), twinstr, &GENUS, {126, 80, 12, 0}, 0, 0, 0 };
Chris@10 529
Chris@10 530 void XSIMD(codelet_t3bv_20) (planner *p) {
Chris@10 531 X(kdft_dit_register) (p, t3bv_20, &desc);
Chris@10 532 }
Chris@10 533 #endif /* HAVE_FMA */