annotate src/fftw-3.3.5/dft/simd/common/t1bv_15.c @ 83:ae30d91d2ffe

Replace these with versions built using an older toolset (so as to avoid ABI incompatibilities when linking on Ubuntu 14.04 for packaging purposes)
author Chris Cannam
date Fri, 07 Feb 2020 11:51:13 +0000
parents 2cd0e3b3e1fd
children
rev   line source
Chris@42 1 /*
Chris@42 2 * Copyright (c) 2003, 2007-14 Matteo Frigo
Chris@42 3 * Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
Chris@42 4 *
Chris@42 5 * This program is free software; you can redistribute it and/or modify
Chris@42 6 * it under the terms of the GNU General Public License as published by
Chris@42 7 * the Free Software Foundation; either version 2 of the License, or
Chris@42 8 * (at your option) any later version.
Chris@42 9 *
Chris@42 10 * This program is distributed in the hope that it will be useful,
Chris@42 11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
Chris@42 12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
Chris@42 13 * GNU General Public License for more details.
Chris@42 14 *
Chris@42 15 * You should have received a copy of the GNU General Public License
Chris@42 16 * along with this program; if not, write to the Free Software
Chris@42 17 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
Chris@42 18 *
Chris@42 19 */
Chris@42 20
Chris@42 21 /* This file was automatically generated --- DO NOT EDIT */
Chris@42 22 /* Generated on Sat Jul 30 16:44:21 EDT 2016 */
Chris@42 23
Chris@42 24 #include "codelet-dft.h"
Chris@42 25
Chris@42 26 #ifdef HAVE_FMA
Chris@42 27
Chris@42 28 /* Generated by: ../../../genfft/gen_twiddle_c.native -fma -reorder-insns -schedule-for-pipeline -simd -compact -variables 4 -pipeline-latency 8 -n 15 -name t1bv_15 -include t1b.h -sign 1 */
Chris@42 29
Chris@42 30 /*
Chris@42 31 * This function contains 92 FP additions, 77 FP multiplications,
Chris@42 32 * (or, 50 additions, 35 multiplications, 42 fused multiply/add),
Chris@42 33 * 81 stack variables, 8 constants, and 30 memory accesses
Chris@42 34 */
Chris@42 35 #include "t1b.h"
Chris@42 36
/*
 * t1bv_15 (HAVE_FMA variant): size-15 SIMD twiddle codelet emitted by genfft
 * (gen_twiddle_c with -n 15 -sign 1 -fma; see the "Generated by" comment
 * preceding this function).
 *
 * Parameters:
 *   ri     - not referenced anywhere in this body.
 *   ii     - base pointer of the data; x is initialised to ii and the 15
 *            points are read from, and written back to, x[0] and
 *            x[WS(rs, k)] for k = 1..14.
 *   W      - twiddle table; offset by mb * ((TWVL / VL) * 28) before the
 *            first iteration and by TWVL * 28 after every iteration.
 *   rs     - stride argument handed to WS() to index the 15 points.
 *   mb, me - loop bounds: m starts at mb and advances by VL while m < me.
 *   ms     - x advances by VL * ms per iteration; also forwarded to LD/ST.
 *
 * Input k (k = 1..14) is passed through BYTW together with the twiddle at
 * W[TWVL * 2 * (k - 1)]; x[0] is used exactly as loaded.
 *
 * NOTE(review): V, LD, ST, BYTW, LDK, VADD/VSUB/VMUL, VFMA/VFNMS and
 * VFMAI/VFNMSI are defined outside this file (presumably via t1b.h and the
 * SIMD headers); their exact semantics cannot be confirmed from here.
 * Presumably BYTW applies a backward-sign twiddle multiplication -- confirm.
 */
Chris@42 37 static void t1bv_15(R *ri, R *ii, const R *W, stride rs, INT mb, INT me, INT ms)
Chris@42 38 {
/* Eight vector constants declared with DVK and read below through LDK. */
Chris@42 39 DVK(KP823639103, +0.823639103546331925877420039278190003029660514);
Chris@42 40 DVK(KP910592997, +0.910592997310029334643087372129977886038870291);
Chris@42 41 DVK(KP559016994, +0.559016994374947424102293417182819058860154590);
Chris@42 42 DVK(KP951056516, +0.951056516295153572116439333379382143405698634);
Chris@42 43 DVK(KP866025403, +0.866025403784438646763723170752936183471402627);
Chris@42 44 DVK(KP250000000, +0.250000000000000000000000000000000000000000000);
Chris@42 45 DVK(KP618033988, +0.618033988749894848204586834365638117720309180);
Chris@42 46 DVK(KP500000000, +0.500000000000000000000000000000000000000000000);
Chris@42 47 {
Chris@42 48 INT m;
Chris@42 49 R *x;
/* Every load and store in the loop goes through x, which starts at ii. */
Chris@42 50 x = ii;
/*
 * One pass per VL values of m. The increment clause also advances x and W
 * and evaluates MAKE_VOLATILE_STRIDE(15, rs), a macro defined elsewhere.
 */
Chris@42 51 for (m = mb, W = W + (mb * ((TWVL / VL) * 28)); m < me; m = m + VL, x = x + (VL * ms), W = W + (TWVL * 28), MAKE_VOLATILE_STRIDE(15, rs)) {
Chris@42 52 V Tq, Ty, Th, TV, TK, Ts, T1f, T7, Tu, TA, TC, Tj, Tk, T1g, Tf;
Chris@42 53 {
Chris@42 54 V T1, T4, T2, T9, Te;
/*
 * Load the 15 inputs; each one except x[0] is fed to BYTW. The loads are
 * interleaved with the first additions and subtractions.
 */
Chris@42 55 T1 = LD(&(x[0]), ms, &(x[0]));
Chris@42 56 T4 = LD(&(x[WS(rs, 10)]), ms, &(x[0]));
Chris@42 57 T2 = LD(&(x[WS(rs, 5)]), ms, &(x[WS(rs, 1)]));
Chris@42 58 {
Chris@42 59 V T8, Tp, Tx, Tg;
Chris@42 60 T8 = LD(&(x[WS(rs, 3)]), ms, &(x[WS(rs, 1)]));
Chris@42 61 Tp = LD(&(x[WS(rs, 6)]), ms, &(x[0]));
Chris@42 62 Tx = LD(&(x[WS(rs, 9)]), ms, &(x[WS(rs, 1)]));
Chris@42 63 Tg = LD(&(x[WS(rs, 12)]), ms, &(x[0]));
Chris@42 64 {
Chris@42 65 V Tb, Td, Tr, T6, Tt, Tz, TB, Ti;
Chris@42 66 {
Chris@42 67 V T5, T3, Ta, Tc;
Chris@42 68 Ta = LD(&(x[WS(rs, 8)]), ms, &(x[0]));
Chris@42 69 Tc = LD(&(x[WS(rs, 13)]), ms, &(x[WS(rs, 1)]));
Chris@42 70 T5 = BYTW(&(W[TWVL * 18]), T4);
Chris@42 71 T3 = BYTW(&(W[TWVL * 8]), T2);
Chris@42 72 T9 = BYTW(&(W[TWVL * 4]), T8);
Chris@42 73 Tq = BYTW(&(W[TWVL * 10]), Tp);
Chris@42 74 Ty = BYTW(&(W[TWVL * 16]), Tx);
Chris@42 75 Th = BYTW(&(W[TWVL * 22]), Tg);
Chris@42 76 Tb = BYTW(&(W[TWVL * 14]), Ta);
Chris@42 77 Td = BYTW(&(W[TWVL * 24]), Tc);
Chris@42 78 Tr = LD(&(x[WS(rs, 11)]), ms, &(x[WS(rs, 1)]));
Chris@42 79 TV = VSUB(T3, T5);
Chris@42 80 T6 = VADD(T3, T5);
Chris@42 81 Tt = LD(&(x[WS(rs, 1)]), ms, &(x[WS(rs, 1)]));
Chris@42 82 }
Chris@42 83 Tz = LD(&(x[WS(rs, 14)]), ms, &(x[0]));
Chris@42 84 TB = LD(&(x[WS(rs, 4)]), ms, &(x[0]));
Chris@42 85 Ti = LD(&(x[WS(rs, 2)]), ms, &(x[0]));
Chris@42 86 Te = VADD(Tb, Td);
Chris@42 87 TK = VSUB(Tb, Td);
Chris@42 88 Ts = BYTW(&(W[TWVL * 20]), Tr);
Chris@42 89 T1f = VADD(T1, T6);
Chris@42 90 T7 = VFNMS(LDK(KP500000000), T6, T1);
Chris@42 91 Tu = BYTW(&(W[0]), Tt);
Chris@42 92 TA = BYTW(&(W[TWVL * 26]), Tz);
Chris@42 93 TC = BYTW(&(W[TWVL * 6]), TB);
Chris@42 94 Tj = BYTW(&(W[TWVL * 2]), Ti);
Chris@42 95 Tk = LD(&(x[WS(rs, 7)]), ms, &(x[WS(rs, 1)]));
Chris@42 96 }
Chris@42 97 }
Chris@42 98 T1g = VADD(T9, Te);
Chris@42 99 Tf = VFNMS(LDK(KP500000000), Te, T9);
Chris@42 100 }
Chris@42 101 {
Chris@42 102 V Tv, TN, TD, TO, Tl;
Chris@42 103 Tv = VADD(Ts, Tu);
Chris@42 104 TN = VSUB(Ts, Tu);
Chris@42 105 TD = VADD(TA, TC);
Chris@42 106 TO = VSUB(TA, TC);
Chris@42 107 Tl = BYTW(&(W[TWVL * 12]), Tk);
Chris@42 108 {
Chris@42 109 V Tw, T1j, TX, TP, TE, T1k, TL, Tm;
Chris@42 110 Tw = VFNMS(LDK(KP500000000), Tv, Tq);
Chris@42 111 T1j = VADD(Tq, Tv);
Chris@42 112 TX = VADD(TN, TO);
Chris@42 113 TP = VSUB(TN, TO);
Chris@42 114 TE = VFNMS(LDK(KP500000000), TD, Ty);
Chris@42 115 T1k = VADD(Ty, TD);
Chris@42 116 TL = VSUB(Tj, Tl);
Chris@42 117 Tm = VADD(Tj, Tl);
Chris@42 118 {
Chris@42 119 V TT, TF, T1q, T1l, TW, TM, T1h, Tn;
Chris@42 120 TT = VSUB(Tw, TE);
Chris@42 121 TF = VADD(Tw, TE);
Chris@42 122 T1q = VSUB(T1j, T1k);
Chris@42 123 T1l = VADD(T1j, T1k);
Chris@42 124 TW = VADD(TK, TL);
Chris@42 125 TM = VSUB(TK, TL);
Chris@42 126 T1h = VADD(Th, Tm);
Chris@42 127 Tn = VFNMS(LDK(KP500000000), Tm, Th);
Chris@42 128 {
Chris@42 129 V T10, TY, T16, TQ, T1r, T1i, TS, To, TZ, T1e;
Chris@42 130 T10 = VSUB(TW, TX);
Chris@42 131 TY = VADD(TW, TX);
Chris@42 132 T16 = VFNMS(LDK(KP618033988), TM, TP);
Chris@42 133 TQ = VFMA(LDK(KP618033988), TP, TM);
Chris@42 134 T1r = VSUB(T1g, T1h);
Chris@42 135 T1i = VADD(T1g, T1h);
Chris@42 136 TS = VSUB(Tf, Tn);
Chris@42 137 To = VADD(Tf, Tn);
Chris@42 138 TZ = VFNMS(LDK(KP250000000), TY, TV);
Chris@42 139 T1e = VMUL(LDK(KP866025403), VADD(TV, TY));
Chris@42 140 {
Chris@42 141 V T1u, T1s, T1o, T18, TU, TG, TI, T19, T11, T1n, T1m;
Chris@42 142 T1u = VMUL(LDK(KP951056516), VFMA(LDK(KP618033988), T1q, T1r));
Chris@42 143 T1s = VMUL(LDK(KP951056516), VFNMS(LDK(KP618033988), T1r, T1q));
Chris@42 144 T1m = VADD(T1i, T1l);
Chris@42 145 T1o = VSUB(T1i, T1l);
Chris@42 146 T18 = VFNMS(LDK(KP618033988), TS, TT);
Chris@42 147 TU = VFMA(LDK(KP618033988), TT, TS);
Chris@42 148 TG = VADD(To, TF);
Chris@42 149 TI = VSUB(To, TF);
Chris@42 150 T19 = VFNMS(LDK(KP559016994), T10, TZ);
Chris@42 151 T11 = VFMA(LDK(KP559016994), T10, TZ);
/*
 * From here on results are stored back into the same 15 slots of x that
 * were loaded above; the stores are interleaved with the remaining math.
 */
Chris@42 152 ST(&(x[0]), VADD(T1f, T1m), ms, &(x[0]));
Chris@42 153 T1n = VFNMS(LDK(KP250000000), T1m, T1f);
Chris@42 154 {
Chris@42 155 V T1a, T1c, T14, T12, T1p, T1t, T15, TJ, T1d, TH;
Chris@42 156 T1d = VADD(T7, TG);
Chris@42 157 TH = VFNMS(LDK(KP250000000), TG, T7);
Chris@42 158 T1a = VMUL(LDK(KP951056516), VFMA(LDK(KP910592997), T19, T18));
Chris@42 159 T1c = VMUL(LDK(KP951056516), VFNMS(LDK(KP910592997), T19, T18));
Chris@42 160 T14 = VMUL(LDK(KP951056516), VFNMS(LDK(KP910592997), T11, TU));
Chris@42 161 T12 = VMUL(LDK(KP951056516), VFMA(LDK(KP910592997), T11, TU));
Chris@42 162 T1p = VFNMS(LDK(KP559016994), T1o, T1n);
Chris@42 163 T1t = VFMA(LDK(KP559016994), T1o, T1n);
Chris@42 164 ST(&(x[WS(rs, 10)]), VFMAI(T1e, T1d), ms, &(x[0]));
Chris@42 165 ST(&(x[WS(rs, 5)]), VFNMSI(T1e, T1d), ms, &(x[WS(rs, 1)]));
Chris@42 166 T15 = VFNMS(LDK(KP559016994), TI, TH);
Chris@42 167 TJ = VFMA(LDK(KP559016994), TI, TH);
Chris@42 168 {
Chris@42 169 V T17, T1b, T13, TR;
Chris@42 170 ST(&(x[WS(rs, 12)]), VFNMSI(T1s, T1p), ms, &(x[0]));
Chris@42 171 ST(&(x[WS(rs, 3)]), VFMAI(T1s, T1p), ms, &(x[WS(rs, 1)]));
Chris@42 172 ST(&(x[WS(rs, 9)]), VFNMSI(T1u, T1t), ms, &(x[WS(rs, 1)]));
Chris@42 173 ST(&(x[WS(rs, 6)]), VFMAI(T1u, T1t), ms, &(x[0]));
Chris@42 174 T17 = VFNMS(LDK(KP823639103), T16, T15);
Chris@42 175 T1b = VFMA(LDK(KP823639103), T16, T15);
Chris@42 176 T13 = VFMA(LDK(KP823639103), TQ, TJ);
Chris@42 177 TR = VFNMS(LDK(KP823639103), TQ, TJ);
Chris@42 178 ST(&(x[WS(rs, 13)]), VFMAI(T1a, T17), ms, &(x[WS(rs, 1)]));
Chris@42 179 ST(&(x[WS(rs, 2)]), VFNMSI(T1a, T17), ms, &(x[0]));
Chris@42 180 ST(&(x[WS(rs, 8)]), VFMAI(T1c, T1b), ms, &(x[0]));
Chris@42 181 ST(&(x[WS(rs, 7)]), VFNMSI(T1c, T1b), ms, &(x[WS(rs, 1)]));
Chris@42 182 ST(&(x[WS(rs, 11)]), VFMAI(T14, T13), ms, &(x[WS(rs, 1)]));
Chris@42 183 ST(&(x[WS(rs, 4)]), VFNMSI(T14, T13), ms, &(x[0]));
Chris@42 184 ST(&(x[WS(rs, 14)]), VFNMSI(T12, TR), ms, &(x[0]));
Chris@42 185 ST(&(x[WS(rs, 1)]), VFMAI(T12, TR), ms, &(x[WS(rs, 1)]));
Chris@42 186 }
Chris@42 187 }
Chris@42 188 }
Chris@42 189 }
Chris@42 190 }
Chris@42 191 }
Chris@42 192 }
Chris@42 193 }
Chris@42 194 }
/* VLEAVE() is invoked once after the loop; the macro is defined elsewhere. */
Chris@42 195 VLEAVE();
Chris@42 196 }
Chris@42 197
/*
 * Twiddle instruction list referenced by desc below (HAVE_FMA variant):
 * one VTW(0, k) entry for each k = 1..14, closed by a {TW_NEXT, VL, 0} entry.
 * NOTE(review): VTW and TW_NEXT are defined outside this file; presumably
 * each VTW entry describes the twiddle consumed for input k and TW_NEXT
 * ends the list -- confirm against their definitions.
 */
Chris@42 198 static const tw_instr twinstr[] = {
Chris@42 199 VTW(0, 1),
Chris@42 200 VTW(0, 2),
Chris@42 201 VTW(0, 3),
Chris@42 202 VTW(0, 4),
Chris@42 203 VTW(0, 5),
Chris@42 204 VTW(0, 6),
Chris@42 205 VTW(0, 7),
Chris@42 206 VTW(0, 8),
Chris@42 207 VTW(0, 9),
Chris@42 208 VTW(0, 10),
Chris@42 209 VTW(0, 11),
Chris@42 210 VTW(0, 12),
Chris@42 211 VTW(0, 13),
Chris@42 212 VTW(0, 14),
Chris@42 213 {TW_NEXT, VL, 0}
Chris@42 214 };
Chris@42 215
/*
 * Descriptor passed to the registration call below: 15, the codelet name
 * string, the twinstr table, &GENUS, and the counts {50, 35, 42, 0}, whose
 * first three values match the "50 additions, 35 multiplications, 42 fused
 * multiply/add" figures in the generated header comment of this variant.
 * NOTE(review): ct_desc is declared elsewhere; the meaning of the 15, of the
 * fourth count and of the three trailing zeros should be confirmed there.
 */
Chris@42 216 static const ct_desc desc = { 15, XSIMD_STRING("t1bv_15"), twinstr, &GENUS, {50, 35, 42, 0}, 0, 0, 0 };
Chris@42 217
/*
 * Registration entry point (HAVE_FMA variant): hands t1bv_15 and its
 * descriptor desc to planner p through X(kdft_dit_register).
 * NOTE(review): XSIMD() and X() are macros defined elsewhere; presumably
 * they decorate the symbol names -- confirm.
 */
Chris@42 218 void XSIMD(codelet_t1bv_15) (planner *p) {
Chris@42 219 X(kdft_dit_register) (p, t1bv_15, &desc);
Chris@42 220 }
Chris@42 221 #else /* HAVE_FMA */
Chris@42 222
Chris@42 223 /* Generated by: ../../../genfft/gen_twiddle_c.native -simd -compact -variables 4 -pipeline-latency 8 -n 15 -name t1bv_15 -include t1b.h -sign 1 */
Chris@42 224
Chris@42 225 /*
Chris@42 226 * This function contains 92 FP additions, 53 FP multiplications,
Chris@42 227 * (or, 78 additions, 39 multiplications, 14 fused multiply/add),
Chris@42 228 * 52 stack variables, 10 constants, and 30 memory accesses
Chris@42 229 */
Chris@42 230 #include "t1b.h"
Chris@42 231
/*
 * t1bv_15 (variant compiled when HAVE_FMA is not defined): size-15 SIMD
 * twiddle codelet emitted by genfft (gen_twiddle_c with -n 15 -sign 1; see
 * the "Generated by" comment preceding this function).
 *
 * Parameters:
 *   ri     - not referenced anywhere in this body.
 *   ii     - base pointer of the data; x is initialised to ii and the 15
 *            points are read from, and written back to, x[0] and
 *            x[WS(rs, k)] for k = 1..14.
 *   W      - twiddle table; offset by mb * ((TWVL / VL) * 28) before the
 *            first iteration and by TWVL * 28 after every iteration.
 *   rs     - stride argument handed to WS() to index the 15 points.
 *   mb, me - loop bounds: m starts at mb and advances by VL while m < me.
 *   ms     - x advances by VL * ms per iteration; also forwarded to LD/ST.
 *
 * Input k (k = 1..14) is passed through BYTW together with the twiddle at
 * W[TWVL * 2 * (k - 1)]; x[0] is used exactly as loaded.
 *
 * NOTE(review): V, LD, ST, BYTW, LDK, VADD/VSUB/VMUL, VFMA/VFNMS and VBYI
 * are defined outside this file (presumably via t1b.h and the SIMD
 * headers); their exact semantics cannot be confirmed from here.
 * Presumably BYTW applies a backward-sign twiddle multiplication -- confirm.
 */
Chris@42 232 static void t1bv_15(R *ri, R *ii, const R *W, stride rs, INT mb, INT me, INT ms)
Chris@42 233 {
/* Ten vector constants declared with DVK and read below through LDK. */
Chris@42 234 DVK(KP216506350, +0.216506350946109661690930792688234045867850657);
Chris@42 235 DVK(KP484122918, +0.484122918275927110647408174972799951354115213);
Chris@42 236 DVK(KP866025403, +0.866025403784438646763723170752936183471402627);
Chris@42 237 DVK(KP509036960, +0.509036960455127183450980863393907648510733164);
Chris@42 238 DVK(KP823639103, +0.823639103546331925877420039278190003029660514);
Chris@42 239 DVK(KP951056516, +0.951056516295153572116439333379382143405698634);
Chris@42 240 DVK(KP587785252, +0.587785252292473129168705954639072768597652438);
Chris@42 241 DVK(KP250000000, +0.250000000000000000000000000000000000000000000);
Chris@42 242 DVK(KP559016994, +0.559016994374947424102293417182819058860154590);
Chris@42 243 DVK(KP500000000, +0.500000000000000000000000000000000000000000000);
Chris@42 244 {
Chris@42 245 INT m;
Chris@42 246 R *x;
/* Every load and store in the loop goes through x, which starts at ii. */
Chris@42 247 x = ii;
/*
 * One pass per VL values of m. The increment clause also advances x and W
 * and evaluates MAKE_VOLATILE_STRIDE(15, rs), a macro defined elsewhere.
 */
Chris@42 248 for (m = mb, W = W + (mb * ((TWVL / VL) * 28)); m < me; m = m + VL, x = x + (VL * ms), W = W + (TWVL * 28), MAKE_VOLATILE_STRIDE(15, rs)) {
Chris@42 249 V Ts, TV, T1f, TZ, T10, Tb, Tm, Tt, T1j, T1k, T1l, TI, TM, TR, Tz;
Chris@42 250 V TD, TQ, T1g, T1h, T1i;
/* Points 0, 5 and 10: x[0] is used as loaded, the other two go through BYTW. */
Chris@42 251 {
Chris@42 252 V TT, Tr, Tp, Tq, To, TU;
Chris@42 253 TT = LD(&(x[0]), ms, &(x[0]));
Chris@42 254 Tq = LD(&(x[WS(rs, 10)]), ms, &(x[0]));
Chris@42 255 Tr = BYTW(&(W[TWVL * 18]), Tq);
Chris@42 256 To = LD(&(x[WS(rs, 5)]), ms, &(x[WS(rs, 1)]));
Chris@42 257 Tp = BYTW(&(W[TWVL * 8]), To);
Chris@42 258 Ts = VSUB(Tp, Tr);
Chris@42 259 TU = VADD(Tp, Tr);
Chris@42 260 TV = VFNMS(LDK(KP500000000), TU, TT);
Chris@42 261 T1f = VADD(TT, TU);
Chris@42 262 }
/*
 * Remaining twelve points: each is loaded, passed through BYTW, and combined
 * into the sums and differences consumed by the two output groups below.
 */
Chris@42 263 {
Chris@42 264 V Tx, TG, TK, TB, T5, Ty, Tg, TH, Tl, TL, Ta, TC;
Chris@42 265 {
Chris@42 266 V Tw, TF, TJ, TA;
Chris@42 267 Tw = LD(&(x[WS(rs, 3)]), ms, &(x[WS(rs, 1)]));
Chris@42 268 Tx = BYTW(&(W[TWVL * 4]), Tw);
Chris@42 269 TF = LD(&(x[WS(rs, 6)]), ms, &(x[0]));
Chris@42 270 TG = BYTW(&(W[TWVL * 10]), TF);
Chris@42 271 TJ = LD(&(x[WS(rs, 9)]), ms, &(x[WS(rs, 1)]));
Chris@42 272 TK = BYTW(&(W[TWVL * 16]), TJ);
Chris@42 273 TA = LD(&(x[WS(rs, 12)]), ms, &(x[0]));
Chris@42 274 TB = BYTW(&(W[TWVL * 22]), TA);
Chris@42 275 }
Chris@42 276 {
Chris@42 277 V T2, T4, T1, T3;
Chris@42 278 T1 = LD(&(x[WS(rs, 8)]), ms, &(x[0]));
Chris@42 279 T2 = BYTW(&(W[TWVL * 14]), T1);
Chris@42 280 T3 = LD(&(x[WS(rs, 13)]), ms, &(x[WS(rs, 1)]));
Chris@42 281 T4 = BYTW(&(W[TWVL * 24]), T3);
Chris@42 282 T5 = VSUB(T2, T4);
Chris@42 283 Ty = VADD(T2, T4);
Chris@42 284 }
Chris@42 285 {
Chris@42 286 V Td, Tf, Tc, Te;
Chris@42 287 Tc = LD(&(x[WS(rs, 11)]), ms, &(x[WS(rs, 1)]));
Chris@42 288 Td = BYTW(&(W[TWVL * 20]), Tc);
Chris@42 289 Te = LD(&(x[WS(rs, 1)]), ms, &(x[WS(rs, 1)]));
Chris@42 290 Tf = BYTW(&(W[0]), Te);
Chris@42 291 Tg = VSUB(Td, Tf);
Chris@42 292 TH = VADD(Td, Tf);
Chris@42 293 }
Chris@42 294 {
Chris@42 295 V Ti, Tk, Th, Tj;
Chris@42 296 Th = LD(&(x[WS(rs, 14)]), ms, &(x[0]));
Chris@42 297 Ti = BYTW(&(W[TWVL * 26]), Th);
Chris@42 298 Tj = LD(&(x[WS(rs, 4)]), ms, &(x[0]));
Chris@42 299 Tk = BYTW(&(W[TWVL * 6]), Tj);
Chris@42 300 Tl = VSUB(Ti, Tk);
Chris@42 301 TL = VADD(Ti, Tk);
Chris@42 302 }
Chris@42 303 {
Chris@42 304 V T7, T9, T6, T8;
Chris@42 305 T6 = LD(&(x[WS(rs, 2)]), ms, &(x[0]));
Chris@42 306 T7 = BYTW(&(W[TWVL * 2]), T6);
Chris@42 307 T8 = LD(&(x[WS(rs, 7)]), ms, &(x[WS(rs, 1)]));
Chris@42 308 T9 = BYTW(&(W[TWVL * 12]), T8);
Chris@42 309 Ta = VSUB(T7, T9);
Chris@42 310 TC = VADD(T7, T9);
Chris@42 311 }
Chris@42 312 TZ = VSUB(T5, Ta);
Chris@42 313 T10 = VSUB(Tg, Tl);
Chris@42 314 Tb = VADD(T5, Ta);
Chris@42 315 Tm = VADD(Tg, Tl);
Chris@42 316 Tt = VADD(Tb, Tm);
Chris@42 317 T1j = VADD(TG, TH);
Chris@42 318 T1k = VADD(TK, TL);
Chris@42 319 T1l = VADD(T1j, T1k);
Chris@42 320 TI = VFNMS(LDK(KP500000000), TH, TG);
Chris@42 321 TM = VFNMS(LDK(KP500000000), TL, TK);
Chris@42 322 TR = VADD(TI, TM);
Chris@42 323 Tz = VFNMS(LDK(KP500000000), Ty, Tx);
Chris@42 324 TD = VFNMS(LDK(KP500000000), TC, TB);
Chris@42 325 TQ = VADD(Tz, TD);
Chris@42 326 T1g = VADD(Tx, Ty);
Chris@42 327 T1h = VADD(TB, TC);
Chris@42 328 T1i = VADD(T1g, T1h);
Chris@42 329 }
/* First output group: stores to slots 0, 3, 6, 9 and 12 of x. */
Chris@42 330 {
Chris@42 331 V T1o, T1m, T1n, T1s, T1t, T1q, T1r, T1u, T1p;
Chris@42 332 T1o = VMUL(LDK(KP559016994), VSUB(T1i, T1l));
Chris@42 333 T1m = VADD(T1i, T1l);
Chris@42 334 T1n = VFNMS(LDK(KP250000000), T1m, T1f);
Chris@42 335 T1q = VSUB(T1g, T1h);
Chris@42 336 T1r = VSUB(T1j, T1k);
Chris@42 337 T1s = VBYI(VFNMS(LDK(KP951056516), T1r, VMUL(LDK(KP587785252), T1q)));
Chris@42 338 T1t = VBYI(VFMA(LDK(KP951056516), T1q, VMUL(LDK(KP587785252), T1r)));
Chris@42 339 ST(&(x[0]), VADD(T1f, T1m), ms, &(x[0]));
Chris@42 340 T1u = VADD(T1o, T1n);
Chris@42 341 ST(&(x[WS(rs, 6)]), VADD(T1t, T1u), ms, &(x[0]));
Chris@42 342 ST(&(x[WS(rs, 9)]), VSUB(T1u, T1t), ms, &(x[WS(rs, 1)]));
Chris@42 343 T1p = VSUB(T1n, T1o);
Chris@42 344 ST(&(x[WS(rs, 3)]), VSUB(T1p, T1s), ms, &(x[WS(rs, 1)]));
Chris@42 345 ST(&(x[WS(rs, 12)]), VADD(T1s, T1p), ms, &(x[0]));
Chris@42 346 }
/* Second output group: stores to the remaining ten slots of x. */
Chris@42 347 {
Chris@42 348 V T11, T18, T1e, TO, T16, Tv, T15, TY, T1d, T19, TE, TN;
Chris@42 349 T11 = VFMA(LDK(KP823639103), TZ, VMUL(LDK(KP509036960), T10));
Chris@42 350 T18 = VFNMS(LDK(KP823639103), T10, VMUL(LDK(KP509036960), TZ));
Chris@42 351 T1e = VBYI(VMUL(LDK(KP866025403), VADD(Ts, Tt)));
Chris@42 352 TE = VSUB(Tz, TD);
Chris@42 353 TN = VSUB(TI, TM);
Chris@42 354 TO = VFMA(LDK(KP951056516), TE, VMUL(LDK(KP587785252), TN));
Chris@42 355 T16 = VFNMS(LDK(KP951056516), TN, VMUL(LDK(KP587785252), TE));
Chris@42 356 {
Chris@42 357 V Tn, Tu, TS, TW, TX;
Chris@42 358 Tn = VMUL(LDK(KP484122918), VSUB(Tb, Tm));
Chris@42 359 Tu = VFNMS(LDK(KP216506350), Tt, VMUL(LDK(KP866025403), Ts));
Chris@42 360 Tv = VADD(Tn, Tu);
Chris@42 361 T15 = VSUB(Tn, Tu);
Chris@42 362 TS = VMUL(LDK(KP559016994), VSUB(TQ, TR));
Chris@42 363 TW = VADD(TQ, TR);
Chris@42 364 TX = VFNMS(LDK(KP250000000), TW, TV);
Chris@42 365 TY = VADD(TS, TX);
Chris@42 366 T1d = VADD(TV, TW);
Chris@42 367 T19 = VSUB(TX, TS);
Chris@42 368 }
Chris@42 369 {
Chris@42 370 V TP, T12, T1b, T1c;
Chris@42 371 ST(&(x[WS(rs, 5)]), VSUB(T1d, T1e), ms, &(x[WS(rs, 1)]));
Chris@42 372 ST(&(x[WS(rs, 10)]), VADD(T1e, T1d), ms, &(x[0]));
Chris@42 373 TP = VBYI(VADD(Tv, TO));
Chris@42 374 T12 = VSUB(TY, T11);
Chris@42 375 ST(&(x[WS(rs, 1)]), VADD(TP, T12), ms, &(x[WS(rs, 1)]));
Chris@42 376 ST(&(x[WS(rs, 14)]), VSUB(T12, TP), ms, &(x[0]));
Chris@42 377 T1b = VBYI(VSUB(T16, T15));
Chris@42 378 T1c = VSUB(T19, T18);
Chris@42 379 ST(&(x[WS(rs, 7)]), VADD(T1b, T1c), ms, &(x[WS(rs, 1)]));
Chris@42 380 ST(&(x[WS(rs, 8)]), VSUB(T1c, T1b), ms, &(x[0]));
Chris@42 381 {
Chris@42 382 V T17, T1a, T13, T14;
Chris@42 383 T17 = VBYI(VADD(T15, T16));
Chris@42 384 T1a = VADD(T18, T19);
Chris@42 385 ST(&(x[WS(rs, 2)]), VADD(T17, T1a), ms, &(x[0]));
Chris@42 386 ST(&(x[WS(rs, 13)]), VSUB(T1a, T17), ms, &(x[WS(rs, 1)]));
Chris@42 387 T13 = VBYI(VSUB(Tv, TO));
Chris@42 388 T14 = VADD(T11, TY);
Chris@42 389 ST(&(x[WS(rs, 4)]), VADD(T13, T14), ms, &(x[0]));
Chris@42 390 ST(&(x[WS(rs, 11)]), VSUB(T14, T13), ms, &(x[WS(rs, 1)]));
Chris@42 391 }
Chris@42 392 }
Chris@42 393 }
Chris@42 394 }
Chris@42 395 }
/* VLEAVE() is invoked once after the loop; the macro is defined elsewhere. */
Chris@42 396 VLEAVE();
Chris@42 397 }
Chris@42 398
/*
 * Twiddle instruction list referenced by desc below (non-FMA variant):
 * one VTW(0, k) entry for each k = 1..14, closed by a {TW_NEXT, VL, 0} entry.
 * NOTE(review): VTW and TW_NEXT are defined outside this file; presumably
 * each VTW entry describes the twiddle consumed for input k and TW_NEXT
 * ends the list -- confirm against their definitions.
 */
Chris@42 399 static const tw_instr twinstr[] = {
Chris@42 400 VTW(0, 1),
Chris@42 401 VTW(0, 2),
Chris@42 402 VTW(0, 3),
Chris@42 403 VTW(0, 4),
Chris@42 404 VTW(0, 5),
Chris@42 405 VTW(0, 6),
Chris@42 406 VTW(0, 7),
Chris@42 407 VTW(0, 8),
Chris@42 408 VTW(0, 9),
Chris@42 409 VTW(0, 10),
Chris@42 410 VTW(0, 11),
Chris@42 411 VTW(0, 12),
Chris@42 412 VTW(0, 13),
Chris@42 413 VTW(0, 14),
Chris@42 414 {TW_NEXT, VL, 0}
Chris@42 415 };
Chris@42 416
/*
 * Descriptor passed to the registration call below: 15, the codelet name
 * string, the twinstr table, &GENUS, and the counts {78, 39, 14, 0}, whose
 * first three values match the "78 additions, 39 multiplications, 14 fused
 * multiply/add" figures in the generated header comment of this variant.
 * NOTE(review): ct_desc is declared elsewhere; the meaning of the 15, of the
 * fourth count and of the three trailing zeros should be confirmed there.
 */
Chris@42 417 static const ct_desc desc = { 15, XSIMD_STRING("t1bv_15"), twinstr, &GENUS, {78, 39, 14, 0}, 0, 0, 0 };
Chris@42 418
/*
 * Registration entry point (non-FMA variant): hands t1bv_15 and its
 * descriptor desc to planner p through X(kdft_dit_register).
 * NOTE(review): XSIMD() and X() are macros defined elsewhere; presumably
 * they decorate the symbol names -- confirm.
 */
Chris@42 419 void XSIMD(codelet_t1bv_15) (planner *p) {
Chris@42 420 X(kdft_dit_register) (p, t1bv_15, &desc);
Chris@42 421 }
Chris@42 422 #endif /* HAVE_FMA */