annotate src/fftw-3.3.5/dft/simd/common/t1fv_15.c @ 83:ae30d91d2ffe

Replace these with versions built using an older toolset (so as to avoid ABI incompatibilities when linking on Ubuntu 14.04 for packaging purposes)
author Chris Cannam
date Fri, 07 Feb 2020 11:51:13 +0000
parents 2cd0e3b3e1fd
children
/*
 * Copyright (c) 2003, 2007-14 Matteo Frigo
 * Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 *
 */

/* This file was automatically generated --- DO NOT EDIT */
/* Generated on Sat Jul 30 16:42:02 EDT 2016 */

#include "codelet-dft.h"

#ifdef HAVE_FMA

/* Generated by: ../../../genfft/gen_twiddle_c.native -fma -reorder-insns -schedule-for-pipeline -simd -compact -variables 4 -pipeline-latency 8 -n 15 -name t1fv_15 -include t1f.h */

/*
 * This function contains 92 FP additions, 77 FP multiplications,
 * (or, 50 additions, 35 multiplications, 42 fused multiply/add),
 * 81 stack variables, 8 constants, and 30 memory accesses
 */
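/*
 * Informal sketch of what this codelet computes: t1fv_15 is a SIMD
 * decimation-in-time twiddle codelet of radix 15 for the forward transform.
 * Each pass of its loop loads 15 complex values spaced rs apart, multiplies
 * them by precomputed twiddle factors taken from W (the BYTWJ operations),
 * computes a size-15 DFT of the twiddled values, roughly
 *
 *     X[k] = sum_{p=0..14} w[p] * x[p] * exp(-2*pi*i*p*k/15),
 *
 * and stores the 15 results back in place.  The outer loop advances m by the
 * SIMD vector length VL, so VL such transforms are processed per pass; the
 * exact twiddle layout and sign conventions are defined in t1f.h and the
 * SIMD support headers, not restated here.
 */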
#include "t1f.h"

static void t1fv_15(R *ri, R *ii, const R *W, stride rs, INT mb, INT me, INT ms)
{
     DVK(KP823639103, +0.823639103546331925877420039278190003029660514);
     DVK(KP910592997, +0.910592997310029334643087372129977886038870291);
     DVK(KP559016994, +0.559016994374947424102293417182819058860154590);
     DVK(KP951056516, +0.951056516295153572116439333379382143405698634);
     DVK(KP866025403, +0.866025403784438646763723170752936183471402627);
     DVK(KP250000000, +0.250000000000000000000000000000000000000000000);
     DVK(KP618033988, +0.618033988749894848204586834365638117720309180);
     DVK(KP500000000, +0.500000000000000000000000000000000000000000000);
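     /* Informally, several of these constants have simple closed forms tied to
        the 15 = 3 * 5 factorization: KP500000000 = 1/2, KP250000000 = 1/4,
        KP866025403 = sqrt(3)/2, KP951056516 = sin(2*pi/5), KP559016994 =
        sqrt(5)/4, and KP618033988 = (sqrt(5) - 1)/2.  KP910592997 and
        KP823639103 appear to be derived constants emitted by genfft for the
        FMA-oriented schedule. */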
     {
          INT m;
          R *x;
          x = ri;
          for (m = mb, W = W + (mb * ((TWVL / VL) * 28)); m < me; m = m + VL, x = x + (VL * ms), W = W + (TWVL * 28), MAKE_VOLATILE_STRIDE(15, rs)) {
               V Tq, Ty, Th, T1b, T10, Ts, TP, T7, Tu, TA, TC, Tj, Tk, TQ, Tf;
               {
                    V T1, T4, T2, T9, Te;
                    T1 = LD(&(x[0]), ms, &(x[0]));
                    T4 = LD(&(x[WS(rs, 10)]), ms, &(x[0]));
                    T2 = LD(&(x[WS(rs, 5)]), ms, &(x[WS(rs, 1)]));
                    {
                         V T8, Tp, Tx, Tg;
                         T8 = LD(&(x[WS(rs, 3)]), ms, &(x[WS(rs, 1)]));
                         Tp = LD(&(x[WS(rs, 6)]), ms, &(x[0]));
                         Tx = LD(&(x[WS(rs, 9)]), ms, &(x[WS(rs, 1)]));
                         Tg = LD(&(x[WS(rs, 12)]), ms, &(x[0]));
                         {
                              V Tb, Td, Tr, T6, Tt, Tz, TB, Ti;
                              {
                                   V T5, T3, Ta, Tc;
                                   Ta = LD(&(x[WS(rs, 8)]), ms, &(x[0]));
                                   Tc = LD(&(x[WS(rs, 13)]), ms, &(x[WS(rs, 1)]));
                                   T5 = BYTWJ(&(W[TWVL * 18]), T4);
                                   T3 = BYTWJ(&(W[TWVL * 8]), T2);
                                   T9 = BYTWJ(&(W[TWVL * 4]), T8);
                                   Tq = BYTWJ(&(W[TWVL * 10]), Tp);
                                   Ty = BYTWJ(&(W[TWVL * 16]), Tx);
                                   Th = BYTWJ(&(W[TWVL * 22]), Tg);
                                   Tb = BYTWJ(&(W[TWVL * 14]), Ta);
                                   Td = BYTWJ(&(W[TWVL * 24]), Tc);
                                   Tr = LD(&(x[WS(rs, 11)]), ms, &(x[WS(rs, 1)]));
                                   T1b = VSUB(T5, T3);
                                   T6 = VADD(T3, T5);
                                   Tt = LD(&(x[WS(rs, 1)]), ms, &(x[WS(rs, 1)]));
                              }
                              Tz = LD(&(x[WS(rs, 14)]), ms, &(x[0]));
                              TB = LD(&(x[WS(rs, 4)]), ms, &(x[0]));
                              Ti = LD(&(x[WS(rs, 2)]), ms, &(x[0]));
                              Te = VADD(Tb, Td);
                              T10 = VSUB(Td, Tb);
                              Ts = BYTWJ(&(W[TWVL * 20]), Tr);
                              TP = VFNMS(LDK(KP500000000), T6, T1);
                              T7 = VADD(T1, T6);
                              Tu = BYTWJ(&(W[0]), Tt);
                              TA = BYTWJ(&(W[TWVL * 26]), Tz);
                              TC = BYTWJ(&(W[TWVL * 6]), TB);
                              Tj = BYTWJ(&(W[TWVL * 2]), Ti);
                              Tk = LD(&(x[WS(rs, 7)]), ms, &(x[WS(rs, 1)]));
                         }
                    }
                    TQ = VFNMS(LDK(KP500000000), Te, T9);
                    Tf = VADD(T9, Te);
               }
               {
                    V Tv, T13, TD, T14, Tl;
                    Tv = VADD(Ts, Tu);
                    T13 = VSUB(Tu, Ts);
                    TD = VADD(TA, TC);
                    T14 = VSUB(TC, TA);
                    Tl = BYTWJ(&(W[TWVL * 12]), Tk);
                    {
                         V TT, Tw, T1d, T15, TU, TE, T11, Tm;
                         TT = VFNMS(LDK(KP500000000), Tv, Tq);
                         Tw = VADD(Tq, Tv);
                         T1d = VADD(T13, T14);
                         T15 = VSUB(T13, T14);
                         TU = VFNMS(LDK(KP500000000), TD, Ty);
                         TE = VADD(Ty, TD);
                         T11 = VSUB(Tl, Tj);
                         Tm = VADD(Tj, Tl);
                         {
                              V T19, TV, TK, TF, T1c, T12, TR, Tn;
                              T19 = VSUB(TT, TU);
                              TV = VADD(TT, TU);
                              TK = VSUB(Tw, TE);
                              TF = VADD(Tw, TE);
                              T1c = VADD(T10, T11);
                              T12 = VSUB(T10, T11);
                              TR = VFNMS(LDK(KP500000000), Tm, Th);
                              Tn = VADD(Th, Tm);
                              {
                                   V T1g, T1e, T1m, T16, T18, TS, TL, To, T1f, T1u;
                                   T1g = VSUB(T1c, T1d);
                                   T1e = VADD(T1c, T1d);
                                   T1m = VFNMS(LDK(KP618033988), T12, T15);
                                   T16 = VFMA(LDK(KP618033988), T15, T12);
                                   T18 = VSUB(TQ, TR);
                                   TS = VADD(TQ, TR);
                                   TL = VSUB(Tf, Tn);
                                   To = VADD(Tf, Tn);
                                   T1f = VFNMS(LDK(KP250000000), T1e, T1b);
                                   T1u = VMUL(LDK(KP866025403), VADD(T1b, T1e));
                                   {
                                        V T1o, T1a, TY, TO, TM, TG, TI, T1p, T1h, T1t, TX, TW;
                                        T1o = VFNMS(LDK(KP618033988), T18, T19);
                                        T1a = VFMA(LDK(KP618033988), T19, T18);
                                        TW = VADD(TS, TV);
                                        TY = VSUB(TS, TV);
                                        TO = VMUL(LDK(KP951056516), VFMA(LDK(KP618033988), TK, TL));
                                        TM = VMUL(LDK(KP951056516), VFNMS(LDK(KP618033988), TL, TK));
                                        TG = VADD(To, TF);
                                        TI = VSUB(To, TF);
                                        T1p = VFNMS(LDK(KP559016994), T1g, T1f);
                                        T1h = VFMA(LDK(KP559016994), T1g, T1f);
                                        T1t = VADD(TP, TW);
                                        TX = VFNMS(LDK(KP250000000), TW, TP);
                                        {
                                             V T1q, T1s, T1k, T1i, T1l, TZ, TJ, TN, TH;
                                             ST(&(x[0]), VADD(T7, TG), ms, &(x[0]));
                                             TH = VFNMS(LDK(KP250000000), TG, T7);
                                             T1q = VMUL(LDK(KP951056516), VFNMS(LDK(KP910592997), T1p, T1o));
                                             T1s = VMUL(LDK(KP951056516), VFMA(LDK(KP910592997), T1p, T1o));
                                             T1k = VMUL(LDK(KP951056516), VFMA(LDK(KP910592997), T1h, T1a));
                                             T1i = VMUL(LDK(KP951056516), VFNMS(LDK(KP910592997), T1h, T1a));
                                             ST(&(x[WS(rs, 10)]), VFMAI(T1u, T1t), ms, &(x[0]));
                                             ST(&(x[WS(rs, 5)]), VFNMSI(T1u, T1t), ms, &(x[WS(rs, 1)]));
                                             T1l = VFNMS(LDK(KP559016994), TY, TX);
                                             TZ = VFMA(LDK(KP559016994), TY, TX);
                                             TJ = VFNMS(LDK(KP559016994), TI, TH);
                                             TN = VFMA(LDK(KP559016994), TI, TH);
                                             {
                                                  V T1n, T1r, T1j, T17;
                                                  T1n = VFMA(LDK(KP823639103), T1m, T1l);
                                                  T1r = VFNMS(LDK(KP823639103), T1m, T1l);
                                                  T1j = VFNMS(LDK(KP823639103), T16, TZ);
                                                  T17 = VFMA(LDK(KP823639103), T16, TZ);
                                                  ST(&(x[WS(rs, 12)]), VFMAI(TM, TJ), ms, &(x[0]));
                                                  ST(&(x[WS(rs, 3)]), VFNMSI(TM, TJ), ms, &(x[WS(rs, 1)]));
                                                  ST(&(x[WS(rs, 9)]), VFMAI(TO, TN), ms, &(x[WS(rs, 1)]));
                                                  ST(&(x[WS(rs, 6)]), VFNMSI(TO, TN), ms, &(x[0]));
                                                  ST(&(x[WS(rs, 2)]), VFMAI(T1q, T1n), ms, &(x[0]));
                                                  ST(&(x[WS(rs, 13)]), VFNMSI(T1q, T1n), ms, &(x[WS(rs, 1)]));
                                                  ST(&(x[WS(rs, 7)]), VFMAI(T1s, T1r), ms, &(x[WS(rs, 1)]));
                                                  ST(&(x[WS(rs, 8)]), VFNMSI(T1s, T1r), ms, &(x[0]));
                                                  ST(&(x[WS(rs, 4)]), VFMAI(T1k, T1j), ms, &(x[0]));
                                                  ST(&(x[WS(rs, 11)]), VFNMSI(T1k, T1j), ms, &(x[WS(rs, 1)]));
                                                  ST(&(x[WS(rs, 14)]), VFMAI(T1i, T17), ms, &(x[0]));
                                                  ST(&(x[WS(rs, 1)]), VFNMSI(T1i, T17), ms, &(x[WS(rs, 1)]));
                                             }
                                        }
                                   }
                              }
                         }
                    }
               }
          }
     }
     VLEAVE();
}

static const tw_instr twinstr[] = {
     VTW(0, 1),
     VTW(0, 2),
     VTW(0, 3),
     VTW(0, 4),
     VTW(0, 5),
     VTW(0, 6),
     VTW(0, 7),
     VTW(0, 8),
     VTW(0, 9),
     VTW(0, 10),
     VTW(0, 11),
     VTW(0, 12),
     VTW(0, 13),
     VTW(0, 14),
     {TW_NEXT, VL, 0}
};

static const ct_desc desc = { 15, XSIMD_STRING("t1fv_15"), twinstr, &GENUS, {50, 35, 42, 0}, 0, 0, 0 };

void XSIMD(codelet_t1fv_15) (planner *p) {
     X(kdft_dit_register) (p, t1fv_15, &desc);
}
#else /* HAVE_FMA */

/* Generated by: ../../../genfft/gen_twiddle_c.native -simd -compact -variables 4 -pipeline-latency 8 -n 15 -name t1fv_15 -include t1f.h */

/*
 * This function contains 92 FP additions, 53 FP multiplications,
 * (or, 78 additions, 39 multiplications, 14 fused multiply/add),
 * 52 stack variables, 10 constants, and 30 memory accesses
 */
#include "t1f.h"

static void t1fv_15(R *ri, R *ii, const R *W, stride rs, INT mb, INT me, INT ms)
{
     DVK(KP216506350, +0.216506350946109661690930792688234045867850657);
     DVK(KP484122918, +0.484122918275927110647408174972799951354115213);
     DVK(KP866025403, +0.866025403784438646763723170752936183471402627);
     DVK(KP509036960, +0.509036960455127183450980863393907648510733164);
     DVK(KP823639103, +0.823639103546331925877420039278190003029660514);
     DVK(KP587785252, +0.587785252292473129168705954639072768597652438);
     DVK(KP951056516, +0.951056516295153572116439333379382143405698634);
     DVK(KP250000000, +0.250000000000000000000000000000000000000000000);
     DVK(KP559016994, +0.559016994374947424102293417182819058860154590);
     DVK(KP500000000, +0.500000000000000000000000000000000000000000000);
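     /* In addition to the values noted in the FMA branch (1/2, 1/4, sqrt(3)/2,
        sin(2*pi/5), sqrt(5)/4), this variant uses KP587785252 = sin(pi/5),
        KP484122918 = sqrt(15)/8, and KP216506350 = sqrt(3)/8; KP509036960 and
        KP823639103 appear to be derived constants emitted by genfft. */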
     {
          INT m;
          R *x;
          x = ri;
          for (m = mb, W = W + (mb * ((TWVL / VL) * 28)); m < me; m = m + VL, x = x + (VL * ms), W = W + (TWVL * 28), MAKE_VOLATILE_STRIDE(15, rs)) {
               V T1e, T7, TP, T12, T15, Tf, Tn, To, T1b, T1c, T1f, TQ, TR, TS, Tw;
               V TE, TF, TT, TU, TV;
               {
                    V T1, T5, T3, T4, T2, T6;
                    T1 = LD(&(x[0]), ms, &(x[0]));
                    T4 = LD(&(x[WS(rs, 10)]), ms, &(x[0]));
                    T5 = BYTWJ(&(W[TWVL * 18]), T4);
                    T2 = LD(&(x[WS(rs, 5)]), ms, &(x[WS(rs, 1)]));
                    T3 = BYTWJ(&(W[TWVL * 8]), T2);
                    T1e = VSUB(T5, T3);
                    T6 = VADD(T3, T5);
                    T7 = VADD(T1, T6);
                    TP = VFNMS(LDK(KP500000000), T6, T1);
               }
               {
                    V T9, Tq, Ty, Th, Te, T13, Tv, T10, TD, T11, Tm, T14;
                    {
                         V T8, Tp, Tx, Tg;
                         T8 = LD(&(x[WS(rs, 3)]), ms, &(x[WS(rs, 1)]));
                         T9 = BYTWJ(&(W[TWVL * 4]), T8);
                         Tp = LD(&(x[WS(rs, 6)]), ms, &(x[0]));
                         Tq = BYTWJ(&(W[TWVL * 10]), Tp);
                         Tx = LD(&(x[WS(rs, 9)]), ms, &(x[WS(rs, 1)]));
                         Ty = BYTWJ(&(W[TWVL * 16]), Tx);
                         Tg = LD(&(x[WS(rs, 12)]), ms, &(x[0]));
                         Th = BYTWJ(&(W[TWVL * 22]), Tg);
                    }
                    {
                         V Tb, Td, Ta, Tc;
                         Ta = LD(&(x[WS(rs, 8)]), ms, &(x[0]));
                         Tb = BYTWJ(&(W[TWVL * 14]), Ta);
                         Tc = LD(&(x[WS(rs, 13)]), ms, &(x[WS(rs, 1)]));
                         Td = BYTWJ(&(W[TWVL * 24]), Tc);
                         Te = VADD(Tb, Td);
                         T13 = VSUB(Td, Tb);
                    }
                    {
                         V Ts, Tu, Tr, Tt;
                         Tr = LD(&(x[WS(rs, 11)]), ms, &(x[WS(rs, 1)]));
                         Ts = BYTWJ(&(W[TWVL * 20]), Tr);
                         Tt = LD(&(x[WS(rs, 1)]), ms, &(x[WS(rs, 1)]));
                         Tu = BYTWJ(&(W[0]), Tt);
                         Tv = VADD(Ts, Tu);
                         T10 = VSUB(Tu, Ts);
                    }
                    {
                         V TA, TC, Tz, TB;
                         Tz = LD(&(x[WS(rs, 14)]), ms, &(x[0]));
                         TA = BYTWJ(&(W[TWVL * 26]), Tz);
                         TB = LD(&(x[WS(rs, 4)]), ms, &(x[0]));
                         TC = BYTWJ(&(W[TWVL * 6]), TB);
                         TD = VADD(TA, TC);
                         T11 = VSUB(TC, TA);
                    }
                    {
                         V Tj, Tl, Ti, Tk;
                         Ti = LD(&(x[WS(rs, 2)]), ms, &(x[0]));
                         Tj = BYTWJ(&(W[TWVL * 2]), Ti);
                         Tk = LD(&(x[WS(rs, 7)]), ms, &(x[WS(rs, 1)]));
                         Tl = BYTWJ(&(W[TWVL * 12]), Tk);
                         Tm = VADD(Tj, Tl);
                         T14 = VSUB(Tl, Tj);
                    }
                    T12 = VSUB(T10, T11);
                    T15 = VSUB(T13, T14);
                    Tf = VADD(T9, Te);
                    Tn = VADD(Th, Tm);
                    To = VADD(Tf, Tn);
                    T1b = VADD(T13, T14);
                    T1c = VADD(T10, T11);
                    T1f = VADD(T1b, T1c);
                    TQ = VFNMS(LDK(KP500000000), Te, T9);
                    TR = VFNMS(LDK(KP500000000), Tm, Th);
                    TS = VADD(TQ, TR);
                    Tw = VADD(Tq, Tv);
                    TE = VADD(Ty, TD);
                    TF = VADD(Tw, TE);
                    TT = VFNMS(LDK(KP500000000), Tv, Tq);
                    TU = VFNMS(LDK(KP500000000), TD, Ty);
                    TV = VADD(TT, TU);
               }
               {
                    V TI, TG, TH, TM, TO, TK, TL, TN, TJ;
                    TI = VMUL(LDK(KP559016994), VSUB(To, TF));
                    TG = VADD(To, TF);
                    TH = VFNMS(LDK(KP250000000), TG, T7);
                    TK = VSUB(Tw, TE);
                    TL = VSUB(Tf, Tn);
                    TM = VBYI(VFNMS(LDK(KP587785252), TL, VMUL(LDK(KP951056516), TK)));
                    TO = VBYI(VFMA(LDK(KP951056516), TL, VMUL(LDK(KP587785252), TK)));
                    ST(&(x[0]), VADD(T7, TG), ms, &(x[0]));
                    TN = VADD(TI, TH);
                    ST(&(x[WS(rs, 6)]), VSUB(TN, TO), ms, &(x[0]));
                    ST(&(x[WS(rs, 9)]), VADD(TO, TN), ms, &(x[WS(rs, 1)]));
                    TJ = VSUB(TH, TI);
                    ST(&(x[WS(rs, 3)]), VSUB(TJ, TM), ms, &(x[WS(rs, 1)]));
                    ST(&(x[WS(rs, 12)]), VADD(TM, TJ), ms, &(x[0]));
               }
               {
                    V T16, T1m, T1u, T1h, T1o, T1a, T1p, TZ, T1t, T1l, T1d, T1g;
                    T16 = VFNMS(LDK(KP509036960), T15, VMUL(LDK(KP823639103), T12));
                    T1m = VFMA(LDK(KP823639103), T15, VMUL(LDK(KP509036960), T12));
                    T1u = VBYI(VMUL(LDK(KP866025403), VADD(T1e, T1f)));
                    T1d = VMUL(LDK(KP484122918), VSUB(T1b, T1c));
                    T1g = VFNMS(LDK(KP216506350), T1f, VMUL(LDK(KP866025403), T1e));
                    T1h = VSUB(T1d, T1g);
                    T1o = VADD(T1d, T1g);
                    {
                         V T18, T19, TY, TW, TX;
                         T18 = VSUB(TT, TU);
                         T19 = VSUB(TQ, TR);
                         T1a = VFNMS(LDK(KP587785252), T19, VMUL(LDK(KP951056516), T18));
                         T1p = VFMA(LDK(KP951056516), T19, VMUL(LDK(KP587785252), T18));
                         TY = VMUL(LDK(KP559016994), VSUB(TS, TV));
                         TW = VADD(TS, TV);
                         TX = VFNMS(LDK(KP250000000), TW, TP);
                         TZ = VSUB(TX, TY);
                         T1t = VADD(TP, TW);
                         T1l = VADD(TY, TX);
                    }
                    {
                         V T17, T1i, T1r, T1s;
                         ST(&(x[WS(rs, 5)]), VSUB(T1t, T1u), ms, &(x[WS(rs, 1)]));
                         ST(&(x[WS(rs, 10)]), VADD(T1t, T1u), ms, &(x[0]));
                         T17 = VSUB(TZ, T16);
                         T1i = VBYI(VSUB(T1a, T1h));
                         ST(&(x[WS(rs, 8)]), VSUB(T17, T1i), ms, &(x[0]));
                         ST(&(x[WS(rs, 7)]), VADD(T17, T1i), ms, &(x[WS(rs, 1)]));
                         T1r = VSUB(T1l, T1m);
                         T1s = VBYI(VADD(T1p, T1o));
                         ST(&(x[WS(rs, 11)]), VSUB(T1r, T1s), ms, &(x[WS(rs, 1)]));
                         ST(&(x[WS(rs, 4)]), VADD(T1r, T1s), ms, &(x[0]));
                         {
                              V T1n, T1q, T1j, T1k;
                              T1n = VADD(T1l, T1m);
                              T1q = VBYI(VSUB(T1o, T1p));
                              ST(&(x[WS(rs, 14)]), VSUB(T1n, T1q), ms, &(x[0]));
                              ST(&(x[WS(rs, 1)]), VADD(T1n, T1q), ms, &(x[WS(rs, 1)]));
                              T1j = VADD(TZ, T16);
                              T1k = VBYI(VADD(T1a, T1h));
                              ST(&(x[WS(rs, 13)]), VSUB(T1j, T1k), ms, &(x[WS(rs, 1)]));
                              ST(&(x[WS(rs, 2)]), VADD(T1j, T1k), ms, &(x[0]));
                         }
                    }
               }
          }
     }
     VLEAVE();
}

static const tw_instr twinstr[] = {
     VTW(0, 1),
     VTW(0, 2),
     VTW(0, 3),
     VTW(0, 4),
     VTW(0, 5),
     VTW(0, 6),
     VTW(0, 7),
     VTW(0, 8),
     VTW(0, 9),
     VTW(0, 10),
     VTW(0, 11),
     VTW(0, 12),
     VTW(0, 13),
     VTW(0, 14),
     {TW_NEXT, VL, 0}
};

static const ct_desc desc = { 15, XSIMD_STRING("t1fv_15"), twinstr, &GENUS, {78, 39, 14, 0}, 0, 0, 0 };

void XSIMD(codelet_t1fv_15) (planner *p) {
     X(kdft_dit_register) (p, t1fv_15, &desc);
}
#endif /* HAVE_FMA */
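
Applications never call this codelet directly: the planner registers it via X(kdft_dit_register) and may select it as the radix-15 twiddle step of a larger decimation-in-time plan. The following is a minimal sketch of how such a transform is requested through FFTW's public API, assuming a transform size whose factorization contains 15 (960 = 15 * 64, chosen arbitrarily for illustration); whether the planner actually uses t1fv_15 depends on the build's SIMD support and its planning heuristics.

#include <fftw3.h>

int main(void)
{
     const int n = 960;  /* 960 = 15 * 64, so a radix-15 twiddle step is possible */
     fftw_complex *in = fftw_malloc(sizeof(fftw_complex) * n);
     fftw_complex *out = fftw_malloc(sizeof(fftw_complex) * n);

     /* Plan a forward 1-D complex DFT; codelet selection is up to the planner. */
     fftw_plan p = fftw_plan_dft_1d(n, in, out, FFTW_FORWARD, FFTW_ESTIMATE);

     for (int i = 0; i < n; ++i) {  /* fill in a trivial test signal */
          in[i][0] = (double) i;
          in[i][1] = 0.0;
     }

     fftw_execute(p);

     fftw_destroy_plan(p);
     fftw_free(in);
     fftw_free(out);
     return 0;
}

Compile and link against the installed library in the usual way, e.g. cc example.c -lfftw3 -lm.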