annotate src/fftw-3.3.5/dft/simd/common/t1sv_8.c @ 83:ae30d91d2ffe

Replace these with versions built using an older toolset (so as to avoid ABI incompatibilities when linking on Ubuntu 14.04 for packaging purposes)
author Chris Cannam
date Fri, 07 Feb 2020 11:51:13 +0000
parents 2cd0e3b3e1fd
children
/*
 * Copyright (c) 2003, 2007-14 Matteo Frigo
 * Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 *
 */

/* This file was automatically generated --- DO NOT EDIT */
/* Generated on Sat Jul 30 16:45:05 EDT 2016 */

#include "codelet-dft.h"

#ifdef HAVE_FMA

/* Generated by: ../../../genfft/gen_twiddle.native -fma -reorder-insns -schedule-for-pipeline -simd -compact -variables 4 -pipeline-latency 8 -n 8 -name t1sv_8 -include ts.h */

/*
 * This function contains 66 FP additions, 36 FP multiplications,
 * (or, 44 additions, 14 multiplications, 22 fused multiply/add),
 * 59 stack variables, 1 constants, and 32 memory accesses
 */
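/*
 * A quick consistency check on the figures above (assuming each fused
 * multiply/add is counted once toward both combined totals):
 *
 *   additions:       44 + 22 = 66
 *   multiplications: 14 + 22 = 36
 */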
#include "ts.h"

static void t1sv_8(R *ri, R *ii, const R *W, stride rs, INT mb, INT me, INT ms)
{
    DVK(KP707106781, +0.707106781186547524400844362104849039284835938);
    {
        INT m;
        for (m = mb, W = W + (mb * 14); m < me; m = m + (2 * VL), ri = ri + ((2 * VL) * ms), ii = ii + ((2 * VL) * ms), W = W + ((2 * VL) * 14), MAKE_VOLATILE_STRIDE(16, rs)) {
            V T1, T1m, T1l, T7, TS, Tk, TQ, Te, To, Tr, Tu, T14, TF, Tx, T16;
            V TL, Tt, TW, Tp, Tq, Tw;
            {
                V T3, T6, T2, T5;
                T1 = LD(&(ri[0]), ms, &(ri[0]));
                T1m = LD(&(ii[0]), ms, &(ii[0]));
                T3 = LD(&(ri[WS(rs, 4)]), ms, &(ri[0]));
                T6 = LD(&(ii[WS(rs, 4)]), ms, &(ii[0]));
                T2 = LDW(&(W[TWVL * 6]));
                T5 = LDW(&(W[TWVL * 7]));
                {
                    V Tg, Tj, Ti, Ta, Td, T1k, T4, T9, Tc, TR, Th, Tf;
                    Tg = LD(&(ri[WS(rs, 6)]), ms, &(ri[0]));
                    Tj = LD(&(ii[WS(rs, 6)]), ms, &(ii[0]));
                    Tf = LDW(&(W[TWVL * 10]));
                    Ti = LDW(&(W[TWVL * 11]));
                    Ta = LD(&(ri[WS(rs, 2)]), ms, &(ri[0]));
                    Td = LD(&(ii[WS(rs, 2)]), ms, &(ii[0]));
                    T1k = VMUL(T2, T6);
                    T4 = VMUL(T2, T3);
                    T9 = LDW(&(W[TWVL * 2]));
                    Tc = LDW(&(W[TWVL * 3]));
                    TR = VMUL(Tf, Tj);
                    Th = VMUL(Tf, Tg);
                    {
                        V TB, TE, TH, TK, TG, TD, TJ, T13, TC, TA, TP, Tb, T15, TI, Tn;
                        TB = LD(&(ri[WS(rs, 7)]), ms, &(ri[WS(rs, 1)]));
                        TE = LD(&(ii[WS(rs, 7)]), ms, &(ii[WS(rs, 1)]));
                        T1l = VFNMS(T5, T3, T1k);
                        T7 = VFMA(T5, T6, T4);
                        TP = VMUL(T9, Td);
                        Tb = VMUL(T9, Ta);
                        TS = VFNMS(Ti, Tg, TR);
                        Tk = VFMA(Ti, Tj, Th);
                        TA = LDW(&(W[TWVL * 12]));
                        TH = LD(&(ri[WS(rs, 3)]), ms, &(ri[WS(rs, 1)]));
                        TK = LD(&(ii[WS(rs, 3)]), ms, &(ii[WS(rs, 1)]));
                        TG = LDW(&(W[TWVL * 4]));
                        TQ = VFNMS(Tc, Ta, TP);
                        Te = VFMA(Tc, Td, Tb);
                        TD = LDW(&(W[TWVL * 13]));
                        TJ = LDW(&(W[TWVL * 5]));
                        T13 = VMUL(TA, TE);
                        TC = VMUL(TA, TB);
                        To = LD(&(ri[WS(rs, 1)]), ms, &(ri[WS(rs, 1)]));
                        T15 = VMUL(TG, TK);
                        TI = VMUL(TG, TH);
                        Tr = LD(&(ii[WS(rs, 1)]), ms, &(ii[WS(rs, 1)]));
                        Tn = LDW(&(W[0]));
                        Tu = LD(&(ri[WS(rs, 5)]), ms, &(ri[WS(rs, 1)]));
                        T14 = VFNMS(TD, TB, T13);
                        TF = VFMA(TD, TE, TC);
                        Tx = LD(&(ii[WS(rs, 5)]), ms, &(ii[WS(rs, 1)]));
                        T16 = VFNMS(TJ, TH, T15);
                        TL = VFMA(TJ, TK, TI);
                        Tt = LDW(&(W[TWVL * 8]));
                        TW = VMUL(Tn, Tr);
                        Tp = VMUL(Tn, To);
                        Tq = LDW(&(W[TWVL * 1]));
                        Tw = LDW(&(W[TWVL * 9]));
                    }
                }
            }
            {
                V T8, T1g, TM, T1j, TX, Ts, T1n, T1r, T1s, Tl, T1c, T18, TZ, Ty, T1a;
                V TU;
                {
                    V TO, T17, T12, TY, Tv, TT;
                    T8 = VADD(T1, T7);
                    TO = VSUB(T1, T7);
                    T17 = VSUB(T14, T16);
                    T1g = VADD(T14, T16);
                    TM = VADD(TF, TL);
                    T12 = VSUB(TF, TL);
                    TY = VMUL(Tt, Tx);
                    Tv = VMUL(Tt, Tu);
                    TT = VSUB(TQ, TS);
                    T1j = VADD(TQ, TS);
                    TX = VFNMS(Tq, To, TW);
                    Ts = VFMA(Tq, Tr, Tp);
                    T1n = VADD(T1l, T1m);
                    T1r = VSUB(T1m, T1l);
                    T1s = VSUB(Te, Tk);
                    Tl = VADD(Te, Tk);
                    T1c = VADD(T12, T17);
                    T18 = VSUB(T12, T17);
                    TZ = VFNMS(Tw, Tu, TY);
                    Ty = VFMA(Tw, Tx, Tv);
                    T1a = VSUB(TO, TT);
                    TU = VADD(TO, TT);
                }
                {
                    V T1v, T1t, Tm, T1e, T1o, T1q, TN, T1p, T1d, T1u, T19, T1w, T1i, T1h;
                    {
                        V T10, T1f, Tz, TV, T11, T1b;
                        T1v = VADD(T1s, T1r);
                        T1t = VSUB(T1r, T1s);
                        T10 = VSUB(TX, TZ);
                        T1f = VADD(TX, TZ);
                        Tz = VADD(Ts, Ty);
                        TV = VSUB(Ts, Ty);
                        T11 = VADD(TV, T10);
                        T1b = VSUB(T10, TV);
                        Tm = VADD(T8, Tl);
                        T1e = VSUB(T8, Tl);
                        T1o = VADD(T1j, T1n);
                        T1q = VSUB(T1n, T1j);
                        TN = VADD(Tz, TM);
                        T1p = VSUB(TM, Tz);
                        T1d = VSUB(T1b, T1c);
                        T1u = VADD(T1b, T1c);
                        T19 = VADD(T11, T18);
                        T1w = VSUB(T18, T11);
                        T1i = VADD(T1f, T1g);
                        T1h = VSUB(T1f, T1g);
                    }
                    ST(&(ii[WS(rs, 6)]), VSUB(T1q, T1p), ms, &(ii[0]));
                    ST(&(ri[0]), VADD(Tm, TN), ms, &(ri[0]));
                    ST(&(ri[WS(rs, 4)]), VSUB(Tm, TN), ms, &(ri[0]));
                    ST(&(ii[WS(rs, 1)]), VFMA(LDK(KP707106781), T1u, T1t), ms, &(ii[WS(rs, 1)]));
                    ST(&(ii[WS(rs, 5)]), VFNMS(LDK(KP707106781), T1u, T1t), ms, &(ii[WS(rs, 1)]));
                    ST(&(ri[WS(rs, 3)]), VFMA(LDK(KP707106781), T1d, T1a), ms, &(ri[WS(rs, 1)]));
                    ST(&(ri[WS(rs, 7)]), VFNMS(LDK(KP707106781), T1d, T1a), ms, &(ri[WS(rs, 1)]));
                    ST(&(ii[WS(rs, 3)]), VFMA(LDK(KP707106781), T1w, T1v), ms, &(ii[WS(rs, 1)]));
                    ST(&(ii[WS(rs, 7)]), VFNMS(LDK(KP707106781), T1w, T1v), ms, &(ii[WS(rs, 1)]));
                    ST(&(ri[WS(rs, 1)]), VFMA(LDK(KP707106781), T19, TU), ms, &(ri[WS(rs, 1)]));
                    ST(&(ri[WS(rs, 5)]), VFNMS(LDK(KP707106781), T19, TU), ms, &(ri[WS(rs, 1)]));
                    ST(&(ri[WS(rs, 6)]), VSUB(T1e, T1h), ms, &(ri[0]));
                    ST(&(ii[0]), VADD(T1i, T1o), ms, &(ii[0]));
                    ST(&(ii[WS(rs, 4)]), VSUB(T1o, T1i), ms, &(ii[0]));
                    ST(&(ri[WS(rs, 2)]), VADD(T1e, T1h), ms, &(ri[0]));
                    ST(&(ii[WS(rs, 2)]), VADD(T1p, T1q), ms, &(ii[0]));
                }
            }
        }
    }
    VLEAVE();
}

static const tw_instr twinstr[] = {
    VTW(0, 1),
    VTW(0, 2),
    VTW(0, 3),
    VTW(0, 4),
    VTW(0, 5),
    VTW(0, 6),
    VTW(0, 7),
    {TW_NEXT, (2 * VL), 0}
};

static const ct_desc desc = { 8, XSIMD_STRING("t1sv_8"), twinstr, &GENUS, {44, 14, 22, 0}, 0, 0, 0 };

void XSIMD(codelet_t1sv_8) (planner *p) {
    X(kdft_dit_register) (p, t1sv_8, &desc);
}
#else /* HAVE_FMA */

/* Generated by: ../../../genfft/gen_twiddle.native -simd -compact -variables 4 -pipeline-latency 8 -n 8 -name t1sv_8 -include ts.h */

/*
 * This function contains 66 FP additions, 32 FP multiplications,
 * (or, 52 additions, 18 multiplications, 14 fused multiply/add),
 * 28 stack variables, 1 constants, and 32 memory accesses
 */
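/*
 * The same accounting as in the FMA variant holds here: 52 + 14 = 66
 * additions and 18 + 14 = 32 multiplications.
 *
 * For orientation only (a sketch, assuming the usual FFTW SIMD macro
 * semantics VMUL(a, b) = a*b, VFMA(a, b, c) = a*b + c and
 * VFNMS(a, b, c) = c - a*b), the twiddle-multiply pattern that recurs in
 * the body below,
 *
 *     T6  = VFMA(T2, T3, VMUL(T4, T5));   // T2*T3 + T4*T5
 *     T17 = VFNMS(T4, T3, VMUL(T2, T5));  // T2*T5 - T4*T3
 *
 * is the complex product (T3 + i*T5) * (T2 - i*T4): the twiddle pair
 * (T2, T4) loaded from W applied to the sample (T3, T5) loaded from
 * (ri, ii), with the overall sign convention fixed by how the twiddle
 * table W is filled.
 */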
#include "ts.h"

static void t1sv_8(R *ri, R *ii, const R *W, stride rs, INT mb, INT me, INT ms)
{
    DVK(KP707106781, +0.707106781186547524400844362104849039284835938);
    {
        INT m;
        for (m = mb, W = W + (mb * 14); m < me; m = m + (2 * VL), ri = ri + ((2 * VL) * ms), ii = ii + ((2 * VL) * ms), W = W + ((2 * VL) * 14), MAKE_VOLATILE_STRIDE(16, rs)) {
            V T7, T1e, TH, T19, TF, T13, TR, TU, Ti, T1f, TK, T16, Tu, T12, TM;
            V TP;
            {
                V T1, T18, T6, T17;
                T1 = LD(&(ri[0]), ms, &(ri[0]));
                T18 = LD(&(ii[0]), ms, &(ii[0]));
                {
                    V T3, T5, T2, T4;
                    T3 = LD(&(ri[WS(rs, 4)]), ms, &(ri[0]));
                    T5 = LD(&(ii[WS(rs, 4)]), ms, &(ii[0]));
                    T2 = LDW(&(W[TWVL * 6]));
                    T4 = LDW(&(W[TWVL * 7]));
                    T6 = VFMA(T2, T3, VMUL(T4, T5));
                    T17 = VFNMS(T4, T3, VMUL(T2, T5));
                }
                T7 = VADD(T1, T6);
                T1e = VSUB(T18, T17);
                TH = VSUB(T1, T6);
                T19 = VADD(T17, T18);
            }
            {
                V Tz, TS, TE, TT;
                {
                    V Tw, Ty, Tv, Tx;
                    Tw = LD(&(ri[WS(rs, 7)]), ms, &(ri[WS(rs, 1)]));
                    Ty = LD(&(ii[WS(rs, 7)]), ms, &(ii[WS(rs, 1)]));
                    Tv = LDW(&(W[TWVL * 12]));
                    Tx = LDW(&(W[TWVL * 13]));
                    Tz = VFMA(Tv, Tw, VMUL(Tx, Ty));
                    TS = VFNMS(Tx, Tw, VMUL(Tv, Ty));
                }
                {
                    V TB, TD, TA, TC;
                    TB = LD(&(ri[WS(rs, 3)]), ms, &(ri[WS(rs, 1)]));
                    TD = LD(&(ii[WS(rs, 3)]), ms, &(ii[WS(rs, 1)]));
                    TA = LDW(&(W[TWVL * 4]));
                    TC = LDW(&(W[TWVL * 5]));
                    TE = VFMA(TA, TB, VMUL(TC, TD));
                    TT = VFNMS(TC, TB, VMUL(TA, TD));
                }
                TF = VADD(Tz, TE);
                T13 = VADD(TS, TT);
                TR = VSUB(Tz, TE);
                TU = VSUB(TS, TT);
            }
            {
                V Tc, TI, Th, TJ;
                {
                    V T9, Tb, T8, Ta;
                    T9 = LD(&(ri[WS(rs, 2)]), ms, &(ri[0]));
                    Tb = LD(&(ii[WS(rs, 2)]), ms, &(ii[0]));
                    T8 = LDW(&(W[TWVL * 2]));
                    Ta = LDW(&(W[TWVL * 3]));
                    Tc = VFMA(T8, T9, VMUL(Ta, Tb));
                    TI = VFNMS(Ta, T9, VMUL(T8, Tb));
                }
                {
                    V Te, Tg, Td, Tf;
                    Te = LD(&(ri[WS(rs, 6)]), ms, &(ri[0]));
                    Tg = LD(&(ii[WS(rs, 6)]), ms, &(ii[0]));
                    Td = LDW(&(W[TWVL * 10]));
                    Tf = LDW(&(W[TWVL * 11]));
                    Th = VFMA(Td, Te, VMUL(Tf, Tg));
                    TJ = VFNMS(Tf, Te, VMUL(Td, Tg));
                }
                Ti = VADD(Tc, Th);
                T1f = VSUB(Tc, Th);
                TK = VSUB(TI, TJ);
                T16 = VADD(TI, TJ);
            }
            {
                V To, TN, Tt, TO;
                {
                    V Tl, Tn, Tk, Tm;
                    Tl = LD(&(ri[WS(rs, 1)]), ms, &(ri[WS(rs, 1)]));
                    Tn = LD(&(ii[WS(rs, 1)]), ms, &(ii[WS(rs, 1)]));
                    Tk = LDW(&(W[0]));
                    Tm = LDW(&(W[TWVL * 1]));
                    To = VFMA(Tk, Tl, VMUL(Tm, Tn));
                    TN = VFNMS(Tm, Tl, VMUL(Tk, Tn));
                }
                {
                    V Tq, Ts, Tp, Tr;
                    Tq = LD(&(ri[WS(rs, 5)]), ms, &(ri[WS(rs, 1)]));
                    Ts = LD(&(ii[WS(rs, 5)]), ms, &(ii[WS(rs, 1)]));
                    Tp = LDW(&(W[TWVL * 8]));
                    Tr = LDW(&(W[TWVL * 9]));
                    Tt = VFMA(Tp, Tq, VMUL(Tr, Ts));
                    TO = VFNMS(Tr, Tq, VMUL(Tp, Ts));
                }
                Tu = VADD(To, Tt);
                T12 = VADD(TN, TO);
                TM = VSUB(To, Tt);
                TP = VSUB(TN, TO);
            }
            {
                V Tj, TG, T1b, T1c;
                Tj = VADD(T7, Ti);
                TG = VADD(Tu, TF);
                ST(&(ri[WS(rs, 4)]), VSUB(Tj, TG), ms, &(ri[0]));
                ST(&(ri[0]), VADD(Tj, TG), ms, &(ri[0]));
                {
                    V T15, T1a, T11, T14;
                    T15 = VADD(T12, T13);
                    T1a = VADD(T16, T19);
                    ST(&(ii[0]), VADD(T15, T1a), ms, &(ii[0]));
                    ST(&(ii[WS(rs, 4)]), VSUB(T1a, T15), ms, &(ii[0]));
                    T11 = VSUB(T7, Ti);
                    T14 = VSUB(T12, T13);
                    ST(&(ri[WS(rs, 6)]), VSUB(T11, T14), ms, &(ri[0]));
                    ST(&(ri[WS(rs, 2)]), VADD(T11, T14), ms, &(ri[0]));
                }
                T1b = VSUB(TF, Tu);
                T1c = VSUB(T19, T16);
                ST(&(ii[WS(rs, 2)]), VADD(T1b, T1c), ms, &(ii[0]));
                ST(&(ii[WS(rs, 6)]), VSUB(T1c, T1b), ms, &(ii[0]));
                {
                    V TX, T1g, T10, T1d, TY, TZ;
                    TX = VSUB(TH, TK);
                    T1g = VSUB(T1e, T1f);
                    TY = VSUB(TP, TM);
                    TZ = VADD(TR, TU);
                    T10 = VMUL(LDK(KP707106781), VSUB(TY, TZ));
                    T1d = VMUL(LDK(KP707106781), VADD(TY, TZ));
                    ST(&(ri[WS(rs, 7)]), VSUB(TX, T10), ms, &(ri[WS(rs, 1)]));
                    ST(&(ii[WS(rs, 5)]), VSUB(T1g, T1d), ms, &(ii[WS(rs, 1)]));
                    ST(&(ri[WS(rs, 3)]), VADD(TX, T10), ms, &(ri[WS(rs, 1)]));
                    ST(&(ii[WS(rs, 1)]), VADD(T1d, T1g), ms, &(ii[WS(rs, 1)]));
                }
                {
                    V TL, T1i, TW, T1h, TQ, TV;
                    TL = VADD(TH, TK);
                    T1i = VADD(T1f, T1e);
                    TQ = VADD(TM, TP);
                    TV = VSUB(TR, TU);
                    TW = VMUL(LDK(KP707106781), VADD(TQ, TV));
                    T1h = VMUL(LDK(KP707106781), VSUB(TV, TQ));
                    ST(&(ri[WS(rs, 5)]), VSUB(TL, TW), ms, &(ri[WS(rs, 1)]));
                    ST(&(ii[WS(rs, 7)]), VSUB(T1i, T1h), ms, &(ii[WS(rs, 1)]));
                    ST(&(ri[WS(rs, 1)]), VADD(TL, TW), ms, &(ri[WS(rs, 1)]));
                    ST(&(ii[WS(rs, 3)]), VADD(T1h, T1i), ms, &(ii[WS(rs, 1)]));
                }
            }
        }
    }
    VLEAVE();
}

static const tw_instr twinstr[] = {
    VTW(0, 1),
    VTW(0, 2),
    VTW(0, 3),
    VTW(0, 4),
    VTW(0, 5),
    VTW(0, 6),
    VTW(0, 7),
    {TW_NEXT, (2 * VL), 0}
};

static const ct_desc desc = { 8, XSIMD_STRING("t1sv_8"), twinstr, &GENUS, {52, 18, 14, 0}, 0, 0, 0 };

void XSIMD(codelet_t1sv_8) (planner *p) {
    X(kdft_dit_register) (p, t1sv_8, &desc);
}
#endif /* HAVE_FMA */