annotate src/fftw-3.3.3/dft/simd/common/t2sv_8.c @ 83:ae30d91d2ffe

Replace these with versions built using an older toolset (so as to avoid ABI incompatibilities when linking on Ubuntu 14.04 for packaging purposes).
author   Chris Cannam
date     Fri, 07 Feb 2020 11:51:13 +0000
parents  37bf6b4a2645
children
/*
 * Copyright (c) 2003, 2007-11 Matteo Frigo
 * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 *
 */

/* This file was automatically generated --- DO NOT EDIT */
/* Generated on Sun Nov 25 07:39:26 EST 2012 */

#include "codelet-dft.h"

#ifdef HAVE_FMA

/* Generated by: ../../../genfft/gen_twiddle.native -fma -reorder-insns -schedule-for-pipeline -simd -compact -variables 4 -pipeline-latency 8 -twiddle-log3 -precompute-twiddles -n 8 -name t2sv_8 -include ts.h */

/*
 * This function contains 74 FP additions, 50 FP multiplications,
 * (or, 44 additions, 20 multiplications, 30 fused multiply/add),
 * 64 stack variables, 1 constants, and 32 memory accesses
 */
#include "ts.h"

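/*
 * Editor's note (not part of the generated source): in FFTW's codelet
 * convention, ri/ii point to the real and imaginary data arrays, W is the
 * precomputed twiddle-factor table, rs is the stride between samples of one
 * transform, and the codelet processes the batch of transforms with indices
 * mb <= m < me, spaced ms apart.  The single constant KP707106781 is
 * cos(pi/4) = 1/sqrt(2), the constant needed by the internal size-8 butterfly.
 */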
static void t2sv_8(R *ri, R *ii, const R *W, stride rs, INT mb, INT me, INT ms)
{
     DVK(KP707106781, +0.707106781186547524400844362104849039284835938);
     {
          INT m;
          for (m = mb, W = W + (mb * 6); m < me; m = m + (2 * VL), ri = ri + ((2 * VL) * ms), ii = ii + ((2 * VL) * ms), W = W + ((2 * VL) * 6), MAKE_VOLATILE_STRIDE(16, rs)) {
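               /*
                * Added commentary: each pass through this loop advances m by
                * 2*VL, i.e. it handles 2*VL transforms' worth of data packed
                * into SIMD vectors; the data pointers therefore advance by
                * 2*VL*ms and the twiddle pointer by 2*VL*6 reals.  The planner
                * stores three complex twiddles per transform (powers 1, 3 and
                * 7, see twinstr below); the remaining powers are derived from
                * them in the code that follows.
                */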
               V T1m, T1l, T1k, T1u, T1n, T1o;
               {
                    V T2, T3, Tl, Tn, T5, T6;
                    T2 = LDW(&(W[0]));
                    T3 = LDW(&(W[TWVL * 2]));
                    Tl = LDW(&(W[TWVL * 4]));
                    Tn = LDW(&(W[TWVL * 5]));
                    T5 = LDW(&(W[TWVL * 1]));
                    T6 = LDW(&(W[TWVL * 3]));
                    {
                         V T1, T1s, TK, T1r, Td, Tk, TG, TC, TY, Tu, TW, TL, TM, TO, TQ;
                         V Tx, Tz, TD, TH;
                         {
                              V T8, T4, Tm, Tr, Tc, Ta;
                              T1 = LD(&(ri[0]), ms, &(ri[0]));
                              T1s = LD(&(ii[0]), ms, &(ii[0]));
                              T8 = LD(&(ri[WS(rs, 4)]), ms, &(ri[0]));
                              T4 = VMUL(T2, T3);
                              Tm = VMUL(T2, Tl);
                              Tr = VMUL(T2, Tn);
                              Tc = LD(&(ii[WS(rs, 4)]), ms, &(ii[0]));
                              Ta = VMUL(T2, T6);
                              {
                                   V Tp, Tt, Tg, T7, Tf, To, Ts, Ti, Tb, Tj;
                                   Tp = LD(&(ri[WS(rs, 6)]), ms, &(ri[0]));
                                   Tt = LD(&(ii[WS(rs, 6)]), ms, &(ii[0]));
                                   Tg = LD(&(ri[WS(rs, 2)]), ms, &(ri[0]));
                                   T7 = VFNMS(T5, T6, T4);
                                   Tf = VFMA(T5, T6, T4);
                                   To = VFMA(T5, Tn, Tm);
                                   Ts = VFNMS(T5, Tl, Tr);
                                   Ti = VFNMS(T5, T3, Ta);
                                   Tb = VFMA(T5, T3, Ta);
                                   Tj = LD(&(ii[WS(rs, 2)]), ms, &(ii[0]));
                                   TK = LD(&(ri[WS(rs, 7)]), ms, &(ri[WS(rs, 1)]));
                                   {
                                        V T1q, T9, Th, TF;
                                        T1q = VMUL(T7, Tc);
                                        T9 = VMUL(T7, T8);
                                        Th = VMUL(Tf, Tg);
                                        TF = VMUL(Tf, Tn);
                                        {
                                             V TB, TX, Tq, TV;
                                             TB = VMUL(Tf, Tl);
                                             TX = VMUL(To, Tt);
                                             Tq = VMUL(To, Tp);
                                             TV = VMUL(Tf, Tj);
                                             T1r = VFNMS(Tb, T8, T1q);
                                             Td = VFMA(Tb, Tc, T9);
                                             Tk = VFMA(Ti, Tj, Th);
                                             TG = VFNMS(Ti, Tl, TF);
                                             TC = VFMA(Ti, Tn, TB);
                                             TY = VFNMS(Ts, Tp, TX);
                                             Tu = VFMA(Ts, Tt, Tq);
                                             TW = VFNMS(Ti, Tg, TV);
                                             TL = VMUL(Tl, TK);
                                        }
                                   }
                                   TM = LD(&(ii[WS(rs, 7)]), ms, &(ii[WS(rs, 1)]));
                                   TO = LD(&(ri[WS(rs, 3)]), ms, &(ri[WS(rs, 1)]));
                                   TQ = LD(&(ii[WS(rs, 3)]), ms, &(ii[WS(rs, 1)]));
                                   Tx = LD(&(ri[WS(rs, 1)]), ms, &(ri[WS(rs, 1)]));
                                   Tz = LD(&(ii[WS(rs, 1)]), ms, &(ii[WS(rs, 1)]));
                                   TD = LD(&(ri[WS(rs, 5)]), ms, &(ri[WS(rs, 1)]));
                                   TH = LD(&(ii[WS(rs, 5)]), ms, &(ii[WS(rs, 1)]));
                              }
                         }
                         {
                              V Te, T1p, T1g, T10, TS, T18, T1d, T1t, T1x, T1y, Tv, TJ, T11, T16;
                              {
                                   V TN, T1a, TR, T1c, TA, T13, TI, T15;
                                   {
                                        V TU, T19, TP, T1b, Ty, T12, TE, T14, TZ;
                                        TU = VSUB(T1, Td);
                                        Te = VADD(T1, Td);
                                        TN = VFMA(Tn, TM, TL);
                                        T19 = VMUL(Tl, TM);
                                        TP = VMUL(T3, TO);
                                        T1b = VMUL(T3, TQ);
                                        Ty = VMUL(T2, Tx);
                                        T12 = VMUL(T2, Tz);
                                        TE = VMUL(TC, TD);
                                        T14 = VMUL(TC, TH);
                                        T1p = VADD(TW, TY);
                                        TZ = VSUB(TW, TY);
                                        T1a = VFNMS(Tn, TK, T19);
                                        TR = VFMA(T6, TQ, TP);
                                        T1c = VFNMS(T6, TO, T1b);
                                        TA = VFMA(T5, Tz, Ty);
                                        T13 = VFNMS(T5, Tx, T12);
                                        TI = VFMA(TG, TH, TE);
                                        T15 = VFNMS(TG, TD, T14);
                                        T1g = VSUB(TU, TZ);
                                        T10 = VADD(TU, TZ);
                                   }
                                   TS = VADD(TN, TR);
                                   T18 = VSUB(TN, TR);
                                   T1d = VSUB(T1a, T1c);
                                   T1m = VADD(T1a, T1c);
                                   T1t = VADD(T1r, T1s);
                                   T1x = VSUB(T1s, T1r);
                                   T1y = VSUB(Tk, Tu);
                                   Tv = VADD(Tk, Tu);
                                   TJ = VADD(TA, TI);
                                   T11 = VSUB(TA, TI);
                                   T16 = VSUB(T13, T15);
                                   T1l = VADD(T13, T15);
                              }
                              {
                                   V Tw, T1w, T1v, TT;
                                   {
                                        V T1i, T1e, T1B, T1z, T1h, T17;
                                        T1i = VADD(T18, T1d);
                                        T1e = VSUB(T18, T1d);
                                        T1B = VADD(T1y, T1x);
                                        T1z = VSUB(T1x, T1y);
                                        T1h = VSUB(T16, T11);
                                        T17 = VADD(T11, T16);
                                        T1k = VSUB(Te, Tv);
                                        Tw = VADD(Te, Tv);
                                        {
                                             V T1A, T1j, T1C, T1f;
                                             T1A = VADD(T1h, T1i);
                                             T1j = VSUB(T1h, T1i);
                                             T1C = VSUB(T1e, T17);
                                             T1f = VADD(T17, T1e);
                                             T1w = VSUB(T1t, T1p);
                                             T1u = VADD(T1p, T1t);
                                             T1v = VSUB(TS, TJ);
                                             TT = VADD(TJ, TS);
                                             ST(&(ii[WS(rs, 1)]), VFMA(LDK(KP707106781), T1A, T1z), ms, &(ii[WS(rs, 1)]));
                                             ST(&(ii[WS(rs, 5)]), VFNMS(LDK(KP707106781), T1A, T1z), ms, &(ii[WS(rs, 1)]));
                                             ST(&(ri[WS(rs, 3)]), VFMA(LDK(KP707106781), T1j, T1g), ms, &(ri[WS(rs, 1)]));
                                             ST(&(ri[WS(rs, 7)]), VFNMS(LDK(KP707106781), T1j, T1g), ms, &(ri[WS(rs, 1)]));
                                             ST(&(ii[WS(rs, 3)]), VFMA(LDK(KP707106781), T1C, T1B), ms, &(ii[WS(rs, 1)]));
                                             ST(&(ii[WS(rs, 7)]), VFNMS(LDK(KP707106781), T1C, T1B), ms, &(ii[WS(rs, 1)]));
                                             ST(&(ri[WS(rs, 1)]), VFMA(LDK(KP707106781), T1f, T10), ms, &(ri[WS(rs, 1)]));
                                             ST(&(ri[WS(rs, 5)]), VFNMS(LDK(KP707106781), T1f, T10), ms, &(ri[WS(rs, 1)]));
                                        }
                                   }
                                   ST(&(ri[WS(rs, 4)]), VSUB(Tw, TT), ms, &(ri[0]));
                                   ST(&(ri[0]), VADD(Tw, TT), ms, &(ri[0]));
                                   ST(&(ii[WS(rs, 6)]), VSUB(T1w, T1v), ms, &(ii[0]));
                                   ST(&(ii[WS(rs, 2)]), VADD(T1v, T1w), ms, &(ii[0]));
                              }
                         }
                    }
               }
               T1n = VSUB(T1l, T1m);
               T1o = VADD(T1l, T1m);
               ST(&(ii[0]), VADD(T1o, T1u), ms, &(ii[0]));
               ST(&(ii[WS(rs, 4)]), VSUB(T1u, T1o), ms, &(ii[0]));
               ST(&(ri[WS(rs, 2)]), VADD(T1k, T1n), ms, &(ri[0]));
               ST(&(ri[WS(rs, 6)]), VSUB(T1k, T1n), ms, &(ri[0]));
          }
     }
     VLEAVE();
}

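/*
 * Added commentary: twinstr tells the planner which twiddle factors to
 * precompute for each m -- here the powers 1, 3 and 7 (the -twiddle-log3
 * scheme; the codelet derives the other powers itself), with TW_NEXT
 * advancing by the 2*VL transforms handled per loop iteration.  The opcount
 * field of the descriptor, {44, 20, 30, 0}, mirrors the adds/multiplies/fmas
 * quoted in the comment above the function, and the registration hook below
 * installs this codelet with the planner as a radix-8 DIT twiddle step.
 */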
static const tw_instr twinstr[] = {
     VTW(0, 1),
     VTW(0, 3),
     VTW(0, 7),
     {TW_NEXT, (2 * VL), 0}
};

static const ct_desc desc = { 8, XSIMD_STRING("t2sv_8"), twinstr, &GENUS, {44, 20, 30, 0}, 0, 0, 0 };

void XSIMD(codelet_t2sv_8) (planner *p) {
     X(kdft_dit_register) (p, t2sv_8, &desc);
}
#else /* HAVE_FMA */

/* Generated by: ../../../genfft/gen_twiddle.native -simd -compact -variables 4 -pipeline-latency 8 -twiddle-log3 -precompute-twiddles -n 8 -name t2sv_8 -include ts.h */

/*
 * This function contains 74 FP additions, 44 FP multiplications,
 * (or, 56 additions, 26 multiplications, 18 fused multiply/add),
 * 42 stack variables, 1 constants, and 32 memory accesses
 */
#include "ts.h"

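/*
 * Added commentary: this is the non-FMA variant of the same radix-8 DIT
 * twiddle codelet, generated without the -fma flag, so its operation mix
 * favours separate multiplies and adds (56 adds, 26 muls, only 18 fused
 * ops).  The parameters have the meanings described above the FMA variant.
 */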
static void t2sv_8(R *ri, R *ii, const R *W, stride rs, INT mb, INT me, INT ms)
{
     DVK(KP707106781, +0.707106781186547524400844362104849039284835938);
     {
          INT m;
          for (m = mb, W = W + (mb * 6); m < me; m = m + (2 * VL), ri = ri + ((2 * VL) * ms), ii = ii + ((2 * VL) * ms), W = W + ((2 * VL) * 6), MAKE_VOLATILE_STRIDE(16, rs)) {
               V T2, T5, T3, T6, T8, Tc, Tg, Ti, Tl, Tm, Tn, Tz, Tp, Tx;
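               /*
                * Added commentary: the block below rebuilds the twiddles the
                * table does not store.  With w1, w3, w7 loaded from W, it
                * forms (T8,Tc) = w1*w3 = w4, (Tg,Ti) = conj(w1)*w3 = w2,
                * (Tn,Tp) = conj(w1)*w7 = w6 and (Tx,Tz) = conj(w2)*w7 = w5,
                * each as a real/imaginary pair.
                */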
               {
                    V T4, Tb, T7, Ta;
                    T2 = LDW(&(W[0]));
                    T5 = LDW(&(W[TWVL * 1]));
                    T3 = LDW(&(W[TWVL * 2]));
                    T6 = LDW(&(W[TWVL * 3]));
                    T4 = VMUL(T2, T3);
                    Tb = VMUL(T5, T3);
                    T7 = VMUL(T5, T6);
                    Ta = VMUL(T2, T6);
                    T8 = VSUB(T4, T7);
                    Tc = VADD(Ta, Tb);
                    Tg = VADD(T4, T7);
                    Ti = VSUB(Ta, Tb);
                    Tl = LDW(&(W[TWVL * 4]));
                    Tm = LDW(&(W[TWVL * 5]));
                    Tn = VFMA(T2, Tl, VMUL(T5, Tm));
                    Tz = VFNMS(Ti, Tl, VMUL(Tg, Tm));
                    Tp = VFNMS(T5, Tl, VMUL(T2, Tm));
                    Tx = VFMA(Tg, Tl, VMUL(Ti, Tm));
               }
               {
                    V Tf, T1i, TL, T1d, TJ, T17, TV, TY, Ts, T1j, TO, T1a, TC, T16, TQ;
                    V TT;
                    {
                         V T1, T1c, Te, T1b, T9, Td;
                         T1 = LD(&(ri[0]), ms, &(ri[0]));
                         T1c = LD(&(ii[0]), ms, &(ii[0]));
                         T9 = LD(&(ri[WS(rs, 4)]), ms, &(ri[0]));
                         Td = LD(&(ii[WS(rs, 4)]), ms, &(ii[0]));
                         Te = VFMA(T8, T9, VMUL(Tc, Td));
                         T1b = VFNMS(Tc, T9, VMUL(T8, Td));
                         Tf = VADD(T1, Te);
                         T1i = VSUB(T1c, T1b);
                         TL = VSUB(T1, Te);
                         T1d = VADD(T1b, T1c);
                    }
                    {
                         V TF, TW, TI, TX;
                         {
                              V TD, TE, TG, TH;
                              TD = LD(&(ri[WS(rs, 7)]), ms, &(ri[WS(rs, 1)]));
                              TE = LD(&(ii[WS(rs, 7)]), ms, &(ii[WS(rs, 1)]));
                              TF = VFMA(Tl, TD, VMUL(Tm, TE));
                              TW = VFNMS(Tm, TD, VMUL(Tl, TE));
                              TG = LD(&(ri[WS(rs, 3)]), ms, &(ri[WS(rs, 1)]));
                              TH = LD(&(ii[WS(rs, 3)]), ms, &(ii[WS(rs, 1)]));
                              TI = VFMA(T3, TG, VMUL(T6, TH));
                              TX = VFNMS(T6, TG, VMUL(T3, TH));
                         }
                         TJ = VADD(TF, TI);
                         T17 = VADD(TW, TX);
                         TV = VSUB(TF, TI);
                         TY = VSUB(TW, TX);
                    }
                    {
                         V Tk, TM, Tr, TN;
                         {
                              V Th, Tj, To, Tq;
                              Th = LD(&(ri[WS(rs, 2)]), ms, &(ri[0]));
                              Tj = LD(&(ii[WS(rs, 2)]), ms, &(ii[0]));
                              Tk = VFMA(Tg, Th, VMUL(Ti, Tj));
                              TM = VFNMS(Ti, Th, VMUL(Tg, Tj));
                              To = LD(&(ri[WS(rs, 6)]), ms, &(ri[0]));
                              Tq = LD(&(ii[WS(rs, 6)]), ms, &(ii[0]));
                              Tr = VFMA(Tn, To, VMUL(Tp, Tq));
                              TN = VFNMS(Tp, To, VMUL(Tn, Tq));
                         }
                         Ts = VADD(Tk, Tr);
                         T1j = VSUB(Tk, Tr);
                         TO = VSUB(TM, TN);
                         T1a = VADD(TM, TN);
                    }
                    {
                         V Tw, TR, TB, TS;
                         {
                              V Tu, Tv, Ty, TA;
                              Tu = LD(&(ri[WS(rs, 1)]), ms, &(ri[WS(rs, 1)]));
                              Tv = LD(&(ii[WS(rs, 1)]), ms, &(ii[WS(rs, 1)]));
                              Tw = VFMA(T2, Tu, VMUL(T5, Tv));
                              TR = VFNMS(T5, Tu, VMUL(T2, Tv));
                              Ty = LD(&(ri[WS(rs, 5)]), ms, &(ri[WS(rs, 1)]));
                              TA = LD(&(ii[WS(rs, 5)]), ms, &(ii[WS(rs, 1)]));
                              TB = VFMA(Tx, Ty, VMUL(Tz, TA));
                              TS = VFNMS(Tz, Ty, VMUL(Tx, TA));
                         }
                         TC = VADD(Tw, TB);
                         T16 = VADD(TR, TS);
                         TQ = VSUB(Tw, TB);
                         TT = VSUB(TR, TS);
                    }
                    {
                         V Tt, TK, T1f, T1g;
                         Tt = VADD(Tf, Ts);
                         TK = VADD(TC, TJ);
                         ST(&(ri[WS(rs, 4)]), VSUB(Tt, TK), ms, &(ri[0]));
                         ST(&(ri[0]), VADD(Tt, TK), ms, &(ri[0]));
                         {
                              V T19, T1e, T15, T18;
                              T19 = VADD(T16, T17);
                              T1e = VADD(T1a, T1d);
                              ST(&(ii[0]), VADD(T19, T1e), ms, &(ii[0]));
                              ST(&(ii[WS(rs, 4)]), VSUB(T1e, T19), ms, &(ii[0]));
                              T15 = VSUB(Tf, Ts);
                              T18 = VSUB(T16, T17);
                              ST(&(ri[WS(rs, 6)]), VSUB(T15, T18), ms, &(ri[0]));
                              ST(&(ri[WS(rs, 2)]), VADD(T15, T18), ms, &(ri[0]));
                         }
                         T1f = VSUB(TJ, TC);
                         T1g = VSUB(T1d, T1a);
                         ST(&(ii[WS(rs, 2)]), VADD(T1f, T1g), ms, &(ii[0]));
                         ST(&(ii[WS(rs, 6)]), VSUB(T1g, T1f), ms, &(ii[0]));
                         {
                              V T11, T1k, T14, T1h, T12, T13;
                              T11 = VSUB(TL, TO);
                              T1k = VSUB(T1i, T1j);
                              T12 = VSUB(TT, TQ);
                              T13 = VADD(TV, TY);
                              T14 = VMUL(LDK(KP707106781), VSUB(T12, T13));
                              T1h = VMUL(LDK(KP707106781), VADD(T12, T13));
                              ST(&(ri[WS(rs, 7)]), VSUB(T11, T14), ms, &(ri[WS(rs, 1)]));
                              ST(&(ii[WS(rs, 5)]), VSUB(T1k, T1h), ms, &(ii[WS(rs, 1)]));
                              ST(&(ri[WS(rs, 3)]), VADD(T11, T14), ms, &(ri[WS(rs, 1)]));
                              ST(&(ii[WS(rs, 1)]), VADD(T1h, T1k), ms, &(ii[WS(rs, 1)]));
                         }
                         {
                              V TP, T1m, T10, T1l, TU, TZ;
                              TP = VADD(TL, TO);
                              T1m = VADD(T1j, T1i);
                              TU = VADD(TQ, TT);
                              TZ = VSUB(TV, TY);
                              T10 = VMUL(LDK(KP707106781), VADD(TU, TZ));
                              T1l = VMUL(LDK(KP707106781), VSUB(TZ, TU));
                              ST(&(ri[WS(rs, 5)]), VSUB(TP, T10), ms, &(ri[WS(rs, 1)]));
                              ST(&(ii[WS(rs, 7)]), VSUB(T1m, T1l), ms, &(ii[WS(rs, 1)]));
                              ST(&(ri[WS(rs, 1)]), VADD(TP, T10), ms, &(ri[WS(rs, 1)]));
                              ST(&(ii[WS(rs, 3)]), VADD(T1l, T1m), ms, &(ii[WS(rs, 1)]));
                         }
                    }
               }
          }
     }
     VLEAVE();
}

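/*
 * Added commentary: the twiddle-instruction table, descriptor and planner
 * registration below mirror the FMA branch; only the operation counts in the
 * descriptor ({56, 26, 18, 0}) differ, matching the comment above this
 * function.
 */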
static const tw_instr twinstr[] = {
     VTW(0, 1),
     VTW(0, 3),
     VTW(0, 7),
     {TW_NEXT, (2 * VL), 0}
};

static const ct_desc desc = { 8, XSIMD_STRING("t2sv_8"), twinstr, &GENUS, {56, 26, 18, 0}, 0, 0, 0 };

void XSIMD(codelet_t2sv_8) (planner *p) {
     X(kdft_dit_register) (p, t2sv_8, &desc);
}
#endif /* HAVE_FMA */