annotate src/fftw-3.3.8/dft/simd/common/t2sv_8.c @ 83:ae30d91d2ffe

Replace these with versions built using an older toolset (so as to avoid ABI incompatibilities when linking on Ubuntu 14.04 for packaging purposes)
author Chris Cannam
date Fri, 07 Feb 2020 11:51:13 +0000
parents d0c2a83c1364
children
/*
 * Copyright (c) 2003, 2007-14 Matteo Frigo
 * Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 *
 */

/* This file was automatically generated --- DO NOT EDIT */
/* Generated on Thu May 24 08:06:11 EDT 2018 */

#include "dft/codelet-dft.h"

#if defined(ARCH_PREFERS_FMA) || defined(ISA_EXTENSION_PREFERS_FMA)

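/* Two implementations follow: this FMA-oriented variant, chosen when the
   target architecture or ISA extension prefers fused multiply/add, and a
   plain SIMD variant after the #else, produced by the same generator
   invocation without the -fma flag. */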
/* Generated by: ../../../genfft/gen_twiddle.native -fma -simd -compact -variables 4 -pipeline-latency 8 -twiddle-log3 -precompute-twiddles -n 8 -name t2sv_8 -include dft/simd/ts.h */

/*
 * This function contains 74 FP additions, 50 FP multiplications,
 * (or, 44 additions, 20 multiplications, 30 fused multiply/add),
 * 48 stack variables, 1 constants, and 32 memory accesses
 */
#include "dft/simd/ts.h"

static void t2sv_8(R *ri, R *ii, const R *W, stride rs, INT mb, INT me, INT ms)
{
     DVK(KP707106781, +0.707106781186547524400844362104849039284835938);
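     /* KP707106781 = 1/sqrt(2) = cos(pi/4), the one nontrivial butterfly
        constant in a size-8 DFT. */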
     {
          INT m;
          for (m = mb, W = W + (mb * 6); m < me; m = m + (2 * VL), ri = ri + ((2 * VL) * ms), ii = ii + ((2 * VL) * ms), W = W + ((2 * VL) * 6), MAKE_VOLATILE_STRIDE(16, rs)) {
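               /* Each pass through the loop computes 2*VL transforms; the
                  twiddle pointer W advances by six values per transform
                  (three complex factors, per twinstr below). */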
               V T2, T3, Tl, Tn, T5, T6, Tf, T7, Ts, Tb, To, Ti, TC, TG;
               {
                    V T4, Tm, Tr, Ta, TB, TF;
                    T2 = LDW(&(W[0]));
                    T3 = LDW(&(W[TWVL * 2]));
                    T4 = VMUL(T2, T3);
                    Tl = LDW(&(W[TWVL * 4]));
                    Tm = VMUL(T2, Tl);
                    Tn = LDW(&(W[TWVL * 5]));
                    Tr = VMUL(T2, Tn);
                    T5 = LDW(&(W[TWVL * 1]));
                    T6 = LDW(&(W[TWVL * 3]));
                    Ta = VMUL(T2, T6);
                    Tf = VFMA(T5, T6, T4);
                    T7 = VFNMS(T5, T6, T4);
                    Ts = VFNMS(T5, Tl, Tr);
                    Tb = VFMA(T5, T3, Ta);
                    To = VFMA(T5, Tn, Tm);
                    TB = VMUL(Tf, Tl);
                    TF = VMUL(Tf, Tn);
                    Ti = VFNMS(T5, T3, Ta);
                    TC = VFMA(Ti, Tn, TB);
                    TG = VFNMS(Ti, Tl, TF);
               }
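               /* The block above realizes -precompute-twiddles with the
                  -twiddle-log3 scheme: only w^1, w^3 and w^7 are loaded from
                  W, and the products needed for the remaining inputs are
                  derived from them once per iteration. */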
               {
                    V T1, T1s, Td, T1r, Tu, TY, Tk, TW, TN, TR, T18, T1a, T1c, T1d, TA;
                    V TI, T11, T13, T15, T16;
                    T1 = LD(&(ri[0]), ms, &(ri[0]));
                    T1s = LD(&(ii[0]), ms, &(ii[0]));
                    {
                         V T8, T9, Tc, T1q;
                         T8 = LD(&(ri[WS(rs, 4)]), ms, &(ri[0]));
                         T9 = VMUL(T7, T8);
                         Tc = LD(&(ii[WS(rs, 4)]), ms, &(ii[0]));
                         T1q = VMUL(T7, Tc);
                         Td = VFMA(Tb, Tc, T9);
                         T1r = VFNMS(Tb, T8, T1q);
                    }
                    {
                         V Tp, Tq, Tt, TX;
                         Tp = LD(&(ri[WS(rs, 6)]), ms, &(ri[0]));
                         Tq = VMUL(To, Tp);
                         Tt = LD(&(ii[WS(rs, 6)]), ms, &(ii[0]));
                         TX = VMUL(To, Tt);
                         Tu = VFMA(Ts, Tt, Tq);
                         TY = VFNMS(Ts, Tp, TX);
                    }
                    {
                         V Tg, Th, Tj, TV;
                         Tg = LD(&(ri[WS(rs, 2)]), ms, &(ri[0]));
                         Th = VMUL(Tf, Tg);
                         Tj = LD(&(ii[WS(rs, 2)]), ms, &(ii[0]));
                         TV = VMUL(Tf, Tj);
                         Tk = VFMA(Ti, Tj, Th);
                         TW = VFNMS(Ti, Tg, TV);
                    }
                    {
                         V TK, TL, TM, T19, TO, TP, TQ, T1b;
                         TK = LD(&(ri[WS(rs, 7)]), ms, &(ri[WS(rs, 1)]));
                         TL = VMUL(Tl, TK);
                         TM = LD(&(ii[WS(rs, 7)]), ms, &(ii[WS(rs, 1)]));
                         T19 = VMUL(Tl, TM);
                         TO = LD(&(ri[WS(rs, 3)]), ms, &(ri[WS(rs, 1)]));
                         TP = VMUL(T3, TO);
                         TQ = LD(&(ii[WS(rs, 3)]), ms, &(ii[WS(rs, 1)]));
                         T1b = VMUL(T3, TQ);
                         TN = VFMA(Tn, TM, TL);
                         TR = VFMA(T6, TQ, TP);
                         T18 = VSUB(TN, TR);
                         T1a = VFNMS(Tn, TK, T19);
                         T1c = VFNMS(T6, TO, T1b);
                         T1d = VSUB(T1a, T1c);
                    }
                    {
                         V Tx, Ty, Tz, T12, TD, TE, TH, T14;
                         Tx = LD(&(ri[WS(rs, 1)]), ms, &(ri[WS(rs, 1)]));
                         Ty = VMUL(T2, Tx);
                         Tz = LD(&(ii[WS(rs, 1)]), ms, &(ii[WS(rs, 1)]));
                         T12 = VMUL(T2, Tz);
                         TD = LD(&(ri[WS(rs, 5)]), ms, &(ri[WS(rs, 1)]));
                         TE = VMUL(TC, TD);
                         TH = LD(&(ii[WS(rs, 5)]), ms, &(ii[WS(rs, 1)]));
                         T14 = VMUL(TC, TH);
                         TA = VFMA(T5, Tz, Ty);
                         TI = VFMA(TG, TH, TE);
                         T11 = VSUB(TA, TI);
                         T13 = VFNMS(T5, Tx, T12);
                         T15 = VFNMS(TG, TD, T14);
                         T16 = VSUB(T13, T15);
                    }
                    {
                         V T10, T1g, T1z, T1B, T1f, T1C, T1j, T1A;
                         {
                              V TU, TZ, T1x, T1y;
                              TU = VSUB(T1, Td);
                              TZ = VSUB(TW, TY);
                              T10 = VADD(TU, TZ);
                              T1g = VSUB(TU, TZ);
                              T1x = VSUB(T1s, T1r);
                              T1y = VSUB(Tk, Tu);
                              T1z = VSUB(T1x, T1y);
                              T1B = VADD(T1y, T1x);
                         }
                         {
                              V T17, T1e, T1h, T1i;
                              T17 = VADD(T11, T16);
                              T1e = VSUB(T18, T1d);
                              T1f = VADD(T17, T1e);
                              T1C = VSUB(T1e, T17);
                              T1h = VSUB(T16, T11);
                              T1i = VADD(T18, T1d);
                              T1j = VSUB(T1h, T1i);
                              T1A = VADD(T1h, T1i);
                         }
                         ST(&(ri[WS(rs, 5)]), VFNMS(LDK(KP707106781), T1f, T10), ms, &(ri[WS(rs, 1)]));
                         ST(&(ii[WS(rs, 5)]), VFNMS(LDK(KP707106781), T1A, T1z), ms, &(ii[WS(rs, 1)]));
                         ST(&(ri[WS(rs, 1)]), VFMA(LDK(KP707106781), T1f, T10), ms, &(ri[WS(rs, 1)]));
                         ST(&(ii[WS(rs, 1)]), VFMA(LDK(KP707106781), T1A, T1z), ms, &(ii[WS(rs, 1)]));
                         ST(&(ri[WS(rs, 7)]), VFNMS(LDK(KP707106781), T1j, T1g), ms, &(ri[WS(rs, 1)]));
                         ST(&(ii[WS(rs, 7)]), VFNMS(LDK(KP707106781), T1C, T1B), ms, &(ii[WS(rs, 1)]));
                         ST(&(ri[WS(rs, 3)]), VFMA(LDK(KP707106781), T1j, T1g), ms, &(ri[WS(rs, 1)]));
                         ST(&(ii[WS(rs, 3)]), VFMA(LDK(KP707106781), T1C, T1B), ms, &(ii[WS(rs, 1)]));
                    }
                    {
                         V Tw, T1k, T1u, T1w, TT, T1v, T1n, T1o;
                         {
                              V Te, Tv, T1p, T1t;
                              Te = VADD(T1, Td);
                              Tv = VADD(Tk, Tu);
                              Tw = VADD(Te, Tv);
                              T1k = VSUB(Te, Tv);
                              T1p = VADD(TW, TY);
                              T1t = VADD(T1r, T1s);
                              T1u = VADD(T1p, T1t);
                              T1w = VSUB(T1t, T1p);
                         }
                         {
                              V TJ, TS, T1l, T1m;
                              TJ = VADD(TA, TI);
                              TS = VADD(TN, TR);
                              TT = VADD(TJ, TS);
                              T1v = VSUB(TS, TJ);
                              T1l = VADD(T13, T15);
                              T1m = VADD(T1a, T1c);
                              T1n = VSUB(T1l, T1m);
                              T1o = VADD(T1l, T1m);
                         }
                         ST(&(ri[WS(rs, 4)]), VSUB(Tw, TT), ms, &(ri[0]));
                         ST(&(ii[WS(rs, 4)]), VSUB(T1u, T1o), ms, &(ii[0]));
                         ST(&(ri[0]), VADD(Tw, TT), ms, &(ri[0]));
                         ST(&(ii[0]), VADD(T1o, T1u), ms, &(ii[0]));
                         ST(&(ri[WS(rs, 6)]), VSUB(T1k, T1n), ms, &(ri[0]));
                         ST(&(ii[WS(rs, 6)]), VSUB(T1w, T1v), ms, &(ii[0]));
                         ST(&(ri[WS(rs, 2)]), VADD(T1k, T1n), ms, &(ri[0]));
                         ST(&(ii[WS(rs, 2)]), VADD(T1v, T1w), ms, &(ii[0]));
                    }
               }
          }
     }
     VLEAVE();
}

static const tw_instr twinstr[] = {
     VTW(0, 1),
     VTW(0, 3),
     VTW(0, 7),
     {TW_NEXT, (2 * VL), 0}
};

static const ct_desc desc = { 8, XSIMD_STRING("t2sv_8"), twinstr, &GENUS, {44, 20, 30, 0}, 0, 0, 0 };
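/* {44, 20, 30, 0} is the codelet's operation-count tuple (adds, muls, fused
   multiply/adds, other), matching the figures in the header comment; the
   planner consults these counts when estimating plan cost. */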

void XSIMD(codelet_t2sv_8) (planner *p) {
     X(kdft_dit_register) (p, t2sv_8, &desc);
}
#else

/* Generated by: ../../../genfft/gen_twiddle.native -simd -compact -variables 4 -pipeline-latency 8 -twiddle-log3 -precompute-twiddles -n 8 -name t2sv_8 -include dft/simd/ts.h */

/*
 * This function contains 74 FP additions, 44 FP multiplications,
 * (or, 56 additions, 26 multiplications, 18 fused multiply/add),
 * 42 stack variables, 1 constants, and 32 memory accesses
 */
#include "dft/simd/ts.h"

static void t2sv_8(R *ri, R *ii, const R *W, stride rs, INT mb, INT me, INT ms)
{
     DVK(KP707106781, +0.707106781186547524400844362104849039284835938);
     {
          INT m;
          for (m = mb, W = W + (mb * 6); m < me; m = m + (2 * VL), ri = ri + ((2 * VL) * ms), ii = ii + ((2 * VL) * ms), W = W + ((2 * VL) * 6), MAKE_VOLATILE_STRIDE(16, rs)) {
               V T2, T5, T3, T6, T8, Tc, Tg, Ti, Tl, Tm, Tn, Tz, Tp, Tx;
               {
                    V T4, Tb, T7, Ta;
                    T2 = LDW(&(W[0]));
                    T5 = LDW(&(W[TWVL * 1]));
                    T3 = LDW(&(W[TWVL * 2]));
                    T6 = LDW(&(W[TWVL * 3]));
                    T4 = VMUL(T2, T3);
                    Tb = VMUL(T5, T3);
                    T7 = VMUL(T5, T6);
                    Ta = VMUL(T2, T6);
                    T8 = VSUB(T4, T7);
                    Tc = VADD(Ta, Tb);
                    Tg = VADD(T4, T7);
                    Ti = VSUB(Ta, Tb);
                    Tl = LDW(&(W[TWVL * 4]));
                    Tm = LDW(&(W[TWVL * 5]));
                    Tn = VFMA(T2, Tl, VMUL(T5, Tm));
                    Tz = VFNMS(Ti, Tl, VMUL(Tg, Tm));
                    Tp = VFNMS(T5, Tl, VMUL(T2, Tm));
                    Tx = VFMA(Tg, Tl, VMUL(Ti, Tm));
               }
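               /* Same -twiddle-log3 precomputation as the FMA variant; here
                  the VFMA/VFNMS forms carry an explicit VMUL argument and, on
                  targets without preferred FMA support, decompose into
                  separate multiply and add/subtract operations. */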
               {
                    V Tf, T1i, TL, T1d, TJ, T17, TV, TY, Ts, T1j, TO, T1a, TC, T16, TQ;
                    V TT;
                    {
                         V T1, T1c, Te, T1b, T9, Td;
                         T1 = LD(&(ri[0]), ms, &(ri[0]));
                         T1c = LD(&(ii[0]), ms, &(ii[0]));
                         T9 = LD(&(ri[WS(rs, 4)]), ms, &(ri[0]));
                         Td = LD(&(ii[WS(rs, 4)]), ms, &(ii[0]));
                         Te = VFMA(T8, T9, VMUL(Tc, Td));
                         T1b = VFNMS(Tc, T9, VMUL(T8, Td));
                         Tf = VADD(T1, Te);
                         T1i = VSUB(T1c, T1b);
                         TL = VSUB(T1, Te);
                         T1d = VADD(T1b, T1c);
                    }
                    {
                         V TF, TW, TI, TX;
                         {
                              V TD, TE, TG, TH;
                              TD = LD(&(ri[WS(rs, 7)]), ms, &(ri[WS(rs, 1)]));
                              TE = LD(&(ii[WS(rs, 7)]), ms, &(ii[WS(rs, 1)]));
                              TF = VFMA(Tl, TD, VMUL(Tm, TE));
                              TW = VFNMS(Tm, TD, VMUL(Tl, TE));
                              TG = LD(&(ri[WS(rs, 3)]), ms, &(ri[WS(rs, 1)]));
                              TH = LD(&(ii[WS(rs, 3)]), ms, &(ii[WS(rs, 1)]));
                              TI = VFMA(T3, TG, VMUL(T6, TH));
                              TX = VFNMS(T6, TG, VMUL(T3, TH));
                         }
                         TJ = VADD(TF, TI);
                         T17 = VADD(TW, TX);
                         TV = VSUB(TF, TI);
                         TY = VSUB(TW, TX);
                    }
                    {
                         V Tk, TM, Tr, TN;
                         {
                              V Th, Tj, To, Tq;
                              Th = LD(&(ri[WS(rs, 2)]), ms, &(ri[0]));
                              Tj = LD(&(ii[WS(rs, 2)]), ms, &(ii[0]));
                              Tk = VFMA(Tg, Th, VMUL(Ti, Tj));
                              TM = VFNMS(Ti, Th, VMUL(Tg, Tj));
                              To = LD(&(ri[WS(rs, 6)]), ms, &(ri[0]));
                              Tq = LD(&(ii[WS(rs, 6)]), ms, &(ii[0]));
                              Tr = VFMA(Tn, To, VMUL(Tp, Tq));
                              TN = VFNMS(Tp, To, VMUL(Tn, Tq));
                         }
                         Ts = VADD(Tk, Tr);
                         T1j = VSUB(Tk, Tr);
                         TO = VSUB(TM, TN);
                         T1a = VADD(TM, TN);
                    }
                    {
                         V Tw, TR, TB, TS;
                         {
                              V Tu, Tv, Ty, TA;
                              Tu = LD(&(ri[WS(rs, 1)]), ms, &(ri[WS(rs, 1)]));
                              Tv = LD(&(ii[WS(rs, 1)]), ms, &(ii[WS(rs, 1)]));
                              Tw = VFMA(T2, Tu, VMUL(T5, Tv));
                              TR = VFNMS(T5, Tu, VMUL(T2, Tv));
                              Ty = LD(&(ri[WS(rs, 5)]), ms, &(ri[WS(rs, 1)]));
                              TA = LD(&(ii[WS(rs, 5)]), ms, &(ii[WS(rs, 1)]));
                              TB = VFMA(Tx, Ty, VMUL(Tz, TA));
                              TS = VFNMS(Tz, Ty, VMUL(Tx, TA));
                         }
                         TC = VADD(Tw, TB);
                         T16 = VADD(TR, TS);
                         TQ = VSUB(Tw, TB);
                         TT = VSUB(TR, TS);
                    }
                    {
                         V Tt, TK, T1f, T1g;
                         Tt = VADD(Tf, Ts);
                         TK = VADD(TC, TJ);
                         ST(&(ri[WS(rs, 4)]), VSUB(Tt, TK), ms, &(ri[0]));
                         ST(&(ri[0]), VADD(Tt, TK), ms, &(ri[0]));
                         {
                              V T19, T1e, T15, T18;
                              T19 = VADD(T16, T17);
                              T1e = VADD(T1a, T1d);
                              ST(&(ii[0]), VADD(T19, T1e), ms, &(ii[0]));
                              ST(&(ii[WS(rs, 4)]), VSUB(T1e, T19), ms, &(ii[0]));
                              T15 = VSUB(Tf, Ts);
                              T18 = VSUB(T16, T17);
                              ST(&(ri[WS(rs, 6)]), VSUB(T15, T18), ms, &(ri[0]));
                              ST(&(ri[WS(rs, 2)]), VADD(T15, T18), ms, &(ri[0]));
                         }
                         T1f = VSUB(TJ, TC);
                         T1g = VSUB(T1d, T1a);
                         ST(&(ii[WS(rs, 2)]), VADD(T1f, T1g), ms, &(ii[0]));
                         ST(&(ii[WS(rs, 6)]), VSUB(T1g, T1f), ms, &(ii[0]));
                         {
                              V T11, T1k, T14, T1h, T12, T13;
                              T11 = VSUB(TL, TO);
                              T1k = VSUB(T1i, T1j);
                              T12 = VSUB(TT, TQ);
                              T13 = VADD(TV, TY);
                              T14 = VMUL(LDK(KP707106781), VSUB(T12, T13));
                              T1h = VMUL(LDK(KP707106781), VADD(T12, T13));
                              ST(&(ri[WS(rs, 7)]), VSUB(T11, T14), ms, &(ri[WS(rs, 1)]));
                              ST(&(ii[WS(rs, 5)]), VSUB(T1k, T1h), ms, &(ii[WS(rs, 1)]));
                              ST(&(ri[WS(rs, 3)]), VADD(T11, T14), ms, &(ri[WS(rs, 1)]));
                              ST(&(ii[WS(rs, 1)]), VADD(T1h, T1k), ms, &(ii[WS(rs, 1)]));
                         }
                         {
                              V TP, T1m, T10, T1l, TU, TZ;
                              TP = VADD(TL, TO);
                              T1m = VADD(T1j, T1i);
                              TU = VADD(TQ, TT);
                              TZ = VSUB(TV, TY);
                              T10 = VMUL(LDK(KP707106781), VADD(TU, TZ));
                              T1l = VMUL(LDK(KP707106781), VSUB(TZ, TU));
                              ST(&(ri[WS(rs, 5)]), VSUB(TP, T10), ms, &(ri[WS(rs, 1)]));
                              ST(&(ii[WS(rs, 7)]), VSUB(T1m, T1l), ms, &(ii[WS(rs, 1)]));
                              ST(&(ri[WS(rs, 1)]), VADD(TP, T10), ms, &(ri[WS(rs, 1)]));
                              ST(&(ii[WS(rs, 3)]), VADD(T1l, T1m), ms, &(ii[WS(rs, 1)]));
                         }
                    }
               }
          }
     }
     VLEAVE();
}

static const tw_instr twinstr[] = {
     VTW(0, 1),
     VTW(0, 3),
     VTW(0, 7),
     {TW_NEXT, (2 * VL), 0}
};

static const ct_desc desc = { 8, XSIMD_STRING("t2sv_8"), twinstr, &GENUS, {56, 26, 18, 0}, 0, 0, 0 };

void XSIMD(codelet_t2sv_8) (planner *p) {
     X(kdft_dit_register) (p, t2sv_8, &desc);
}
#endif
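
/* Illustrative only, not part of the generated file: a hedged scalar sketch
   of what one transform handled by t2sv_8 computes -- multiply inputs 1..7
   by their twiddle factors, then apply a size-8 DFT in place. For clarity
   the sketch takes all seven complex twiddles explicitly, whereas the
   codelet stores only w^1, w^3 and w^7 and derives the rest; the forward
   (negative-exponent) sign convention is assumed. */
#if 0
#include <complex.h>
#include <math.h>

static void t2_8_reference(double complex x[8], const double complex w[7])
{
     double complex in[8], out[8];
     in[0] = x[0];
     for (int k = 1; k < 8; ++k)
          in[k] = w[k - 1] * x[k];            /* twiddle each nonzero-index input */
     for (int j = 0; j < 8; ++j) {            /* naive size-8 DFT */
          double complex s = 0;
          for (int k = 0; k < 8; ++k)
               s += in[k] * cexp(-2.0 * M_PI * I * (double)(j * k) / 8.0);
          out[j] = s;
     }
     for (int j = 0; j < 8; ++j)
          x[j] = out[j];
}
#endif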