annotate src/fftw-3.3.8/dft/simd/common/t1sv_8.c @ 83:ae30d91d2ffe

Replace these with versions built using an older toolset (so as to avoid ABI incompatibilities when linking on Ubuntu 14.04 for packaging purposes)
author Chris Cannam
date Fri, 07 Feb 2020 11:51:13 +0000
parents d0c2a83c1364
children
/*
 * Copyright (c) 2003, 2007-14 Matteo Frigo
 * Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 *
 */

/* This file was automatically generated --- DO NOT EDIT */
/* Generated on Thu May 24 08:06:09 EDT 2018 */

#include "dft/codelet-dft.h"

#if defined(ARCH_PREFERS_FMA) || defined(ISA_EXTENSION_PREFERS_FMA)

/* Generated by: ../../../genfft/gen_twiddle.native -fma -simd -compact -variables 4 -pipeline-latency 8 -n 8 -name t1sv_8 -include dft/simd/ts.h */

/*
 * This function contains 66 FP additions, 36 FP multiplications,
 * (or, 44 additions, 14 multiplications, 22 fused multiply/add),
 * 34 stack variables, 1 constants, and 32 memory accesses
 */
#include "dft/simd/ts.h"

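/*
 * Note on the calling convention (a reading of FFTW's usual twiddle-codelet
 * interface, not something this generated file spells out itself): ri and ii
 * point to the real and imaginary parts of the data in split format, W is the
 * precomputed twiddle-factor table, rs is the stride between the eight samples
 * of one transform, and mb, me and ms give the range and stride of the batch
 * of transforms to process.  Each pass of the loop below handles 2 * VL
 * transforms at once in SIMD vectors.  The single constant KP707106781 is
 * 1/sqrt(2) = cos(pi/4), the only nontrivial coefficient a size-8 DFT needs.
 */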
static void t1sv_8(R *ri, R *ii, const R *W, stride rs, INT mb, INT me, INT ms)
{
     DVK(KP707106781, +0.707106781186547524400844362104849039284835938);
     {
          INT m;
          for (m = mb, W = W + (mb * 14); m < me; m = m + (2 * VL), ri = ri + ((2 * VL) * ms), ii = ii + ((2 * VL) * ms), W = W + ((2 * VL) * 14), MAKE_VOLATILE_STRIDE(16, rs)) {
               V T1, T1m, T7, T1l, Tk, TS, Te, TQ, TF, T14, TL, T16, T12, T17, Ts;
               V TX, Ty, TZ, TV, T10;
               T1 = LD(&(ri[0]), ms, &(ri[0]));
               T1m = LD(&(ii[0]), ms, &(ii[0]));
               {
                    V T3, T6, T4, T1k, T2, T5;
                    T3 = LD(&(ri[WS(rs, 4)]), ms, &(ri[0]));
                    T6 = LD(&(ii[WS(rs, 4)]), ms, &(ii[0]));
                    T2 = LDW(&(W[TWVL * 6]));
                    T4 = VMUL(T2, T3);
                    T1k = VMUL(T2, T6);
                    T5 = LDW(&(W[TWVL * 7]));
                    T7 = VFMA(T5, T6, T4);
                    T1l = VFNMS(T5, T3, T1k);
               }
               {
                    V Tg, Tj, Th, TR, Tf, Ti;
                    Tg = LD(&(ri[WS(rs, 6)]), ms, &(ri[0]));
                    Tj = LD(&(ii[WS(rs, 6)]), ms, &(ii[0]));
                    Tf = LDW(&(W[TWVL * 10]));
                    Th = VMUL(Tf, Tg);
                    TR = VMUL(Tf, Tj);
                    Ti = LDW(&(W[TWVL * 11]));
                    Tk = VFMA(Ti, Tj, Th);
                    TS = VFNMS(Ti, Tg, TR);
               }
               {
                    V Ta, Td, Tb, TP, T9, Tc;
                    Ta = LD(&(ri[WS(rs, 2)]), ms, &(ri[0]));
                    Td = LD(&(ii[WS(rs, 2)]), ms, &(ii[0]));
                    T9 = LDW(&(W[TWVL * 2]));
                    Tb = VMUL(T9, Ta);
                    TP = VMUL(T9, Td);
                    Tc = LDW(&(W[TWVL * 3]));
                    Te = VFMA(Tc, Td, Tb);
                    TQ = VFNMS(Tc, Ta, TP);
               }
               {
                    V TB, TE, TC, T13, TH, TK, TI, T15, TA, TG, TD, TJ;
                    TB = LD(&(ri[WS(rs, 7)]), ms, &(ri[WS(rs, 1)]));
                    TE = LD(&(ii[WS(rs, 7)]), ms, &(ii[WS(rs, 1)]));
                    TA = LDW(&(W[TWVL * 12]));
                    TC = VMUL(TA, TB);
                    T13 = VMUL(TA, TE);
                    TH = LD(&(ri[WS(rs, 3)]), ms, &(ri[WS(rs, 1)]));
                    TK = LD(&(ii[WS(rs, 3)]), ms, &(ii[WS(rs, 1)]));
                    TG = LDW(&(W[TWVL * 4]));
                    TI = VMUL(TG, TH);
                    T15 = VMUL(TG, TK);
                    TD = LDW(&(W[TWVL * 13]));
                    TF = VFMA(TD, TE, TC);
                    T14 = VFNMS(TD, TB, T13);
                    TJ = LDW(&(W[TWVL * 5]));
                    TL = VFMA(TJ, TK, TI);
                    T16 = VFNMS(TJ, TH, T15);
                    T12 = VSUB(TF, TL);
                    T17 = VSUB(T14, T16);
               }
               {
                    V To, Tr, Tp, TW, Tu, Tx, Tv, TY, Tn, Tt, Tq, Tw;
                    To = LD(&(ri[WS(rs, 1)]), ms, &(ri[WS(rs, 1)]));
                    Tr = LD(&(ii[WS(rs, 1)]), ms, &(ii[WS(rs, 1)]));
                    Tn = LDW(&(W[0]));
                    Tp = VMUL(Tn, To);
                    TW = VMUL(Tn, Tr);
                    Tu = LD(&(ri[WS(rs, 5)]), ms, &(ri[WS(rs, 1)]));
                    Tx = LD(&(ii[WS(rs, 5)]), ms, &(ii[WS(rs, 1)]));
                    Tt = LDW(&(W[TWVL * 8]));
                    Tv = VMUL(Tt, Tu);
                    TY = VMUL(Tt, Tx);
                    Tq = LDW(&(W[TWVL * 1]));
                    Ts = VFMA(Tq, Tr, Tp);
                    TX = VFNMS(Tq, To, TW);
                    Tw = LDW(&(W[TWVL * 9]));
                    Ty = VFMA(Tw, Tx, Tv);
                    TZ = VFNMS(Tw, Tu, TY);
                    TV = VSUB(Ts, Ty);
                    T10 = VSUB(TX, TZ);
               }
               {
                    V TU, T1a, T1t, T1v, T19, T1w, T1d, T1u;
                    {
                         V TO, TT, T1r, T1s;
                         TO = VSUB(T1, T7);
                         TT = VSUB(TQ, TS);
                         TU = VADD(TO, TT);
                         T1a = VSUB(TO, TT);
                         T1r = VSUB(T1m, T1l);
                         T1s = VSUB(Te, Tk);
                         T1t = VSUB(T1r, T1s);
                         T1v = VADD(T1s, T1r);
                    }
                    {
                         V T11, T18, T1b, T1c;
                         T11 = VADD(TV, T10);
                         T18 = VSUB(T12, T17);
                         T19 = VADD(T11, T18);
                         T1w = VSUB(T18, T11);
                         T1b = VSUB(T10, TV);
                         T1c = VADD(T12, T17);
                         T1d = VSUB(T1b, T1c);
                         T1u = VADD(T1b, T1c);
                    }
                    ST(&(ri[WS(rs, 5)]), VFNMS(LDK(KP707106781), T19, TU), ms, &(ri[WS(rs, 1)]));
                    ST(&(ii[WS(rs, 5)]), VFNMS(LDK(KP707106781), T1u, T1t), ms, &(ii[WS(rs, 1)]));
                    ST(&(ri[WS(rs, 1)]), VFMA(LDK(KP707106781), T19, TU), ms, &(ri[WS(rs, 1)]));
                    ST(&(ii[WS(rs, 1)]), VFMA(LDK(KP707106781), T1u, T1t), ms, &(ii[WS(rs, 1)]));
                    ST(&(ri[WS(rs, 7)]), VFNMS(LDK(KP707106781), T1d, T1a), ms, &(ri[WS(rs, 1)]));
                    ST(&(ii[WS(rs, 7)]), VFNMS(LDK(KP707106781), T1w, T1v), ms, &(ii[WS(rs, 1)]));
                    ST(&(ri[WS(rs, 3)]), VFMA(LDK(KP707106781), T1d, T1a), ms, &(ri[WS(rs, 1)]));
                    ST(&(ii[WS(rs, 3)]), VFMA(LDK(KP707106781), T1w, T1v), ms, &(ii[WS(rs, 1)]));
               }
               {
                    V Tm, T1e, T1o, T1q, TN, T1p, T1h, T1i;
                    {
                         V T8, Tl, T1j, T1n;
                         T8 = VADD(T1, T7);
                         Tl = VADD(Te, Tk);
                         Tm = VADD(T8, Tl);
                         T1e = VSUB(T8, Tl);
                         T1j = VADD(TQ, TS);
                         T1n = VADD(T1l, T1m);
                         T1o = VADD(T1j, T1n);
                         T1q = VSUB(T1n, T1j);
                    }
                    {
                         V Tz, TM, T1f, T1g;
                         Tz = VADD(Ts, Ty);
                         TM = VADD(TF, TL);
                         TN = VADD(Tz, TM);
                         T1p = VSUB(TM, Tz);
                         T1f = VADD(TX, TZ);
                         T1g = VADD(T14, T16);
                         T1h = VSUB(T1f, T1g);
                         T1i = VADD(T1f, T1g);
                    }
                    ST(&(ri[WS(rs, 4)]), VSUB(Tm, TN), ms, &(ri[0]));
                    ST(&(ii[WS(rs, 4)]), VSUB(T1o, T1i), ms, &(ii[0]));
                    ST(&(ri[0]), VADD(Tm, TN), ms, &(ri[0]));
                    ST(&(ii[0]), VADD(T1i, T1o), ms, &(ii[0]));
                    ST(&(ri[WS(rs, 6)]), VSUB(T1e, T1h), ms, &(ri[0]));
                    ST(&(ii[WS(rs, 6)]), VSUB(T1q, T1p), ms, &(ii[0]));
                    ST(&(ri[WS(rs, 2)]), VADD(T1e, T1h), ms, &(ri[0]));
                    ST(&(ii[WS(rs, 2)]), VADD(T1p, T1q), ms, &(ii[0]));
               }
          }
     }
     VLEAVE();
}

static const tw_instr twinstr[] = {
     VTW(0, 1),
     VTW(0, 2),
     VTW(0, 3),
     VTW(0, 4),
     VTW(0, 5),
     VTW(0, 6),
     VTW(0, 7),
     {TW_NEXT, (2 * VL), 0}
};

static const ct_desc desc = { 8, XSIMD_STRING("t1sv_8"), twinstr, &GENUS, {44, 14, 22, 0}, 0, 0, 0 };

void XSIMD(codelet_t1sv_8) (planner *p) {
     X(kdft_dit_register) (p, t1sv_8, &desc);
}
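/*
 * The {44, 14, 22, 0} field of the descriptor mirrors the operation counts
 * quoted in the comment above (additions, multiplications, fused
 * multiply/adds); X(kdft_dit_register) then hands this codelet and its
 * twiddle descriptor to the planner as a decimation-in-time twiddle step.
 */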
#else

/* Generated by: ../../../genfft/gen_twiddle.native -simd -compact -variables 4 -pipeline-latency 8 -n 8 -name t1sv_8 -include dft/simd/ts.h */

/*
 * This function contains 66 FP additions, 32 FP multiplications,
 * (or, 52 additions, 18 multiplications, 14 fused multiply/add),
 * 28 stack variables, 1 constants, and 32 memory accesses
 */
#include "dft/simd/ts.h"

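/*
 * Non-FMA variant of the same codelet, generated without the -fma flag
 * (compare the command line above with the one in the first branch).  Here
 * each twiddle product is written as a VFMA/VFNMS around an explicit VMUL,
 * which is presumably why the add/multiply/fma breakdown quoted above
 * differs from the FMA branch.
 */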
static void t1sv_8(R *ri, R *ii, const R *W, stride rs, INT mb, INT me, INT ms)
{
     DVK(KP707106781, +0.707106781186547524400844362104849039284835938);
     {
          INT m;
          for (m = mb, W = W + (mb * 14); m < me; m = m + (2 * VL), ri = ri + ((2 * VL) * ms), ii = ii + ((2 * VL) * ms), W = W + ((2 * VL) * 14), MAKE_VOLATILE_STRIDE(16, rs)) {
               V T7, T1e, TH, T19, TF, T13, TR, TU, Ti, T1f, TK, T16, Tu, T12, TM;
               V TP;
               {
                    V T1, T18, T6, T17;
                    T1 = LD(&(ri[0]), ms, &(ri[0]));
                    T18 = LD(&(ii[0]), ms, &(ii[0]));
                    {
                         V T3, T5, T2, T4;
                         T3 = LD(&(ri[WS(rs, 4)]), ms, &(ri[0]));
                         T5 = LD(&(ii[WS(rs, 4)]), ms, &(ii[0]));
                         T2 = LDW(&(W[TWVL * 6]));
                         T4 = LDW(&(W[TWVL * 7]));
                         T6 = VFMA(T2, T3, VMUL(T4, T5));
                         T17 = VFNMS(T4, T3, VMUL(T2, T5));
                    }
                    T7 = VADD(T1, T6);
                    T1e = VSUB(T18, T17);
                    TH = VSUB(T1, T6);
                    T19 = VADD(T17, T18);
               }
               {
                    V Tz, TS, TE, TT;
                    {
                         V Tw, Ty, Tv, Tx;
                         Tw = LD(&(ri[WS(rs, 7)]), ms, &(ri[WS(rs, 1)]));
                         Ty = LD(&(ii[WS(rs, 7)]), ms, &(ii[WS(rs, 1)]));
                         Tv = LDW(&(W[TWVL * 12]));
                         Tx = LDW(&(W[TWVL * 13]));
                         Tz = VFMA(Tv, Tw, VMUL(Tx, Ty));
                         TS = VFNMS(Tx, Tw, VMUL(Tv, Ty));
                    }
                    {
                         V TB, TD, TA, TC;
                         TB = LD(&(ri[WS(rs, 3)]), ms, &(ri[WS(rs, 1)]));
                         TD = LD(&(ii[WS(rs, 3)]), ms, &(ii[WS(rs, 1)]));
                         TA = LDW(&(W[TWVL * 4]));
                         TC = LDW(&(W[TWVL * 5]));
                         TE = VFMA(TA, TB, VMUL(TC, TD));
                         TT = VFNMS(TC, TB, VMUL(TA, TD));
                    }
                    TF = VADD(Tz, TE);
                    T13 = VADD(TS, TT);
                    TR = VSUB(Tz, TE);
                    TU = VSUB(TS, TT);
               }
               {
                    V Tc, TI, Th, TJ;
                    {
                         V T9, Tb, T8, Ta;
                         T9 = LD(&(ri[WS(rs, 2)]), ms, &(ri[0]));
                         Tb = LD(&(ii[WS(rs, 2)]), ms, &(ii[0]));
                         T8 = LDW(&(W[TWVL * 2]));
                         Ta = LDW(&(W[TWVL * 3]));
                         Tc = VFMA(T8, T9, VMUL(Ta, Tb));
                         TI = VFNMS(Ta, T9, VMUL(T8, Tb));
                    }
                    {
                         V Te, Tg, Td, Tf;
                         Te = LD(&(ri[WS(rs, 6)]), ms, &(ri[0]));
                         Tg = LD(&(ii[WS(rs, 6)]), ms, &(ii[0]));
                         Td = LDW(&(W[TWVL * 10]));
                         Tf = LDW(&(W[TWVL * 11]));
                         Th = VFMA(Td, Te, VMUL(Tf, Tg));
                         TJ = VFNMS(Tf, Te, VMUL(Td, Tg));
                    }
                    Ti = VADD(Tc, Th);
                    T1f = VSUB(Tc, Th);
                    TK = VSUB(TI, TJ);
                    T16 = VADD(TI, TJ);
               }
               {
                    V To, TN, Tt, TO;
                    {
                         V Tl, Tn, Tk, Tm;
                         Tl = LD(&(ri[WS(rs, 1)]), ms, &(ri[WS(rs, 1)]));
                         Tn = LD(&(ii[WS(rs, 1)]), ms, &(ii[WS(rs, 1)]));
                         Tk = LDW(&(W[0]));
                         Tm = LDW(&(W[TWVL * 1]));
                         To = VFMA(Tk, Tl, VMUL(Tm, Tn));
                         TN = VFNMS(Tm, Tl, VMUL(Tk, Tn));
                    }
                    {
                         V Tq, Ts, Tp, Tr;
                         Tq = LD(&(ri[WS(rs, 5)]), ms, &(ri[WS(rs, 1)]));
                         Ts = LD(&(ii[WS(rs, 5)]), ms, &(ii[WS(rs, 1)]));
                         Tp = LDW(&(W[TWVL * 8]));
                         Tr = LDW(&(W[TWVL * 9]));
                         Tt = VFMA(Tp, Tq, VMUL(Tr, Ts));
                         TO = VFNMS(Tr, Tq, VMUL(Tp, Ts));
                    }
                    Tu = VADD(To, Tt);
                    T12 = VADD(TN, TO);
                    TM = VSUB(To, Tt);
                    TP = VSUB(TN, TO);
               }
               {
                    V Tj, TG, T1b, T1c;
                    Tj = VADD(T7, Ti);
                    TG = VADD(Tu, TF);
                    ST(&(ri[WS(rs, 4)]), VSUB(Tj, TG), ms, &(ri[0]));
                    ST(&(ri[0]), VADD(Tj, TG), ms, &(ri[0]));
                    {
                         V T15, T1a, T11, T14;
                         T15 = VADD(T12, T13);
                         T1a = VADD(T16, T19);
                         ST(&(ii[0]), VADD(T15, T1a), ms, &(ii[0]));
                         ST(&(ii[WS(rs, 4)]), VSUB(T1a, T15), ms, &(ii[0]));
                         T11 = VSUB(T7, Ti);
                         T14 = VSUB(T12, T13);
                         ST(&(ri[WS(rs, 6)]), VSUB(T11, T14), ms, &(ri[0]));
                         ST(&(ri[WS(rs, 2)]), VADD(T11, T14), ms, &(ri[0]));
                    }
                    T1b = VSUB(TF, Tu);
                    T1c = VSUB(T19, T16);
                    ST(&(ii[WS(rs, 2)]), VADD(T1b, T1c), ms, &(ii[0]));
                    ST(&(ii[WS(rs, 6)]), VSUB(T1c, T1b), ms, &(ii[0]));
                    {
                         V TX, T1g, T10, T1d, TY, TZ;
                         TX = VSUB(TH, TK);
                         T1g = VSUB(T1e, T1f);
                         TY = VSUB(TP, TM);
                         TZ = VADD(TR, TU);
                         T10 = VMUL(LDK(KP707106781), VSUB(TY, TZ));
                         T1d = VMUL(LDK(KP707106781), VADD(TY, TZ));
                         ST(&(ri[WS(rs, 7)]), VSUB(TX, T10), ms, &(ri[WS(rs, 1)]));
                         ST(&(ii[WS(rs, 5)]), VSUB(T1g, T1d), ms, &(ii[WS(rs, 1)]));
                         ST(&(ri[WS(rs, 3)]), VADD(TX, T10), ms, &(ri[WS(rs, 1)]));
                         ST(&(ii[WS(rs, 1)]), VADD(T1d, T1g), ms, &(ii[WS(rs, 1)]));
                    }
                    {
                         V TL, T1i, TW, T1h, TQ, TV;
                         TL = VADD(TH, TK);
                         T1i = VADD(T1f, T1e);
                         TQ = VADD(TM, TP);
                         TV = VSUB(TR, TU);
                         TW = VMUL(LDK(KP707106781), VADD(TQ, TV));
                         T1h = VMUL(LDK(KP707106781), VSUB(TV, TQ));
                         ST(&(ri[WS(rs, 5)]), VSUB(TL, TW), ms, &(ri[WS(rs, 1)]));
                         ST(&(ii[WS(rs, 7)]), VSUB(T1i, T1h), ms, &(ii[WS(rs, 1)]));
                         ST(&(ri[WS(rs, 1)]), VADD(TL, TW), ms, &(ri[WS(rs, 1)]));
                         ST(&(ii[WS(rs, 3)]), VADD(T1h, T1i), ms, &(ii[WS(rs, 1)]));
                    }
               }
          }
     }
     VLEAVE();
}

static const tw_instr twinstr[] = {
     VTW(0, 1),
     VTW(0, 2),
     VTW(0, 3),
     VTW(0, 4),
     VTW(0, 5),
     VTW(0, 6),
     VTW(0, 7),
     {TW_NEXT, (2 * VL), 0}
};

static const ct_desc desc = { 8, XSIMD_STRING("t1sv_8"), twinstr, &GENUS, {52, 18, 14, 0}, 0, 0, 0 };

void XSIMD(codelet_t1sv_8) (planner *p) {
     X(kdft_dit_register) (p, t1sv_8, &desc);
}
#endif