annotate src/fftw-3.3.3/dft/simd/common/t2bv_16.c @ 83:ae30d91d2ffe

Replace these with versions built using an older toolset (so as to avoid ABI incompatibilities when linking on Ubuntu 14.04 for packaging purposes)
author Chris Cannam
date Fri, 07 Feb 2020 11:51:13 +0000
parents 37bf6b4a2645
children
/*
 * Copyright (c) 2003, 2007-11 Matteo Frigo
 * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 *
 */

/* This file was automatically generated --- DO NOT EDIT */
/* Generated on Sun Nov 25 07:39:09 EST 2012 */

#include "codelet-dft.h"

#ifdef HAVE_FMA

/* Generated by: ../../../genfft/gen_twiddle_c.native -fma -reorder-insns -schedule-for-pipeline -simd -compact -variables 4 -pipeline-latency 8 -n 16 -name t2bv_16 -include t2b.h -sign 1 */

/*
 * This function contains 87 FP additions, 64 FP multiplications,
 * (or, 53 additions, 30 multiplications, 34 fused multiply/add),
 * 61 stack variables, 3 constants, and 32 memory accesses
 */
#include "t2b.h"

static void t2bv_16(R *ri, R *ii, const R *W, stride rs, INT mb, INT me, INT ms)
{
     DVK(KP923879532, +0.923879532511286756128183189396788286822416626);
     DVK(KP414213562, +0.414213562373095048801688724209698078569671875);
     DVK(KP707106781, +0.707106781186547524400844362104849039284835938);
     {
          INT m;
          R *x;
          x = ii;
          for (m = mb, W = W + (mb * ((TWVL / VL) * 30)); m < me; m = m + VL, x = x + (VL * ms), W = W + (TWVL * 30), MAKE_VOLATILE_STRIDE(16, rs)) {
               V TO, Ta, TJ, TP, T14, Tq, T1i, T10, T1b, T1l, T13, T1c, TR, Tl, T15;
               V Tv;
               {
                    V Tc, TW, T4, T19, T9, TD, TI, Tj, TZ, T1a, Te, Th, Tn, Tr, Tu;
                    V Tp;
                    {
                         V T1, T2, T5, T7;
                         T1 = LD(&(x[0]), ms, &(x[0]));
                         T2 = LD(&(x[WS(rs, 8)]), ms, &(x[0]));
                         T5 = LD(&(x[WS(rs, 4)]), ms, &(x[0]));
                         T7 = LD(&(x[WS(rs, 12)]), ms, &(x[0]));
                         {
                              V Tz, TG, TB, TE;
                              Tz = LD(&(x[WS(rs, 2)]), ms, &(x[0]));
                              TG = LD(&(x[WS(rs, 6)]), ms, &(x[0]));
                              TB = LD(&(x[WS(rs, 10)]), ms, &(x[0]));
                              TE = LD(&(x[WS(rs, 14)]), ms, &(x[0]));
                              {
                                   V Ti, TX, TY, Td, Tg, Tm, Tt, To;
                                   {
                                        V T3, T6, T8, TA, TH, TC, TF, Tb;
                                        Tb = LD(&(x[WS(rs, 1)]), ms, &(x[WS(rs, 1)]));
                                        T3 = BYTW(&(W[TWVL * 14]), T2);
                                        T6 = BYTW(&(W[TWVL * 6]), T5);
                                        T8 = BYTW(&(W[TWVL * 22]), T7);
                                        TA = BYTW(&(W[TWVL * 2]), Tz);
                                        TH = BYTW(&(W[TWVL * 10]), TG);
                                        TC = BYTW(&(W[TWVL * 18]), TB);
                                        TF = BYTW(&(W[TWVL * 26]), TE);
                                        Tc = BYTW(&(W[0]), Tb);
                                        TW = VSUB(T1, T3);
                                        T4 = VADD(T1, T3);
                                        T19 = VSUB(T6, T8);
                                        T9 = VADD(T6, T8);
                                        Ti = LD(&(x[WS(rs, 13)]), ms, &(x[WS(rs, 1)]));
                                        TD = VADD(TA, TC);
                                        TX = VSUB(TA, TC);
                                        TI = VADD(TF, TH);
                                        TY = VSUB(TF, TH);
                                   }
                                   Td = LD(&(x[WS(rs, 9)]), ms, &(x[WS(rs, 1)]));
                                   Tg = LD(&(x[WS(rs, 5)]), ms, &(x[WS(rs, 1)]));
                                   Tm = LD(&(x[WS(rs, 15)]), ms, &(x[WS(rs, 1)]));
                                   Tj = BYTW(&(W[TWVL * 24]), Ti);
                                   Tt = LD(&(x[WS(rs, 11)]), ms, &(x[WS(rs, 1)]));
                                   To = LD(&(x[WS(rs, 7)]), ms, &(x[WS(rs, 1)]));
                                   TZ = VADD(TX, TY);
                                   T1a = VSUB(TX, TY);
                                   Te = BYTW(&(W[TWVL * 16]), Td);
                                   Th = BYTW(&(W[TWVL * 8]), Tg);
                                   Tn = BYTW(&(W[TWVL * 28]), Tm);
                                   Tr = LD(&(x[WS(rs, 3)]), ms, &(x[WS(rs, 1)]));
                                   Tu = BYTW(&(W[TWVL * 20]), Tt);
                                   Tp = BYTW(&(W[TWVL * 12]), To);
                              }
                         }
                    }
                    {
                         V Tf, T11, Tk, T12, Ts;
                         TO = VADD(T4, T9);
                         Ta = VSUB(T4, T9);
                         TJ = VSUB(TD, TI);
                         TP = VADD(TD, TI);
                         Tf = VADD(Tc, Te);
                         T11 = VSUB(Tc, Te);
                         Tk = VADD(Th, Tj);
                         T12 = VSUB(Th, Tj);
                         Ts = BYTW(&(W[TWVL * 4]), Tr);
                         T14 = VSUB(Tn, Tp);
                         Tq = VADD(Tn, Tp);
                         T1i = VFNMS(LDK(KP707106781), TZ, TW);
                         T10 = VFMA(LDK(KP707106781), TZ, TW);
                         T1b = VFMA(LDK(KP707106781), T1a, T19);
                         T1l = VFNMS(LDK(KP707106781), T1a, T19);
                         T13 = VFNMS(LDK(KP414213562), T12, T11);
                         T1c = VFMA(LDK(KP414213562), T11, T12);
                         TR = VADD(Tf, Tk);
                         Tl = VSUB(Tf, Tk);
                         T15 = VSUB(Tu, Ts);
                         Tv = VADD(Ts, Tu);
                    }
               }
               {
                    V T1d, T16, TS, Tw, TU, TQ;
                    T1d = VFMA(LDK(KP414213562), T14, T15);
                    T16 = VFNMS(LDK(KP414213562), T15, T14);
                    TS = VADD(Tq, Tv);
                    Tw = VSUB(Tq, Tv);
                    TU = VADD(TO, TP);
                    TQ = VSUB(TO, TP);
                    {
                         V T1e, T1j, T17, T1m;
                         T1e = VSUB(T1c, T1d);
                         T1j = VADD(T1c, T1d);
                         T17 = VADD(T13, T16);
                         T1m = VSUB(T13, T16);
                         {
                              V TV, TT, TK, Tx;
                              TV = VADD(TR, TS);
                              TT = VSUB(TR, TS);
                              TK = VSUB(Tl, Tw);
                              Tx = VADD(Tl, Tw);
                              {
                                   V T1h, T1f, T1o, T1k;
                                   T1h = VFMA(LDK(KP923879532), T1e, T1b);
                                   T1f = VFNMS(LDK(KP923879532), T1e, T1b);
                                   T1o = VFMA(LDK(KP923879532), T1j, T1i);
                                   T1k = VFNMS(LDK(KP923879532), T1j, T1i);
                                   {
                                        V T1g, T18, T1p, T1n;
                                        T1g = VFMA(LDK(KP923879532), T17, T10);
                                        T18 = VFNMS(LDK(KP923879532), T17, T10);
                                        T1p = VFNMS(LDK(KP923879532), T1m, T1l);
                                        T1n = VFMA(LDK(KP923879532), T1m, T1l);
                                        ST(&(x[WS(rs, 8)]), VSUB(TU, TV), ms, &(x[0]));
                                        ST(&(x[0]), VADD(TU, TV), ms, &(x[0]));
                                        ST(&(x[WS(rs, 4)]), VFMAI(TT, TQ), ms, &(x[0]));
                                        ST(&(x[WS(rs, 12)]), VFNMSI(TT, TQ), ms, &(x[0]));
                                        {
                                             V TN, TL, TM, Ty;
                                             TN = VFMA(LDK(KP707106781), TK, TJ);
                                             TL = VFNMS(LDK(KP707106781), TK, TJ);
                                             TM = VFMA(LDK(KP707106781), Tx, Ta);
                                             Ty = VFNMS(LDK(KP707106781), Tx, Ta);
                                             ST(&(x[WS(rs, 15)]), VFNMSI(T1h, T1g), ms, &(x[WS(rs, 1)]));
                                             ST(&(x[WS(rs, 1)]), VFMAI(T1h, T1g), ms, &(x[WS(rs, 1)]));
                                             ST(&(x[WS(rs, 9)]), VFMAI(T1f, T18), ms, &(x[WS(rs, 1)]));
                                             ST(&(x[WS(rs, 7)]), VFNMSI(T1f, T18), ms, &(x[WS(rs, 1)]));
                                             ST(&(x[WS(rs, 3)]), VFNMSI(T1p, T1o), ms, &(x[WS(rs, 1)]));
                                             ST(&(x[WS(rs, 13)]), VFMAI(T1p, T1o), ms, &(x[WS(rs, 1)]));
                                             ST(&(x[WS(rs, 11)]), VFNMSI(T1n, T1k), ms, &(x[WS(rs, 1)]));
                                             ST(&(x[WS(rs, 5)]), VFMAI(T1n, T1k), ms, &(x[WS(rs, 1)]));
                                             ST(&(x[WS(rs, 2)]), VFMAI(TN, TM), ms, &(x[0]));
                                             ST(&(x[WS(rs, 14)]), VFNMSI(TN, TM), ms, &(x[0]));
                                             ST(&(x[WS(rs, 10)]), VFMAI(TL, Ty), ms, &(x[0]));
                                             ST(&(x[WS(rs, 6)]), VFNMSI(TL, Ty), ms, &(x[0]));
                                        }
                                   }
                              }
                         }
                    }
               }
          }
     }
     VLEAVE();
}

static const tw_instr twinstr[] = {
     VTW(0, 1),
     VTW(0, 2),
     VTW(0, 3),
     VTW(0, 4),
     VTW(0, 5),
     VTW(0, 6),
     VTW(0, 7),
     VTW(0, 8),
     VTW(0, 9),
     VTW(0, 10),
     VTW(0, 11),
     VTW(0, 12),
     VTW(0, 13),
     VTW(0, 14),
     VTW(0, 15),
     {TW_NEXT, VL, 0}
};

static const ct_desc desc = { 16, XSIMD_STRING("t2bv_16"), twinstr, &GENUS, {53, 30, 34, 0}, 0, 0, 0 };

void XSIMD(codelet_t2bv_16) (planner *p) {
     X(kdft_dit_register) (p, t2bv_16, &desc);
}
#else /* HAVE_FMA */

/* Generated by: ../../../genfft/gen_twiddle_c.native -simd -compact -variables 4 -pipeline-latency 8 -n 16 -name t2bv_16 -include t2b.h -sign 1 */

/*
 * This function contains 87 FP additions, 42 FP multiplications,
 * (or, 83 additions, 38 multiplications, 4 fused multiply/add),
 * 36 stack variables, 3 constants, and 32 memory accesses
 */
#include "t2b.h"

static void t2bv_16(R *ri, R *ii, const R *W, stride rs, INT mb, INT me, INT ms)
{
     DVK(KP382683432, +0.382683432365089771728459984030398866761344562);
     DVK(KP923879532, +0.923879532511286756128183189396788286822416626);
     DVK(KP707106781, +0.707106781186547524400844362104849039284835938);
     {
          INT m;
          R *x;
          x = ii;
          for (m = mb, W = W + (mb * ((TWVL / VL) * 30)); m < me; m = m + VL, x = x + (VL * ms), W = W + (TWVL * 30), MAKE_VOLATILE_STRIDE(16, rs)) {
               V TJ, T1b, TD, T1c, T17, T18, Ty, TK, T10, T11, T12, Tb, TM, T13, T14;
               V T15, Tm, TN, TG, TI, TH;
               TG = LD(&(x[0]), ms, &(x[0]));
               TH = LD(&(x[WS(rs, 8)]), ms, &(x[0]));
               TI = BYTW(&(W[TWVL * 14]), TH);
               TJ = VSUB(TG, TI);
               T1b = VADD(TG, TI);
               {
                    V TA, TC, Tz, TB;
                    Tz = LD(&(x[WS(rs, 4)]), ms, &(x[0]));
                    TA = BYTW(&(W[TWVL * 6]), Tz);
                    TB = LD(&(x[WS(rs, 12)]), ms, &(x[0]));
                    TC = BYTW(&(W[TWVL * 22]), TB);
                    TD = VSUB(TA, TC);
                    T1c = VADD(TA, TC);
               }
               {
                    V Tp, Tw, Tr, Tu, Ts, Tx;
                    {
                         V To, Tv, Tq, Tt;
                         To = LD(&(x[WS(rs, 2)]), ms, &(x[0]));
                         Tp = BYTW(&(W[TWVL * 2]), To);
                         Tv = LD(&(x[WS(rs, 6)]), ms, &(x[0]));
                         Tw = BYTW(&(W[TWVL * 10]), Tv);
                         Tq = LD(&(x[WS(rs, 10)]), ms, &(x[0]));
                         Tr = BYTW(&(W[TWVL * 18]), Tq);
                         Tt = LD(&(x[WS(rs, 14)]), ms, &(x[0]));
                         Tu = BYTW(&(W[TWVL * 26]), Tt);
                    }
                    T17 = VADD(Tp, Tr);
                    T18 = VADD(Tu, Tw);
                    Ts = VSUB(Tp, Tr);
                    Tx = VSUB(Tu, Tw);
                    Ty = VMUL(LDK(KP707106781), VSUB(Ts, Tx));
                    TK = VMUL(LDK(KP707106781), VADD(Ts, Tx));
               }
               {
                    V T2, T9, T4, T7, T5, Ta;
                    {
                         V T1, T8, T3, T6;
                         T1 = LD(&(x[WS(rs, 1)]), ms, &(x[WS(rs, 1)]));
                         T2 = BYTW(&(W[0]), T1);
                         T8 = LD(&(x[WS(rs, 13)]), ms, &(x[WS(rs, 1)]));
                         T9 = BYTW(&(W[TWVL * 24]), T8);
                         T3 = LD(&(x[WS(rs, 9)]), ms, &(x[WS(rs, 1)]));
                         T4 = BYTW(&(W[TWVL * 16]), T3);
                         T6 = LD(&(x[WS(rs, 5)]), ms, &(x[WS(rs, 1)]));
                         T7 = BYTW(&(W[TWVL * 8]), T6);
                    }
                    T10 = VADD(T2, T4);
                    T11 = VADD(T7, T9);
                    T12 = VSUB(T10, T11);
                    T5 = VSUB(T2, T4);
                    Ta = VSUB(T7, T9);
                    Tb = VFNMS(LDK(KP382683432), Ta, VMUL(LDK(KP923879532), T5));
                    TM = VFMA(LDK(KP382683432), T5, VMUL(LDK(KP923879532), Ta));
               }
               {
                    V Td, Tk, Tf, Ti, Tg, Tl;
                    {
                         V Tc, Tj, Te, Th;
                         Tc = LD(&(x[WS(rs, 15)]), ms, &(x[WS(rs, 1)]));
                         Td = BYTW(&(W[TWVL * 28]), Tc);
                         Tj = LD(&(x[WS(rs, 11)]), ms, &(x[WS(rs, 1)]));
                         Tk = BYTW(&(W[TWVL * 20]), Tj);
                         Te = LD(&(x[WS(rs, 7)]), ms, &(x[WS(rs, 1)]));
                         Tf = BYTW(&(W[TWVL * 12]), Te);
                         Th = LD(&(x[WS(rs, 3)]), ms, &(x[WS(rs, 1)]));
                         Ti = BYTW(&(W[TWVL * 4]), Th);
                    }
                    T13 = VADD(Td, Tf);
                    T14 = VADD(Ti, Tk);
                    T15 = VSUB(T13, T14);
                    Tg = VSUB(Td, Tf);
                    Tl = VSUB(Ti, Tk);
                    Tm = VFMA(LDK(KP923879532), Tg, VMUL(LDK(KP382683432), Tl));
                    TN = VFNMS(LDK(KP382683432), Tg, VMUL(LDK(KP923879532), Tl));
               }
               {
                    V T1a, T1g, T1f, T1h;
                    {
                         V T16, T19, T1d, T1e;
                         T16 = VMUL(LDK(KP707106781), VSUB(T12, T15));
                         T19 = VSUB(T17, T18);
                         T1a = VBYI(VSUB(T16, T19));
                         T1g = VBYI(VADD(T19, T16));
                         T1d = VSUB(T1b, T1c);
                         T1e = VMUL(LDK(KP707106781), VADD(T12, T15));
                         T1f = VSUB(T1d, T1e);
                         T1h = VADD(T1d, T1e);
                    }
                    ST(&(x[WS(rs, 6)]), VADD(T1a, T1f), ms, &(x[0]));
                    ST(&(x[WS(rs, 14)]), VSUB(T1h, T1g), ms, &(x[0]));
                    ST(&(x[WS(rs, 10)]), VSUB(T1f, T1a), ms, &(x[0]));
                    ST(&(x[WS(rs, 2)]), VADD(T1g, T1h), ms, &(x[0]));
               }
               {
                    V T1k, T1o, T1n, T1p;
                    {
                         V T1i, T1j, T1l, T1m;
                         T1i = VADD(T1b, T1c);
                         T1j = VADD(T17, T18);
                         T1k = VSUB(T1i, T1j);
                         T1o = VADD(T1i, T1j);
                         T1l = VADD(T10, T11);
                         T1m = VADD(T13, T14);
                         T1n = VBYI(VSUB(T1l, T1m));
                         T1p = VADD(T1l, T1m);
                    }
                    ST(&(x[WS(rs, 12)]), VSUB(T1k, T1n), ms, &(x[0]));
                    ST(&(x[0]), VADD(T1o, T1p), ms, &(x[0]));
                    ST(&(x[WS(rs, 4)]), VADD(T1k, T1n), ms, &(x[0]));
                    ST(&(x[WS(rs, 8)]), VSUB(T1o, T1p), ms, &(x[0]));
               }
               {
                    V TF, TQ, TP, TR;
                    {
                         V Tn, TE, TL, TO;
                         Tn = VSUB(Tb, Tm);
                         TE = VSUB(Ty, TD);
                         TF = VBYI(VSUB(Tn, TE));
                         TQ = VBYI(VADD(TE, Tn));
                         TL = VSUB(TJ, TK);
                         TO = VSUB(TM, TN);
                         TP = VSUB(TL, TO);
                         TR = VADD(TL, TO);
                    }
                    ST(&(x[WS(rs, 5)]), VADD(TF, TP), ms, &(x[WS(rs, 1)]));
                    ST(&(x[WS(rs, 13)]), VSUB(TR, TQ), ms, &(x[WS(rs, 1)]));
                    ST(&(x[WS(rs, 11)]), VSUB(TP, TF), ms, &(x[WS(rs, 1)]));
                    ST(&(x[WS(rs, 3)]), VADD(TQ, TR), ms, &(x[WS(rs, 1)]));
               }
               {
                    V TU, TY, TX, TZ;
                    {
                         V TS, TT, TV, TW;
                         TS = VADD(TJ, TK);
                         TT = VADD(Tb, Tm);
                         TU = VADD(TS, TT);
                         TY = VSUB(TS, TT);
                         TV = VADD(TD, Ty);
                         TW = VADD(TM, TN);
                         TX = VBYI(VADD(TV, TW));
                         TZ = VBYI(VSUB(TW, TV));
                    }
                    ST(&(x[WS(rs, 15)]), VSUB(TU, TX), ms, &(x[WS(rs, 1)]));
                    ST(&(x[WS(rs, 7)]), VADD(TY, TZ), ms, &(x[WS(rs, 1)]));
                    ST(&(x[WS(rs, 1)]), VADD(TU, TX), ms, &(x[WS(rs, 1)]));
                    ST(&(x[WS(rs, 9)]), VSUB(TY, TZ), ms, &(x[WS(rs, 1)]));
               }
          }
     }
     VLEAVE();
}

static const tw_instr twinstr[] = {
     VTW(0, 1),
     VTW(0, 2),
     VTW(0, 3),
     VTW(0, 4),
     VTW(0, 5),
     VTW(0, 6),
     VTW(0, 7),
     VTW(0, 8),
     VTW(0, 9),
     VTW(0, 10),
     VTW(0, 11),
     VTW(0, 12),
     VTW(0, 13),
     VTW(0, 14),
     VTW(0, 15),
     {TW_NEXT, VL, 0}
};

static const ct_desc desc = { 16, XSIMD_STRING("t2bv_16"), twinstr, &GENUS, {83, 38, 4, 0}, 0, 0, 0 };

void XSIMD(codelet_t2bv_16) (planner *p) {
     X(kdft_dit_register) (p, t2bv_16, &desc);
}
#endif /* HAVE_FMA */