annotate src/fftw-3.3.5/dft/simd/common/t3bv_16.c @ 83:ae30d91d2ffe

Replace these with versions built using an older toolset (so as to avoid ABI incompatibilities when linking on Ubuntu 14.04 for packaging purposes)
author Chris Cannam
date Fri, 07 Feb 2020 11:51:13 +0000
parents 2cd0e3b3e1fd
children
rev   line source
Chris@42 1 /*
Chris@42 2 * Copyright (c) 2003, 2007-14 Matteo Frigo
Chris@42 3 * Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
Chris@42 4 *
Chris@42 5 * This program is free software; you can redistribute it and/or modify
Chris@42 6 * it under the terms of the GNU General Public License as published by
Chris@42 7 * the Free Software Foundation; either version 2 of the License, or
Chris@42 8 * (at your option) any later version.
Chris@42 9 *
Chris@42 10 * This program is distributed in the hope that it will be useful,
Chris@42 11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
Chris@42 12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
Chris@42 13 * GNU General Public License for more details.
Chris@42 14 *
Chris@42 15 * You should have received a copy of the GNU General Public License
Chris@42 16 * along with this program; if not, write to the Free Software
Chris@42 17 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
Chris@42 18 *
Chris@42 19 */
Chris@42 20
Chris@42 21 /* This file was automatically generated --- DO NOT EDIT */
Chris@42 22 /* Generated on Sat Jul 30 16:44:46 EDT 2016 */
Chris@42 23
Chris@42 24 #include "codelet-dft.h"
Chris@42 25
Chris@42 26 #ifdef HAVE_FMA
Chris@42 27
Chris@42 28 /* Generated by: ../../../genfft/gen_twiddle_c.native -fma -reorder-insns -schedule-for-pipeline -simd -compact -variables 4 -pipeline-latency 8 -twiddle-log3 -precompute-twiddles -no-generate-bytw -n 16 -name t3bv_16 -include t3b.h -sign 1 */
Chris@42 29
Chris@42 30 /*
Chris@42 31 * This function contains 98 FP additions, 86 FP multiplications,
Chris@42 32 * (or, 64 additions, 52 multiplications, 34 fused multiply/add),
Chris@42 33 * 70 stack variables, 3 constants, and 32 memory accesses
Chris@42 34 */
Chris@42 35 #include "t3b.h"
Chris@42 36
/*
 * t3bv_16 (HAVE_FMA variant): SIMD twiddle codelet of size 16, emitted by
 * genfft with -fma, -sign 1 and -twiddle-log3 (see the "Generated by" line
 * earlier in this file).
 *
 * ri     : not referenced anywhere in this body.
 * ii     : base pointer of the data; every LD and ST goes through x = ii.
 * W      : twiddle table; offset by mb * ((TWVL / VL) * 8) before the first
 *          iteration and advanced by TWVL * 8 after each one.
 * rs     : stride handed to WS(rs, k) to address the 16 points k = 0..15.
 * mb, me : loop bounds; m starts at mb and runs while m < me in steps of VL.
 * ms     : x advances by VL * ms per iteration; also passed to LD and ST.
 *
 * Each iteration loads four twiddle vectors with LDW, derives the remaining
 * factors from them with VZMUL / VZMULJ, multiplies every input except x[0]
 * by one of those factors, combines the products with VADD / VSUB / VFMA /
 * VFNMS / VFMAI / VFNMSI, and stores the 16 results back into the same
 * x[WS(rs, k)] slots they were loaded from (the transform is in place).
 * NOTE(review): -sign 1 presumably selects the backward transform; confirm
 * against the genfft documentation.
 */
Chris@42 37 static void t3bv_16(R *ri, R *ii, const R *W, stride rs, INT mb, INT me, INT ms)
Chris@42 38 {
Chris@42 39 DVK(KP923879532, +0.923879532511286756128183189396788286822416626); /* numerically cos(pi/8) */
Chris@42 40 DVK(KP414213562, +0.414213562373095048801688724209698078569671875); /* numerically tan(pi/8) = sqrt(2) - 1 */
Chris@42 41 DVK(KP707106781, +0.707106781186547524400844362104849039284835938); /* numerically sqrt(1/2) */
Chris@42 42 {
Chris@42 43 INT m;
Chris@42 44 R *x;
Chris@42 45 x = ii; /* only ii is used as the data base pointer; ri stays untouched */
Chris@42 46 for (m = mb, W = W + (mb * ((TWVL / VL) * 8)); m < me; m = m + VL, x = x + (VL * ms), W = W + (TWVL * 8), MAKE_VOLATILE_STRIDE(16, rs)) { /* m advances by VL per pass */
Chris@42 47 V T13, Tg, TY, T14, T1A, T1q, T1f, T1x, T1r, T1i, Tt, T16, TB, T1j, T1k;
Chris@42 48 V TH;
Chris@42 49 {
Chris@42 50 V T2, T8, Tu, T3;
Chris@42 51 T2 = LDW(&(W[0])); /* applied directly to x[WS(rs, 1)] below */
Chris@42 52 T8 = LDW(&(W[TWVL * 2])); /* applied directly to x[WS(rs, 3)] below */
Chris@42 53 Tu = LDW(&(W[TWVL * 6])); /* applied directly to x[WS(rs, 15)] below */
Chris@42 54 T3 = LDW(&(W[TWVL * 4])); /* applied directly to x[WS(rs, 9)] below */
Chris@42 55 {
Chris@42 56 V Ty, T1o, Tf, T1b, T7, Tr, TQ, TX, T1g, Tl, To, Tw, TG, Tz, T1p;
Chris@42 57 V T1e, TC;
Chris@42 58 {
Chris@42 59 V T1, T5, Ta, Td;
Chris@42 60 T1 = LD(&(x[0]), ms, &(x[0])); /* the only input used without a twiddle multiply */
Chris@42 61 T5 = LD(&(x[WS(rs, 8)]), ms, &(x[0]));
Chris@42 62 Ta = LD(&(x[WS(rs, 4)]), ms, &(x[0]));
Chris@42 63 Td = LD(&(x[WS(rs, 12)]), ms, &(x[0]));
Chris@42 64 {
Chris@42 65 V TR, TN, TM, TE, Tb, Tp, Tm, Te, T6, TW, TO, TS;
Chris@42 66 {
Chris@42 67 V TL, Tx, T9, TU, Tc, T4, TV;
Chris@42 68 TL = LD(&(x[WS(rs, 2)]), ms, &(x[0]));
Chris@42 69 Tx = VZMULJ(T2, T8); /* derived twiddle factors: products of the four loaded vectors */
Chris@42 70 T9 = VZMUL(T2, T8);
Chris@42 71 TR = VZMULJ(T2, Tu);
Chris@42 72 TU = VZMULJ(T8, T3);
Chris@42 73 Tc = VZMUL(T8, T3);
Chris@42 74 T4 = VZMULJ(T2, T3);
Chris@42 75 TN = VZMUL(T2, T3);
Chris@42 76 TV = LD(&(x[WS(rs, 6)]), ms, &(x[0]));
Chris@42 77 TM = VZMUL(Tx, TL);
Chris@42 78 Ty = VZMULJ(Tx, T3);
Chris@42 79 TE = VZMUL(Tx, T3);
Chris@42 80 Tb = VZMUL(T9, Ta);
Chris@42 81 Tp = VZMUL(T9, T3);
Chris@42 82 Tm = VZMULJ(T9, T3);
Chris@42 83 Te = VZMUL(Tc, Td);
Chris@42 84 T6 = VZMUL(T4, T5);
Chris@42 85 TW = VZMUL(TU, TV);
Chris@42 86 }
Chris@42 87 TO = LD(&(x[WS(rs, 10)]), ms, &(x[0]));
Chris@42 88 TS = LD(&(x[WS(rs, 14)]), ms, &(x[0]));
Chris@42 89 {
Chris@42 90 V TP, TT, Ti, Tk, Tn, Th, Tq, Tj;
Chris@42 91 Th = LD(&(x[WS(rs, 1)]), ms, &(x[WS(rs, 1)]));
Chris@42 92 Tq = LD(&(x[WS(rs, 13)]), ms, &(x[WS(rs, 1)]));
Chris@42 93 Tj = LD(&(x[WS(rs, 9)]), ms, &(x[WS(rs, 1)]));
Chris@42 94 T1o = VSUB(Tb, Te);
Chris@42 95 Tf = VADD(Tb, Te);
Chris@42 96 T1b = VSUB(T1, T6);
Chris@42 97 T7 = VADD(T1, T6);
Chris@42 98 TP = VZMUL(TN, TO);
Chris@42 99 TT = VZMUL(TR, TS);
Chris@42 100 Ti = VZMUL(T2, Th);
Chris@42 101 Tr = VZMUL(Tp, Tq);
Chris@42 102 Tk = VZMUL(T3, Tj);
Chris@42 103 Tn = LD(&(x[WS(rs, 5)]), ms, &(x[WS(rs, 1)]));
Chris@42 104 {
Chris@42 105 V T1c, T1d, Tv, TF;
Chris@42 106 Tv = LD(&(x[WS(rs, 15)]), ms, &(x[WS(rs, 1)]));
Chris@42 107 TF = LD(&(x[WS(rs, 11)]), ms, &(x[WS(rs, 1)]));
Chris@42 108 T1c = VSUB(TM, TP);
Chris@42 109 TQ = VADD(TM, TP);
Chris@42 110 T1d = VSUB(TT, TW);
Chris@42 111 TX = VADD(TT, TW);
Chris@42 112 T1g = VSUB(Ti, Tk);
Chris@42 113 Tl = VADD(Ti, Tk);
Chris@42 114 To = VZMUL(Tm, Tn);
Chris@42 115 Tw = VZMUL(Tu, Tv);
Chris@42 116 TG = VZMUL(TE, TF);
Chris@42 117 Tz = LD(&(x[WS(rs, 7)]), ms, &(x[WS(rs, 1)]));
Chris@42 118 T1p = VSUB(T1c, T1d);
Chris@42 119 T1e = VADD(T1c, T1d);
Chris@42 120 TC = LD(&(x[WS(rs, 3)]), ms, &(x[WS(rs, 1)]));
Chris@42 121 }
Chris@42 122 }
Chris@42 123 }
Chris@42 124 }
Chris@42 125 {
Chris@42 126 V T1h, Ts, TA, TD;
Chris@42 127 T13 = VADD(T7, Tf);
Chris@42 128 Tg = VSUB(T7, Tf);
Chris@42 129 T1h = VSUB(To, Tr);
Chris@42 130 Ts = VADD(To, Tr);
Chris@42 131 TY = VSUB(TQ, TX);
Chris@42 132 T14 = VADD(TQ, TX);
Chris@42 133 TA = VZMUL(Ty, Tz);
Chris@42 134 T1A = VFNMS(LDK(KP707106781), T1p, T1o);
Chris@42 135 T1q = VFMA(LDK(KP707106781), T1p, T1o);
Chris@42 136 T1f = VFMA(LDK(KP707106781), T1e, T1b);
Chris@42 137 T1x = VFNMS(LDK(KP707106781), T1e, T1b);
Chris@42 138 TD = VZMUL(T8, TC);
Chris@42 139 T1r = VFMA(LDK(KP414213562), T1g, T1h);
Chris@42 140 T1i = VFNMS(LDK(KP414213562), T1h, T1g);
Chris@42 141 Tt = VSUB(Tl, Ts);
Chris@42 142 T16 = VADD(Tl, Ts);
Chris@42 143 TB = VADD(Tw, TA);
Chris@42 144 T1j = VSUB(Tw, TA);
Chris@42 145 T1k = VSUB(TG, TD);
Chris@42 146 TH = VADD(TD, TG);
Chris@42 147 }
Chris@42 148 }
Chris@42 149 }
Chris@42 150 {
Chris@42 151 V T15, T19, T1l, T1s, TI, T17;
Chris@42 152 T15 = VSUB(T13, T14);
Chris@42 153 T19 = VADD(T13, T14);
Chris@42 154 T1l = VFNMS(LDK(KP414213562), T1k, T1j);
Chris@42 155 T1s = VFMA(LDK(KP414213562), T1j, T1k);
Chris@42 156 TI = VSUB(TB, TH);
Chris@42 157 T17 = VADD(TB, TH);
Chris@42 158 {
Chris@42 159 V T1y, T1t, T1B, T1m;
Chris@42 160 T1y = VADD(T1r, T1s);
Chris@42 161 T1t = VSUB(T1r, T1s);
Chris@42 162 T1B = VSUB(T1i, T1l);
Chris@42 163 T1m = VADD(T1i, T1l);
Chris@42 164 {
Chris@42 165 V T18, T1a, TJ, TZ;
Chris@42 166 T18 = VSUB(T16, T17);
Chris@42 167 T1a = VADD(T16, T17);
Chris@42 168 TJ = VADD(Tt, TI);
Chris@42 169 TZ = VSUB(Tt, TI);
Chris@42 170 {
Chris@42 171 V T1u, T1w, T1z, T1D;
Chris@42 172 T1u = VFNMS(LDK(KP923879532), T1t, T1q);
Chris@42 173 T1w = VFMA(LDK(KP923879532), T1t, T1q);
Chris@42 174 T1z = VFNMS(LDK(KP923879532), T1y, T1x);
Chris@42 175 T1D = VFMA(LDK(KP923879532), T1y, T1x);
Chris@42 176 {
Chris@42 177 V T1n, T1v, T1C, T1E;
Chris@42 178 T1n = VFNMS(LDK(KP923879532), T1m, T1f);
Chris@42 179 T1v = VFMA(LDK(KP923879532), T1m, T1f);
Chris@42 180 T1C = VFMA(LDK(KP923879532), T1B, T1A);
Chris@42 181 T1E = VFNMS(LDK(KP923879532), T1B, T1A);
Chris@42 182 ST(&(x[WS(rs, 8)]), VSUB(T19, T1a), ms, &(x[0])); /* from here on the results overwrite the input slots */
Chris@42 183 ST(&(x[0]), VADD(T19, T1a), ms, &(x[0]));
Chris@42 184 ST(&(x[WS(rs, 4)]), VFMAI(T18, T15), ms, &(x[0]));
Chris@42 185 ST(&(x[WS(rs, 12)]), VFNMSI(T18, T15), ms, &(x[0]));
Chris@42 186 {
Chris@42 187 V T10, T12, TK, T11;
Chris@42 188 T10 = VFNMS(LDK(KP707106781), TZ, TY);
Chris@42 189 T12 = VFMA(LDK(KP707106781), TZ, TY);
Chris@42 190 TK = VFNMS(LDK(KP707106781), TJ, Tg);
Chris@42 191 T11 = VFMA(LDK(KP707106781), TJ, Tg);
Chris@42 192 ST(&(x[WS(rs, 15)]), VFNMSI(T1w, T1v), ms, &(x[WS(rs, 1)]));
Chris@42 193 ST(&(x[WS(rs, 1)]), VFMAI(T1w, T1v), ms, &(x[WS(rs, 1)]));
Chris@42 194 ST(&(x[WS(rs, 9)]), VFMAI(T1u, T1n), ms, &(x[WS(rs, 1)]));
Chris@42 195 ST(&(x[WS(rs, 7)]), VFNMSI(T1u, T1n), ms, &(x[WS(rs, 1)]));
Chris@42 196 ST(&(x[WS(rs, 3)]), VFNMSI(T1E, T1D), ms, &(x[WS(rs, 1)]));
Chris@42 197 ST(&(x[WS(rs, 13)]), VFMAI(T1E, T1D), ms, &(x[WS(rs, 1)]));
Chris@42 198 ST(&(x[WS(rs, 11)]), VFNMSI(T1C, T1z), ms, &(x[WS(rs, 1)]));
Chris@42 199 ST(&(x[WS(rs, 5)]), VFMAI(T1C, T1z), ms, &(x[WS(rs, 1)]));
Chris@42 200 ST(&(x[WS(rs, 2)]), VFMAI(T12, T11), ms, &(x[0]));
Chris@42 201 ST(&(x[WS(rs, 14)]), VFNMSI(T12, T11), ms, &(x[0]));
Chris@42 202 ST(&(x[WS(rs, 10)]), VFMAI(T10, TK), ms, &(x[0]));
Chris@42 203 ST(&(x[WS(rs, 6)]), VFNMSI(T10, TK), ms, &(x[0]));
Chris@42 204 }
Chris@42 205 }
Chris@42 206 }
Chris@42 207 }
Chris@42 208 }
Chris@42 209 }
Chris@42 210 }
Chris@42 211 }
Chris@42 212 VLEAVE();
Chris@42 213 }
Chris@42 214
/*
 * Twiddle instruction list referenced from desc. The second arguments of
 * the four VTW entries (1, 3, 9, 15) are the indices of the inputs that
 * t3bv_16 multiplies directly by its four LDW-loaded twiddle vectors; the
 * list is closed by a {TW_NEXT, VL, 0} entry.
 * NOTE(review): VTW / TW_NEXT semantics are defined outside this file.
 */
Chris@42 215 static const tw_instr twinstr[] = {
Chris@42 216 VTW(0, 1),
Chris@42 217 VTW(0, 3),
Chris@42 218 VTW(0, 9),
Chris@42 219 VTW(0, 15),
Chris@42 220 {TW_NEXT, VL, 0}
Chris@42 221 };
Chris@42 222
/*
 * Codelet descriptor: 16, the name string "t3bv_16", the twiddle list and
 * &GENUS, then {64, 52, 34, 0}, which matches the addition / multiplication /
 * fused multiply-add counts quoted in the comment preceding this variant.
 * NOTE(review): ct_desc field meanings are declared elsewhere; confirm.
 */
Chris@42 223 static const ct_desc desc = { 16, XSIMD_STRING("t3bv_16"), twinstr, &GENUS, {64, 52, 34, 0}, 0, 0, 0 };
Chris@42 224
/*
 * Registration entry point: hands the t3bv_16 kernel and its descriptor to
 * the planner p by calling X(kdft_dit_register).
 */
Chris@42 225 void XSIMD(codelet_t3bv_16) (planner *p) {
Chris@42 226 X(kdft_dit_register) (p, t3bv_16, &desc);
Chris@42 227 }
Chris@42 228 #else /* HAVE_FMA */
Chris@42 229
Chris@42 230 /* Generated by: ../../../genfft/gen_twiddle_c.native -simd -compact -variables 4 -pipeline-latency 8 -twiddle-log3 -precompute-twiddles -no-generate-bytw -n 16 -name t3bv_16 -include t3b.h -sign 1 */
Chris@42 231
Chris@42 232 /*
Chris@42 233 * This function contains 98 FP additions, 64 FP multiplications,
Chris@42 234 * (or, 94 additions, 60 multiplications, 4 fused multiply/add),
Chris@42 235 * 51 stack variables, 3 constants, and 32 memory accesses
Chris@42 236 */
Chris@42 237 #include "t3b.h"
Chris@42 238
/*
 * t3bv_16 (variant compiled when HAVE_FMA is not defined): SIMD twiddle
 * codelet of size 16, emitted by genfft with -sign 1 and -twiddle-log3 (see
 * the "Generated by" line preceding this variant).
 *
 * ri     : not referenced anywhere in this body.
 * ii     : base pointer of the data; every LD and ST goes through x = ii.
 * W      : twiddle table; offset by mb * ((TWVL / VL) * 8) before the first
 *          iteration and advanced by TWVL * 8 after each one.
 * rs     : stride handed to WS(rs, k) to address the 16 points k = 0..15.
 * mb, me : loop bounds; m starts at mb and runs while m < me in steps of VL.
 * ms     : x advances by VL * ms per iteration; also passed to LD and ST.
 *
 * Each iteration loads four twiddle vectors with LDW, derives the remaining
 * factors from them with VZMUL / VZMULJ, multiplies every input except x[0]
 * by one of those factors, combines the products with VADD / VSUB / VMUL /
 * VFMA / VFNMS / VBYI, and stores the 16 results back into the same
 * x[WS(rs, k)] slots they were loaded from (the transform is in place).
 * NOTE(review): -sign 1 presumably selects the backward transform; confirm
 * against the genfft documentation.
 */
Chris@42 239 static void t3bv_16(R *ri, R *ii, const R *W, stride rs, INT mb, INT me, INT ms)
Chris@42 240 {
Chris@42 241 DVK(KP382683432, +0.382683432365089771728459984030398866761344562); /* numerically sin(pi/8) */
Chris@42 242 DVK(KP923879532, +0.923879532511286756128183189396788286822416626); /* numerically cos(pi/8) */
Chris@42 243 DVK(KP707106781, +0.707106781186547524400844362104849039284835938); /* numerically sqrt(1/2) */
Chris@42 244 {
Chris@42 245 INT m;
Chris@42 246 R *x;
Chris@42 247 x = ii; /* only ii is used as the data base pointer; ri stays untouched */
Chris@42 248 for (m = mb, W = W + (mb * ((TWVL / VL) * 8)); m < me; m = m + VL, x = x + (VL * ms), W = W + (TWVL * 8), MAKE_VOLATILE_STRIDE(16, rs)) { /* m advances by VL per pass */
Chris@42 249 V T1, T8, T9, Tl, Ti, TE, T4, Ta, TO, TV, Td, Tm, TA, TH, Ts;
Chris@42 250 T1 = LDW(&(W[0])); /* applied directly to x[WS(rs, 1)] below */
Chris@42 251 T8 = LDW(&(W[TWVL * 2])); /* applied directly to x[WS(rs, 3)] below */
Chris@42 252 T9 = VZMUL(T1, T8);
Chris@42 253 Tl = VZMULJ(T1, T8);
Chris@42 254 Ti = LDW(&(W[TWVL * 6])); /* applied directly to x[WS(rs, 15)] below */
Chris@42 255 TE = VZMULJ(T1, Ti);
Chris@42 256 T4 = LDW(&(W[TWVL * 4])); /* applied directly to x[WS(rs, 9)] below */
Chris@42 257 Ta = VZMULJ(T9, T4);
Chris@42 258 TO = VZMUL(T8, T4);
Chris@42 259 TV = VZMULJ(T1, T4);
Chris@42 260 Td = VZMUL(T9, T4);
Chris@42 261 Tm = VZMULJ(Tl, T4);
Chris@42 262 TA = VZMUL(T1, T4);
Chris@42 263 TH = VZMULJ(T8, T4);
Chris@42 264 Ts = VZMUL(Tl, T4);
Chris@42 265 {
Chris@42 266 V TY, T1q, TR, T1r, T1m, T1n, TL, TZ, T1f, T1g, T1h, Th, T11, T1i, T1j;
Chris@42 267 V T1k, Tw, T12, TU, TX, TW;
Chris@42 268 TU = LD(&(x[0]), ms, &(x[0])); /* the only input used without a twiddle multiply */
Chris@42 269 TW = LD(&(x[WS(rs, 8)]), ms, &(x[0]));
Chris@42 270 TX = VZMUL(TV, TW);
Chris@42 271 TY = VSUB(TU, TX);
Chris@42 272 T1q = VADD(TU, TX);
Chris@42 273 {
Chris@42 274 V TN, TQ, TM, TP;
Chris@42 275 TM = LD(&(x[WS(rs, 4)]), ms, &(x[0]));
Chris@42 276 TN = VZMUL(T9, TM);
Chris@42 277 TP = LD(&(x[WS(rs, 12)]), ms, &(x[0]));
Chris@42 278 TQ = VZMUL(TO, TP);
Chris@42 279 TR = VSUB(TN, TQ);
Chris@42 280 T1r = VADD(TN, TQ);
Chris@42 281 }
Chris@42 282 {
Chris@42 283 V Tz, TJ, TC, TG, TD, TK;
Chris@42 284 {
Chris@42 285 V Ty, TI, TB, TF;
Chris@42 286 Ty = LD(&(x[WS(rs, 2)]), ms, &(x[0]));
Chris@42 287 Tz = VZMUL(Tl, Ty);
Chris@42 288 TI = LD(&(x[WS(rs, 6)]), ms, &(x[0]));
Chris@42 289 TJ = VZMUL(TH, TI);
Chris@42 290 TB = LD(&(x[WS(rs, 10)]), ms, &(x[0]));
Chris@42 291 TC = VZMUL(TA, TB);
Chris@42 292 TF = LD(&(x[WS(rs, 14)]), ms, &(x[0]));
Chris@42 293 TG = VZMUL(TE, TF);
Chris@42 294 }
Chris@42 295 T1m = VADD(Tz, TC);
Chris@42 296 T1n = VADD(TG, TJ);
Chris@42 297 TD = VSUB(Tz, TC);
Chris@42 298 TK = VSUB(TG, TJ);
Chris@42 299 TL = VMUL(LDK(KP707106781), VSUB(TD, TK));
Chris@42 300 TZ = VMUL(LDK(KP707106781), VADD(TD, TK));
Chris@42 301 }
Chris@42 302 {
Chris@42 303 V T3, Tf, T6, Tc, T7, Tg;
Chris@42 304 {
Chris@42 305 V T2, Te, T5, Tb;
Chris@42 306 T2 = LD(&(x[WS(rs, 1)]), ms, &(x[WS(rs, 1)]));
Chris@42 307 T3 = VZMUL(T1, T2);
Chris@42 308 Te = LD(&(x[WS(rs, 13)]), ms, &(x[WS(rs, 1)]));
Chris@42 309 Tf = VZMUL(Td, Te);
Chris@42 310 T5 = LD(&(x[WS(rs, 9)]), ms, &(x[WS(rs, 1)]));
Chris@42 311 T6 = VZMUL(T4, T5);
Chris@42 312 Tb = LD(&(x[WS(rs, 5)]), ms, &(x[WS(rs, 1)]));
Chris@42 313 Tc = VZMUL(Ta, Tb);
Chris@42 314 }
Chris@42 315 T1f = VADD(T3, T6);
Chris@42 316 T1g = VADD(Tc, Tf);
Chris@42 317 T1h = VSUB(T1f, T1g);
Chris@42 318 T7 = VSUB(T3, T6);
Chris@42 319 Tg = VSUB(Tc, Tf);
Chris@42 320 Th = VFNMS(LDK(KP382683432), Tg, VMUL(LDK(KP923879532), T7));
Chris@42 321 T11 = VFMA(LDK(KP382683432), T7, VMUL(LDK(KP923879532), Tg));
Chris@42 322 }
Chris@42 323 {
Chris@42 324 V Tk, Tu, To, Tr, Tp, Tv;
Chris@42 325 {
Chris@42 326 V Tj, Tt, Tn, Tq;
Chris@42 327 Tj = LD(&(x[WS(rs, 15)]), ms, &(x[WS(rs, 1)]));
Chris@42 328 Tk = VZMUL(Ti, Tj);
Chris@42 329 Tt = LD(&(x[WS(rs, 11)]), ms, &(x[WS(rs, 1)]));
Chris@42 330 Tu = VZMUL(Ts, Tt);
Chris@42 331 Tn = LD(&(x[WS(rs, 7)]), ms, &(x[WS(rs, 1)]));
Chris@42 332 To = VZMUL(Tm, Tn);
Chris@42 333 Tq = LD(&(x[WS(rs, 3)]), ms, &(x[WS(rs, 1)]));
Chris@42 334 Tr = VZMUL(T8, Tq);
Chris@42 335 }
Chris@42 336 T1i = VADD(Tk, To);
Chris@42 337 T1j = VADD(Tr, Tu);
Chris@42 338 T1k = VSUB(T1i, T1j);
Chris@42 339 Tp = VSUB(Tk, To);
Chris@42 340 Tv = VSUB(Tr, Tu);
Chris@42 341 Tw = VFMA(LDK(KP923879532), Tp, VMUL(LDK(KP382683432), Tv));
Chris@42 342 T12 = VFNMS(LDK(KP382683432), Tp, VMUL(LDK(KP923879532), Tv));
Chris@42 343 }
Chris@42 344 { /* writes outputs 2, 6, 10, 14 */
Chris@42 345 V T1p, T1v, T1u, T1w;
Chris@42 346 {
Chris@42 347 V T1l, T1o, T1s, T1t;
Chris@42 348 T1l = VMUL(LDK(KP707106781), VSUB(T1h, T1k));
Chris@42 349 T1o = VSUB(T1m, T1n);
Chris@42 350 T1p = VBYI(VSUB(T1l, T1o));
Chris@42 351 T1v = VBYI(VADD(T1o, T1l));
Chris@42 352 T1s = VSUB(T1q, T1r);
Chris@42 353 T1t = VMUL(LDK(KP707106781), VADD(T1h, T1k));
Chris@42 354 T1u = VSUB(T1s, T1t);
Chris@42 355 T1w = VADD(T1s, T1t);
Chris@42 356 }
Chris@42 357 ST(&(x[WS(rs, 6)]), VADD(T1p, T1u), ms, &(x[0]));
Chris@42 358 ST(&(x[WS(rs, 14)]), VSUB(T1w, T1v), ms, &(x[0]));
Chris@42 359 ST(&(x[WS(rs, 10)]), VSUB(T1u, T1p), ms, &(x[0]));
Chris@42 360 ST(&(x[WS(rs, 2)]), VADD(T1v, T1w), ms, &(x[0]));
Chris@42 361 }
Chris@42 362 { /* writes outputs 0, 4, 8, 12 */
Chris@42 363 V T1z, T1D, T1C, T1E;
Chris@42 364 {
Chris@42 365 V T1x, T1y, T1A, T1B;
Chris@42 366 T1x = VADD(T1q, T1r);
Chris@42 367 T1y = VADD(T1m, T1n);
Chris@42 368 T1z = VSUB(T1x, T1y);
Chris@42 369 T1D = VADD(T1x, T1y);
Chris@42 370 T1A = VADD(T1f, T1g);
Chris@42 371 T1B = VADD(T1i, T1j);
Chris@42 372 T1C = VBYI(VSUB(T1A, T1B));
Chris@42 373 T1E = VADD(T1A, T1B);
Chris@42 374 }
Chris@42 375 ST(&(x[WS(rs, 12)]), VSUB(T1z, T1C), ms, &(x[0]));
Chris@42 376 ST(&(x[0]), VADD(T1D, T1E), ms, &(x[0]));
Chris@42 377 ST(&(x[WS(rs, 4)]), VADD(T1z, T1C), ms, &(x[0]));
Chris@42 378 ST(&(x[WS(rs, 8)]), VSUB(T1D, T1E), ms, &(x[0]));
Chris@42 379 }
Chris@42 380 { /* writes outputs 3, 5, 11, 13 */
Chris@42 381 V TT, T15, T14, T16;
Chris@42 382 {
Chris@42 383 V Tx, TS, T10, T13;
Chris@42 384 Tx = VSUB(Th, Tw);
Chris@42 385 TS = VSUB(TL, TR);
Chris@42 386 TT = VBYI(VSUB(Tx, TS));
Chris@42 387 T15 = VBYI(VADD(TS, Tx));
Chris@42 388 T10 = VSUB(TY, TZ);
Chris@42 389 T13 = VSUB(T11, T12);
Chris@42 390 T14 = VSUB(T10, T13);
Chris@42 391 T16 = VADD(T10, T13);
Chris@42 392 }
Chris@42 393 ST(&(x[WS(rs, 5)]), VADD(TT, T14), ms, &(x[WS(rs, 1)]));
Chris@42 394 ST(&(x[WS(rs, 13)]), VSUB(T16, T15), ms, &(x[WS(rs, 1)]));
Chris@42 395 ST(&(x[WS(rs, 11)]), VSUB(T14, TT), ms, &(x[WS(rs, 1)]));
Chris@42 396 ST(&(x[WS(rs, 3)]), VADD(T15, T16), ms, &(x[WS(rs, 1)]));
Chris@42 397 }
Chris@42 398 { /* writes outputs 1, 7, 9, 15 */
Chris@42 399 V T19, T1d, T1c, T1e;
Chris@42 400 {
Chris@42 401 V T17, T18, T1a, T1b;
Chris@42 402 T17 = VADD(TY, TZ);
Chris@42 403 T18 = VADD(Th, Tw);
Chris@42 404 T19 = VADD(T17, T18);
Chris@42 405 T1d = VSUB(T17, T18);
Chris@42 406 T1a = VADD(TR, TL);
Chris@42 407 T1b = VADD(T11, T12);
Chris@42 408 T1c = VBYI(VADD(T1a, T1b));
Chris@42 409 T1e = VBYI(VSUB(T1b, T1a));
Chris@42 410 }
Chris@42 411 ST(&(x[WS(rs, 15)]), VSUB(T19, T1c), ms, &(x[WS(rs, 1)]));
Chris@42 412 ST(&(x[WS(rs, 7)]), VADD(T1d, T1e), ms, &(x[WS(rs, 1)]));
Chris@42 413 ST(&(x[WS(rs, 1)]), VADD(T19, T1c), ms, &(x[WS(rs, 1)]));
Chris@42 414 ST(&(x[WS(rs, 9)]), VSUB(T1d, T1e), ms, &(x[WS(rs, 1)]));
Chris@42 415 }
Chris@42 416 }
Chris@42 417 }
Chris@42 418 }
Chris@42 419 VLEAVE();
Chris@42 420 }
Chris@42 421
/*
 * Twiddle instruction list referenced from desc. The second arguments of
 * the four VTW entries (1, 3, 9, 15) are the indices of the inputs that
 * t3bv_16 multiplies directly by its four LDW-loaded twiddle vectors; the
 * list is closed by a {TW_NEXT, VL, 0} entry.
 * NOTE(review): VTW / TW_NEXT semantics are defined outside this file.
 */
Chris@42 422 static const tw_instr twinstr[] = {
Chris@42 423 VTW(0, 1),
Chris@42 424 VTW(0, 3),
Chris@42 425 VTW(0, 9),
Chris@42 426 VTW(0, 15),
Chris@42 427 {TW_NEXT, VL, 0}
Chris@42 428 };
Chris@42 429
/*
 * Codelet descriptor: 16, the name string "t3bv_16", the twiddle list and
 * &GENUS, then {94, 60, 4, 0}, which matches the addition / multiplication /
 * fused multiply-add counts quoted in the comment preceding this variant.
 * NOTE(review): ct_desc field meanings are declared elsewhere; confirm.
 */
Chris@42 430 static const ct_desc desc = { 16, XSIMD_STRING("t3bv_16"), twinstr, &GENUS, {94, 60, 4, 0}, 0, 0, 0 };
Chris@42 431
/*
 * Registration entry point: hands the t3bv_16 kernel and its descriptor to
 * the planner p by calling X(kdft_dit_register).
 */
Chris@42 432 void XSIMD(codelet_t3bv_16) (planner *p) {
Chris@42 433 X(kdft_dit_register) (p, t3bv_16, &desc);
Chris@42 434 }
Chris@42 435 #endif /* HAVE_FMA */