src/fftw-3.3.8/dft/simd/common/t3bv_16.c @ 83:ae30d91d2ffe

"Replace these with versions built using an older toolset (so as to avoid ABI incompatibilities when linking on Ubuntu 14.04 for packaging purposes)"
author: Chris Cannam
date: Fri, 07 Feb 2020 11:51:13 +0000
/*
 * Copyright (c) 2003, 2007-14 Matteo Frigo
 * Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 *
 */

/* This file was automatically generated --- DO NOT EDIT */
/* Generated on Thu May 24 08:06:06 EDT 2018 */

#include "dft/codelet-dft.h"

#if defined(ARCH_PREFERS_FMA) || defined(ISA_EXTENSION_PREFERS_FMA)

/* Generated by: ../../../genfft/gen_twiddle_c.native -fma -simd -compact -variables 4 -pipeline-latency 8 -twiddle-log3 -precompute-twiddles -no-generate-bytw -n 16 -name t3bv_16 -include dft/simd/t3b.h -sign 1 */
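/* Flag notes (editorial): -n 16 -sign 1 requests a size-16 codelet with
   exponent sign +1, i.e. the backward transform (the "b" in t3bv_16);
   -fma favours fused multiply-add operations (the VFMA/VFNMS calls below);
   and -twiddle-log3 -precompute-twiddles stores only the twiddle powers
   w^1, w^3, w^9, w^15 and derives the rest at run time. */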

/*
 * This function contains 98 FP additions, 86 FP multiplications,
 * (or, 64 additions, 52 multiplications, 34 fused multiply/add),
 * 51 stack variables, 3 constants, and 32 memory accesses
 */
#include "dft/simd/t3b.h"

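/* Editorial sketch: the generated code below is fully unrolled and hard to
   read, so here is a plain-C restatement of what one iteration computes for
   a single transform -- a size-16 decimation-in-time twiddle step with
   exponent sign +1.  The flat complex array, the full 16-entry twiddle
   vector, and the function name are illustrative assumptions, not FFTW
   API; the real codelet processes VL strided transforms at once. */
#if 0 /* illustrative sketch only; never compiled */
#include <complex.h>
#include <math.h>
static void t3bv_16_sketch(double complex *x, const double complex *w)
{
     double complex t[16];
     int j, k;
     /* twiddle step: x[0] has twiddle 1, the rest are rotated by w[k] */
     t[0] = x[0];
     for (k = 1; k < 16; ++k)
          t[k] = w[k] * x[k];
     /* naive size-16 DFT with exponent sign +1 (the backward direction) */
     for (k = 0; k < 16; ++k) {
          double complex s = 0.0;
          for (j = 0; j < 16; ++j)
               s += t[j] * cexp(2.0 * M_PI * I * (double) (j * k) / 16.0);
          x[k] = s;
     }
}
#endif
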
static void t3bv_16(R *ri, R *ii, const R *W, stride rs, INT mb, INT me, INT ms)
{
     DVK(KP923879532, +0.923879532511286756128183189396788286822416626);
     DVK(KP707106781, +0.707106781186547524400844362104849039284835938);
     DVK(KP414213562, +0.414213562373095048801688724209698078569671875);
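     /* Editorial note: these constants are cos(pi/8), cos(pi/4), and
        tan(pi/8) respectively -- the values needed for radix-16
        butterflies in FMA form. */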
     {
          INT m;
          R *x;
          x = ii;
          for (m = mb, W = W + (mb * ((TWVL / VL) * 8)); m < me; m = m + VL, x = x + (VL * ms), W = W + (TWVL * 8), MAKE_VOLATILE_STRIDE(16, rs)) {
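               /* Editorial note: T2, T8, T3, Tu below load the stored
                  twiddles w^1, w^3, w^9, w^15; the VZMUL/VZMULJ products
                  reconstruct the remaining powers w^2 ... w^14 (VZMULJ
                  conjugates its first argument, giving power differences). */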
               V T2, T8, T9, Tx, Tu, TR, T3, T4, TN, TU, Tc, Tm, Ty, TE, Tp;
               T2 = LDW(&(W[0]));
               T8 = LDW(&(W[TWVL * 2]));
               T9 = VZMUL(T2, T8);
               Tx = VZMULJ(T2, T8);
               Tu = LDW(&(W[TWVL * 6]));
               TR = VZMULJ(T2, Tu);
               T3 = LDW(&(W[TWVL * 4]));
               T4 = VZMULJ(T2, T3);
               TN = VZMUL(T2, T3);
               TU = VZMULJ(T8, T3);
               Tc = VZMUL(T8, T3);
               Tm = VZMULJ(T9, T3);
               Ty = VZMULJ(Tx, T3);
               TE = VZMUL(Tx, T3);
               Tp = VZMUL(T9, T3);
               {
                    V T7, T1b, Tf, T1o, TQ, TX, T1e, T1p, Tl, Ts, Tt, T1i, T1r, TB, TH;
                    V TI, T1l, T1s, T1, T6, T5;
                    T1 = LD(&(x[0]), ms, &(x[0]));
                    T5 = LD(&(x[WS(rs, 8)]), ms, &(x[0]));
                    T6 = VZMUL(T4, T5);
                    T7 = VADD(T1, T6);
                    T1b = VSUB(T1, T6);
                    {
                         V Tb, Te, Ta, Td;
                         Ta = LD(&(x[WS(rs, 4)]), ms, &(x[0]));
                         Tb = VZMUL(T9, Ta);
                         Td = LD(&(x[WS(rs, 12)]), ms, &(x[0]));
                         Te = VZMUL(Tc, Td);
                         Tf = VADD(Tb, Te);
                         T1o = VSUB(Tb, Te);
                    }
                    {
                         V TM, TW, TP, TT, T1c, T1d;
                         {
                              V TL, TV, TO, TS;
                              TL = LD(&(x[WS(rs, 2)]), ms, &(x[0]));
                              TM = VZMUL(Tx, TL);
                              TV = LD(&(x[WS(rs, 6)]), ms, &(x[0]));
                              TW = VZMUL(TU, TV);
                              TO = LD(&(x[WS(rs, 10)]), ms, &(x[0]));
                              TP = VZMUL(TN, TO);
                              TS = LD(&(x[WS(rs, 14)]), ms, &(x[0]));
                              TT = VZMUL(TR, TS);
                         }
                         TQ = VADD(TM, TP);
                         TX = VADD(TT, TW);
                         T1c = VSUB(TM, TP);
                         T1d = VSUB(TT, TW);
                         T1e = VADD(T1c, T1d);
                         T1p = VSUB(T1c, T1d);
                    }
                    {
                         V Ti, Tr, Tk, To, T1g, T1h;
                         {
                              V Th, Tq, Tj, Tn;
                              Th = LD(&(x[WS(rs, 1)]), ms, &(x[WS(rs, 1)]));
                              Ti = VZMUL(T2, Th);
                              Tq = LD(&(x[WS(rs, 13)]), ms, &(x[WS(rs, 1)]));
                              Tr = VZMUL(Tp, Tq);
                              Tj = LD(&(x[WS(rs, 9)]), ms, &(x[WS(rs, 1)]));
                              Tk = VZMUL(T3, Tj);
                              Tn = LD(&(x[WS(rs, 5)]), ms, &(x[WS(rs, 1)]));
                              To = VZMUL(Tm, Tn);
                         }
                         Tl = VADD(Ti, Tk);
                         Ts = VADD(To, Tr);
                         Tt = VSUB(Tl, Ts);
                         T1g = VSUB(Ti, Tk);
                         T1h = VSUB(To, Tr);
                         T1i = VFNMS(LDK(KP414213562), T1h, T1g);
                         T1r = VFMA(LDK(KP414213562), T1g, T1h);
                    }
                    {
                         V Tw, TG, TA, TD, T1j, T1k;
                         {
                              V Tv, TF, Tz, TC;
                              Tv = LD(&(x[WS(rs, 15)]), ms, &(x[WS(rs, 1)]));
                              Tw = VZMUL(Tu, Tv);
                              TF = LD(&(x[WS(rs, 11)]), ms, &(x[WS(rs, 1)]));
                              TG = VZMUL(TE, TF);
                              Tz = LD(&(x[WS(rs, 7)]), ms, &(x[WS(rs, 1)]));
                              TA = VZMUL(Ty, Tz);
                              TC = LD(&(x[WS(rs, 3)]), ms, &(x[WS(rs, 1)]));
                              TD = VZMUL(T8, TC);
                         }
                         TB = VADD(Tw, TA);
                         TH = VADD(TD, TG);
                         TI = VSUB(TB, TH);
                         T1j = VSUB(Tw, TA);
                         T1k = VSUB(TG, TD);
                         T1l = VFNMS(LDK(KP414213562), T1k, T1j);
                         T1s = VFMA(LDK(KP414213562), T1j, T1k);
                    }
                    {
                         V TK, T11, T10, T12;
                         {
                              V Tg, TJ, TY, TZ;
                              Tg = VSUB(T7, Tf);
                              TJ = VADD(Tt, TI);
                              TK = VFNMS(LDK(KP707106781), TJ, Tg);
                              T11 = VFMA(LDK(KP707106781), TJ, Tg);
                              TY = VSUB(TQ, TX);
                              TZ = VSUB(Tt, TI);
                              T10 = VFNMS(LDK(KP707106781), TZ, TY);
                              T12 = VFMA(LDK(KP707106781), TZ, TY);
                         }
                         ST(&(x[WS(rs, 6)]), VFNMSI(T10, TK), ms, &(x[0]));
                         ST(&(x[WS(rs, 14)]), VFNMSI(T12, T11), ms, &(x[0]));
                         ST(&(x[WS(rs, 10)]), VFMAI(T10, TK), ms, &(x[0]));
                         ST(&(x[WS(rs, 2)]), VFMAI(T12, T11), ms, &(x[0]));
                    }
                    {
                         V T1z, T1D, T1C, T1E;
                         {
                              V T1x, T1y, T1A, T1B;
                              T1x = VFNMS(LDK(KP707106781), T1e, T1b);
                              T1y = VADD(T1r, T1s);
                              T1z = VFNMS(LDK(KP923879532), T1y, T1x);
                              T1D = VFMA(LDK(KP923879532), T1y, T1x);
                              T1A = VFNMS(LDK(KP707106781), T1p, T1o);
                              T1B = VSUB(T1i, T1l);
                              T1C = VFMA(LDK(KP923879532), T1B, T1A);
                              T1E = VFNMS(LDK(KP923879532), T1B, T1A);
                         }
                         ST(&(x[WS(rs, 5)]), VFMAI(T1C, T1z), ms, &(x[WS(rs, 1)]));
                         ST(&(x[WS(rs, 13)]), VFMAI(T1E, T1D), ms, &(x[WS(rs, 1)]));
                         ST(&(x[WS(rs, 11)]), VFNMSI(T1C, T1z), ms, &(x[WS(rs, 1)]));
                         ST(&(x[WS(rs, 3)]), VFNMSI(T1E, T1D), ms, &(x[WS(rs, 1)]));
                    }
                    {
                         V T15, T19, T18, T1a;
                         {
                              V T13, T14, T16, T17;
                              T13 = VADD(T7, Tf);
                              T14 = VADD(TQ, TX);
                              T15 = VSUB(T13, T14);
                              T19 = VADD(T13, T14);
                              T16 = VADD(Tl, Ts);
                              T17 = VADD(TB, TH);
                              T18 = VSUB(T16, T17);
                              T1a = VADD(T16, T17);
                         }
                         ST(&(x[WS(rs, 12)]), VFNMSI(T18, T15), ms, &(x[0]));
                         ST(&(x[0]), VADD(T19, T1a), ms, &(x[0]));
                         ST(&(x[WS(rs, 4)]), VFMAI(T18, T15), ms, &(x[0]));
                         ST(&(x[WS(rs, 8)]), VSUB(T19, T1a), ms, &(x[0]));
                    }
                    {
                         V T1n, T1v, T1u, T1w;
                         {
                              V T1f, T1m, T1q, T1t;
                              T1f = VFMA(LDK(KP707106781), T1e, T1b);
                              T1m = VADD(T1i, T1l);
                              T1n = VFNMS(LDK(KP923879532), T1m, T1f);
                              T1v = VFMA(LDK(KP923879532), T1m, T1f);
                              T1q = VFMA(LDK(KP707106781), T1p, T1o);
                              T1t = VSUB(T1r, T1s);
                              T1u = VFNMS(LDK(KP923879532), T1t, T1q);
                              T1w = VFMA(LDK(KP923879532), T1t, T1q);
                         }
                         ST(&(x[WS(rs, 7)]), VFNMSI(T1u, T1n), ms, &(x[WS(rs, 1)]));
                         ST(&(x[WS(rs, 1)]), VFMAI(T1w, T1v), ms, &(x[WS(rs, 1)]));
                         ST(&(x[WS(rs, 9)]), VFMAI(T1u, T1n), ms, &(x[WS(rs, 1)]));
                         ST(&(x[WS(rs, 15)]), VFNMSI(T1w, T1v), ms, &(x[WS(rs, 1)]));
                    }
               }
          }
     }
     VLEAVE();
}

static const tw_instr twinstr[] = {
     VTW(0, 1),
     VTW(0, 3),
     VTW(0, 9),
     VTW(0, 15),
     {TW_NEXT, VL, 0}
};
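
/* Editorial note: as we read it, each VTW(0, n) entry requests the
   precomputed twiddle power w^n for each transform in a vector batch,
   and {TW_NEXT, VL, 0} terminates the list, advancing to the next batch
   of VL transforms. */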

static const ct_desc desc = { 16, XSIMD_STRING("t3bv_16"), twinstr, &GENUS, {64, 52, 34, 0}, 0, 0, 0 };
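/* Editorial note: {64, 52, 34, 0} are the addition, multiplication, and
   fused multiply-add counts quoted in the header comment; the planner
   uses them when costing this codelet against alternatives. */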

void XSIMD(codelet_t3bv_16) (planner *p) {
     X(kdft_dit_register) (p, t3bv_16, &desc);
}
#else

/* Generated by: ../../../genfft/gen_twiddle_c.native -simd -compact -variables 4 -pipeline-latency 8 -twiddle-log3 -precompute-twiddles -no-generate-bytw -n 16 -name t3bv_16 -include dft/simd/t3b.h -sign 1 */
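/* Editorial note: this fallback variant is generated with the same options
   minus -fma, so the same 98 additions are paired with explicit VMUL
   multiplies (plus a handful of FMAs) instead of the FMA-heavy schedule
   above. */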

/*
 * This function contains 98 FP additions, 64 FP multiplications,
 * (or, 94 additions, 60 multiplications, 4 fused multiply/add),
 * 51 stack variables, 3 constants, and 32 memory accesses
 */
#include "dft/simd/t3b.h"

static void t3bv_16(R *ri, R *ii, const R *W, stride rs, INT mb, INT me, INT ms)
{
     DVK(KP382683432, +0.382683432365089771728459984030398866761344562);
     DVK(KP923879532, +0.923879532511286756128183189396788286822416626);
     DVK(KP707106781, +0.707106781186547524400844362104849039284835938);
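     /* Editorial note: sin(pi/8), cos(pi/8), and cos(pi/4) -- the non-FMA
        schedule uses the sin/cos pair directly instead of tan(pi/8). */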
     {
          INT m;
          R *x;
          x = ii;
          for (m = mb, W = W + (mb * ((TWVL / VL) * 8)); m < me; m = m + VL, x = x + (VL * ms), W = W + (TWVL * 8), MAKE_VOLATILE_STRIDE(16, rs)) {
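               /* Editorial note: same twiddle-loading scheme as the FMA
                  variant above -- w^1, w^3, w^9, w^15 are loaded and the
                  remaining powers derived by VZMUL/VZMULJ products. */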
               V T1, T8, T9, Tl, Ti, TE, T4, Ta, TO, TV, Td, Tm, TA, TH, Ts;
               T1 = LDW(&(W[0]));
               T8 = LDW(&(W[TWVL * 2]));
               T9 = VZMUL(T1, T8);
               Tl = VZMULJ(T1, T8);
               Ti = LDW(&(W[TWVL * 6]));
               TE = VZMULJ(T1, Ti);
               T4 = LDW(&(W[TWVL * 4]));
               Ta = VZMULJ(T9, T4);
               TO = VZMUL(T8, T4);
               TV = VZMULJ(T1, T4);
               Td = VZMUL(T9, T4);
               Tm = VZMULJ(Tl, T4);
               TA = VZMUL(T1, T4);
               TH = VZMULJ(T8, T4);
               Ts = VZMUL(Tl, T4);
               {
                    V TY, T1q, TR, T1r, T1m, T1n, TL, TZ, T1f, T1g, T1h, Th, T11, T1i, T1j;
                    V T1k, Tw, T12, TU, TX, TW;
                    TU = LD(&(x[0]), ms, &(x[0]));
                    TW = LD(&(x[WS(rs, 8)]), ms, &(x[0]));
                    TX = VZMUL(TV, TW);
                    TY = VSUB(TU, TX);
                    T1q = VADD(TU, TX);
                    {
                         V TN, TQ, TM, TP;
                         TM = LD(&(x[WS(rs, 4)]), ms, &(x[0]));
                         TN = VZMUL(T9, TM);
                         TP = LD(&(x[WS(rs, 12)]), ms, &(x[0]));
                         TQ = VZMUL(TO, TP);
                         TR = VSUB(TN, TQ);
                         T1r = VADD(TN, TQ);
                    }
                    {
                         V Tz, TJ, TC, TG, TD, TK;
                         {
                              V Ty, TI, TB, TF;
                              Ty = LD(&(x[WS(rs, 2)]), ms, &(x[0]));
                              Tz = VZMUL(Tl, Ty);
                              TI = LD(&(x[WS(rs, 6)]), ms, &(x[0]));
                              TJ = VZMUL(TH, TI);
                              TB = LD(&(x[WS(rs, 10)]), ms, &(x[0]));
                              TC = VZMUL(TA, TB);
                              TF = LD(&(x[WS(rs, 14)]), ms, &(x[0]));
                              TG = VZMUL(TE, TF);
                         }
                         T1m = VADD(Tz, TC);
                         T1n = VADD(TG, TJ);
                         TD = VSUB(Tz, TC);
                         TK = VSUB(TG, TJ);
                         TL = VMUL(LDK(KP707106781), VSUB(TD, TK));
                         TZ = VMUL(LDK(KP707106781), VADD(TD, TK));
                    }
                    {
                         V T3, Tf, T6, Tc, T7, Tg;
                         {
                              V T2, Te, T5, Tb;
                              T2 = LD(&(x[WS(rs, 1)]), ms, &(x[WS(rs, 1)]));
                              T3 = VZMUL(T1, T2);
                              Te = LD(&(x[WS(rs, 13)]), ms, &(x[WS(rs, 1)]));
                              Tf = VZMUL(Td, Te);
                              T5 = LD(&(x[WS(rs, 9)]), ms, &(x[WS(rs, 1)]));
                              T6 = VZMUL(T4, T5);
                              Tb = LD(&(x[WS(rs, 5)]), ms, &(x[WS(rs, 1)]));
                              Tc = VZMUL(Ta, Tb);
                         }
                         T1f = VADD(T3, T6);
                         T1g = VADD(Tc, Tf);
                         T1h = VSUB(T1f, T1g);
                         T7 = VSUB(T3, T6);
                         Tg = VSUB(Tc, Tf);
                         Th = VFNMS(LDK(KP382683432), Tg, VMUL(LDK(KP923879532), T7));
                         T11 = VFMA(LDK(KP382683432), T7, VMUL(LDK(KP923879532), Tg));
                    }
                    {
                         V Tk, Tu, To, Tr, Tp, Tv;
                         {
                              V Tj, Tt, Tn, Tq;
                              Tj = LD(&(x[WS(rs, 15)]), ms, &(x[WS(rs, 1)]));
                              Tk = VZMUL(Ti, Tj);
                              Tt = LD(&(x[WS(rs, 11)]), ms, &(x[WS(rs, 1)]));
                              Tu = VZMUL(Ts, Tt);
                              Tn = LD(&(x[WS(rs, 7)]), ms, &(x[WS(rs, 1)]));
                              To = VZMUL(Tm, Tn);
                              Tq = LD(&(x[WS(rs, 3)]), ms, &(x[WS(rs, 1)]));
                              Tr = VZMUL(T8, Tq);
                         }
                         T1i = VADD(Tk, To);
                         T1j = VADD(Tr, Tu);
                         T1k = VSUB(T1i, T1j);
                         Tp = VSUB(Tk, To);
                         Tv = VSUB(Tr, Tu);
                         Tw = VFMA(LDK(KP923879532), Tp, VMUL(LDK(KP382683432), Tv));
                         T12 = VFNMS(LDK(KP382683432), Tp, VMUL(LDK(KP923879532), Tv));
                    }
                    {
                         V T1p, T1v, T1u, T1w;
                         {
                              V T1l, T1o, T1s, T1t;
                              T1l = VMUL(LDK(KP707106781), VSUB(T1h, T1k));
                              T1o = VSUB(T1m, T1n);
                              T1p = VBYI(VSUB(T1l, T1o));
                              T1v = VBYI(VADD(T1o, T1l));
                              T1s = VSUB(T1q, T1r);
                              T1t = VMUL(LDK(KP707106781), VADD(T1h, T1k));
                              T1u = VSUB(T1s, T1t);
                              T1w = VADD(T1s, T1t);
                         }
                         ST(&(x[WS(rs, 6)]), VADD(T1p, T1u), ms, &(x[0]));
                         ST(&(x[WS(rs, 14)]), VSUB(T1w, T1v), ms, &(x[0]));
                         ST(&(x[WS(rs, 10)]), VSUB(T1u, T1p), ms, &(x[0]));
                         ST(&(x[WS(rs, 2)]), VADD(T1v, T1w), ms, &(x[0]));
                    }
                    {
                         V T1z, T1D, T1C, T1E;
                         {
                              V T1x, T1y, T1A, T1B;
                              T1x = VADD(T1q, T1r);
                              T1y = VADD(T1m, T1n);
                              T1z = VSUB(T1x, T1y);
                              T1D = VADD(T1x, T1y);
                              T1A = VADD(T1f, T1g);
                              T1B = VADD(T1i, T1j);
                              T1C = VBYI(VSUB(T1A, T1B));
                              T1E = VADD(T1A, T1B);
                         }
                         ST(&(x[WS(rs, 12)]), VSUB(T1z, T1C), ms, &(x[0]));
                         ST(&(x[0]), VADD(T1D, T1E), ms, &(x[0]));
                         ST(&(x[WS(rs, 4)]), VADD(T1z, T1C), ms, &(x[0]));
                         ST(&(x[WS(rs, 8)]), VSUB(T1D, T1E), ms, &(x[0]));
                    }
                    {
                         V TT, T15, T14, T16;
                         {
                              V Tx, TS, T10, T13;
                              Tx = VSUB(Th, Tw);
                              TS = VSUB(TL, TR);
                              TT = VBYI(VSUB(Tx, TS));
                              T15 = VBYI(VADD(TS, Tx));
                              T10 = VSUB(TY, TZ);
                              T13 = VSUB(T11, T12);
                              T14 = VSUB(T10, T13);
                              T16 = VADD(T10, T13);
                         }
                         ST(&(x[WS(rs, 5)]), VADD(TT, T14), ms, &(x[WS(rs, 1)]));
                         ST(&(x[WS(rs, 13)]), VSUB(T16, T15), ms, &(x[WS(rs, 1)]));
                         ST(&(x[WS(rs, 11)]), VSUB(T14, TT), ms, &(x[WS(rs, 1)]));
                         ST(&(x[WS(rs, 3)]), VADD(T15, T16), ms, &(x[WS(rs, 1)]));
                    }
                    {
                         V T19, T1d, T1c, T1e;
                         {
                              V T17, T18, T1a, T1b;
                              T17 = VADD(TY, TZ);
                              T18 = VADD(Th, Tw);
                              T19 = VADD(T17, T18);
                              T1d = VSUB(T17, T18);
                              T1a = VADD(TR, TL);
                              T1b = VADD(T11, T12);
                              T1c = VBYI(VADD(T1a, T1b));
                              T1e = VBYI(VSUB(T1b, T1a));
                         }
                         ST(&(x[WS(rs, 15)]), VSUB(T19, T1c), ms, &(x[WS(rs, 1)]));
                         ST(&(x[WS(rs, 7)]), VADD(T1d, T1e), ms, &(x[WS(rs, 1)]));
                         ST(&(x[WS(rs, 1)]), VADD(T19, T1c), ms, &(x[WS(rs, 1)]));
                         ST(&(x[WS(rs, 9)]), VSUB(T1d, T1e), ms, &(x[WS(rs, 1)]));
                    }
               }
          }
     }
     VLEAVE();
}

static const tw_instr twinstr[] = {
     VTW(0, 1),
     VTW(0, 3),
     VTW(0, 9),
     VTW(0, 15),
     {TW_NEXT, VL, 0}
};

static const ct_desc desc = { 16, XSIMD_STRING("t3bv_16"), twinstr, &GENUS, {94, 60, 4, 0}, 0, 0, 0 };
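/* Editorial note: {94, 60, 4, 0} again mirrors the operation counts in
   this variant's header comment. */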

void XSIMD(codelet_t3bv_16) (planner *p) {
     X(kdft_dit_register) (p, t3bv_16, &desc);
}
#endif