annotate src/fftw-3.3.5/dft/simd/common/n2fv_16.c @ 83:ae30d91d2ffe

Replace these with versions built using an older toolset (so as to avoid ABI incompatibilities when linking on Ubuntu 14.04 for packaging purposes)
author Chris Cannam
date Fri, 07 Feb 2020 11:51:13 +0000
parents 2cd0e3b3e1fd
children
rev   line source
Chris@42 1 /*
Chris@42 2 * Copyright (c) 2003, 2007-14 Matteo Frigo
Chris@42 3 * Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
Chris@42 4 *
Chris@42 5 * This program is free software; you can redistribute it and/or modify
Chris@42 6 * it under the terms of the GNU General Public License as published by
Chris@42 7 * the Free Software Foundation; either version 2 of the License, or
Chris@42 8 * (at your option) any later version.
Chris@42 9 *
Chris@42 10 * This program is distributed in the hope that it will be useful,
Chris@42 11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
Chris@42 12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
Chris@42 13 * GNU General Public License for more details.
Chris@42 14 *
Chris@42 15 * You should have received a copy of the GNU General Public License
Chris@42 16 * along with this program; if not, write to the Free Software
Chris@42 17 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
Chris@42 18 *
Chris@42 19 */
Chris@42 20
Chris@42 21 /* This file was automatically generated --- DO NOT EDIT */
Chris@42 22 /* Generated on Sat Jul 30 16:40:15 EDT 2016 */
Chris@42 23
Chris@42 24 #include "codelet-dft.h"
Chris@42 25
Chris@42 26 #ifdef HAVE_FMA
Chris@42 27
Chris@42 28 /* Generated by: ../../../genfft/gen_notw_c.native -fma -reorder-insns -schedule-for-pipeline -simd -compact -variables 4 -pipeline-latency 8 -n 16 -name n2fv_16 -with-ostride 2 -include n2f.h -store-multiple 2 */
Chris@42 29
Chris@42 30 /*
Chris@42 31 * This function contains 72 FP additions, 34 FP multiplications,
Chris@42 32 * (or, 38 additions, 0 multiplications, 34 fused multiply/add),
Chris@42 33 * 62 stack variables, 3 constants, and 40 memory accesses
Chris@42 34 */
Chris@42 35 #include "n2f.h"
Chris@42 36
/*
 * n2fv_16 (HAVE_FMA variant): generated SIMD kernel that is registered below
 * as a size-16 DFT codelet.
 *
 * Each loop iteration loads sixteen V values from xi (xi[0] and
 * xi[WS(is, k)] for k = 1..15), combines them with VADD/VSUB and the fused
 * VFMA/VFNMS/VFMAI/VFNMSI macros, and hands sixteen results to STM2 at the
 * even offsets xo[0], xo[2], ..., xo[30].
 *
 * Parameters, as used by this body:
 *   ri       - input base pointer; copied to xi.
 *   ro       - output base pointer; copied to xo.
 *   ii, io   - not referenced in this body.
 *   is       - input stride, consumed through WS(is, k).
 *   os       - only passed to MAKE_VOLATILE_STRIDE; the output offsets are
 *              the literal indices 0..30.
 *   v        - loop count; reduced by VL on every iteration.
 *   ivs, ovs - xi advances by VL * ivs and xo by VL * ovs per iteration;
 *              they are also forwarded to LD / STM2 / STN2.
 *
 * NOTE(review): LD, LDK, DVK, WS, VL, STM2, STN2, VLEAVE and the V* arithmetic
 * macros are defined in headers not shown here. Presumably VFMA(a, b, c) is
 * a*b + c, VFNMS(a, b, c) is c - a*b, and VFMAI / VFNMSI add / subtract
 * i times their first argument -- confirm against the SIMD support headers.
 */
Chris@42 37 static void n2fv_16(const R *ri, const R *ii, R *ro, R *io, stride is, stride os, INT v, INT ivs, INT ovs)
Chris@42 38 {
Chris@42 39 DVK(KP923879532, +0.923879532511286756128183189396788286822416626); /* numerically cos(pi/8) */
Chris@42 40 DVK(KP414213562, +0.414213562373095048801688724209698078569671875); /* numerically tan(pi/8) = sqrt(2) - 1 */
Chris@42 41 DVK(KP707106781, +0.707106781186547524400844362104849039284835938); /* numerically sqrt(1/2) */
Chris@42 42 {
Chris@42 43 INT i;
Chris@42 44 const R *xi;
Chris@42 45 R *xo;
Chris@42 46 xi = ri;
Chris@42 47 xo = ro;
/* Walk the v count in steps of VL, advancing both pointers each time. */
Chris@42 48 for (i = v; i > 0; i = i - VL, xi = xi + (VL * ivs), xo = xo + (VL * ovs), MAKE_VOLATILE_STRIDE(32, is), MAKE_VOLATILE_STRIDE(32, os)) {
Chris@42 49 V T7, Tu, TF, TB, T13, TL, TO, TX, TC, Te, TP, Th, TQ, Tk, TW;
Chris@42 50 V T16;
Chris@42 51 {
Chris@42 52 V TH, TU, Tz, Tf, TK, TV, TA, TM, Ta, TN, Td, Tg, Ti, Tj;
Chris@42 53 {
Chris@42 54 V T1, T2, T4, T5, To, Tp, Tr, Ts;
/* Even-indexed inputs 0, 8, 4, 12, 14, 6, 2, 10. */
Chris@42 55 T1 = LD(&(xi[0]), ivs, &(xi[0]));
Chris@42 56 T2 = LD(&(xi[WS(is, 8)]), ivs, &(xi[0]));
Chris@42 57 T4 = LD(&(xi[WS(is, 4)]), ivs, &(xi[0]));
Chris@42 58 T5 = LD(&(xi[WS(is, 12)]), ivs, &(xi[0]));
Chris@42 59 To = LD(&(xi[WS(is, 14)]), ivs, &(xi[0]));
Chris@42 60 Tp = LD(&(xi[WS(is, 6)]), ivs, &(xi[0]));
Chris@42 61 Tr = LD(&(xi[WS(is, 2)]), ivs, &(xi[0]));
Chris@42 62 Ts = LD(&(xi[WS(is, 10)]), ivs, &(xi[0]));
Chris@42 63 {
Chris@42 64 V T8, TJ, Tq, TI, Tt, T9, Tb, Tc, T3, T6;
/* Odd-indexed loads (1, 9, 5, 13, 15, 7, 3, 11) are interleaved with the
 * sums and differences of the even inputs. */
Chris@42 65 T8 = LD(&(xi[WS(is, 1)]), ivs, &(xi[WS(is, 1)]));
Chris@42 66 TH = VSUB(T1, T2);
Chris@42 67 T3 = VADD(T1, T2);
Chris@42 68 TU = VSUB(T4, T5);
Chris@42 69 T6 = VADD(T4, T5);
Chris@42 70 TJ = VSUB(To, Tp);
Chris@42 71 Tq = VADD(To, Tp);
Chris@42 72 TI = VSUB(Tr, Ts);
Chris@42 73 Tt = VADD(Tr, Ts);
Chris@42 74 T9 = LD(&(xi[WS(is, 9)]), ivs, &(xi[WS(is, 1)]));
Chris@42 75 Tb = LD(&(xi[WS(is, 5)]), ivs, &(xi[WS(is, 1)]));
Chris@42 76 Tc = LD(&(xi[WS(is, 13)]), ivs, &(xi[WS(is, 1)]));
Chris@42 77 T7 = VSUB(T3, T6);
Chris@42 78 Tz = VADD(T3, T6);
Chris@42 79 Tf = LD(&(xi[WS(is, 15)]), ivs, &(xi[WS(is, 1)]));
Chris@42 80 TK = VADD(TI, TJ);
Chris@42 81 TV = VSUB(TJ, TI);
Chris@42 82 TA = VADD(Tt, Tq);
Chris@42 83 Tu = VSUB(Tq, Tt);
Chris@42 84 TM = VSUB(T8, T9);
Chris@42 85 Ta = VADD(T8, T9);
Chris@42 86 TN = VSUB(Tb, Tc);
Chris@42 87 Td = VADD(Tb, Tc);
Chris@42 88 Tg = LD(&(xi[WS(is, 7)]), ivs, &(xi[WS(is, 1)]));
Chris@42 89 Ti = LD(&(xi[WS(is, 3)]), ivs, &(xi[WS(is, 1)]));
Chris@42 90 Tj = LD(&(xi[WS(is, 11)]), ivs, &(xi[WS(is, 1)]));
Chris@42 91 }
Chris@42 92 }
/* Intermediate combinations that outlive this scope (declared at loop level). */
Chris@42 93 TF = VSUB(Tz, TA);
Chris@42 94 TB = VADD(Tz, TA);
Chris@42 95 T13 = VFNMS(LDK(KP707106781), TK, TH);
Chris@42 96 TL = VFMA(LDK(KP707106781), TK, TH);
Chris@42 97 TO = VFNMS(LDK(KP414213562), TN, TM);
Chris@42 98 TX = VFMA(LDK(KP414213562), TM, TN);
Chris@42 99 TC = VADD(Ta, Td);
Chris@42 100 Te = VSUB(Ta, Td);
Chris@42 101 TP = VSUB(Tf, Tg);
Chris@42 102 Th = VADD(Tf, Tg);
Chris@42 103 TQ = VSUB(Tj, Ti);
Chris@42 104 Tk = VADD(Ti, Tj);
Chris@42 105 TW = VFNMS(LDK(KP707106781), TV, TU);
Chris@42 106 T16 = VFMA(LDK(KP707106781), TV, TU);
Chris@42 107 }
Chris@42 108 {
Chris@42 109 V TY, TR, Tl, TD;
Chris@42 110 TY = VFMA(LDK(KP414213562), TP, TQ);
Chris@42 111 TR = VFNMS(LDK(KP414213562), TQ, TP);
Chris@42 112 Tl = VSUB(Th, Tk);
Chris@42 113 TD = VADD(Th, Tk);
Chris@42 114 {
Chris@42 115 V TS, T17, TZ, T14;
Chris@42 116 TS = VADD(TO, TR);
Chris@42 117 T17 = VSUB(TR, TO);
Chris@42 118 TZ = VSUB(TX, TY);
Chris@42 119 T14 = VADD(TX, TY);
Chris@42 120 {
Chris@42 121 V TE, TG, Tm, Tv;
Chris@42 122 TE = VADD(TC, TD);
Chris@42 123 TG = VSUB(TD, TC);
Chris@42 124 Tm = VADD(Te, Tl);
Chris@42 125 Tv = VSUB(Tl, Te);
Chris@42 126 {
Chris@42 127 V T18, T1a, TT, T11;
Chris@42 128 T18 = VFNMS(LDK(KP923879532), T17, T16);
Chris@42 129 T1a = VFMA(LDK(KP923879532), T17, T16);
Chris@42 130 TT = VFNMS(LDK(KP923879532), TS, TL);
Chris@42 131 T11 = VFMA(LDK(KP923879532), TS, TL);
Chris@42 132 {
Chris@42 133 V T15, T19, T10, T12;
Chris@42 134 T15 = VFNMS(LDK(KP923879532), T14, T13);
Chris@42 135 T19 = VFMA(LDK(KP923879532), T14, T13);
Chris@42 136 T10 = VFNMS(LDK(KP923879532), TZ, TW);
Chris@42 137 T12 = VFMA(LDK(KP923879532), TZ, TW);
Chris@42 138 {
Chris@42 139 V T1b, T1c, T1d, T1e;
/* Outputs at offsets 8, 24, 0 and 16. Every result is first passed to STM2;
 * later each STN2(&(xo[4k]), a, b, ovs) receives the two values that were
 * given to STM2 for xo[4k] and xo[4k + 2].
 * NOTE(review): presumably only one of STM2 / STN2 performs the real store
 * on a given SIMD backend -- confirm in the SIMD headers. */
Chris@42 140 T1b = VFMAI(TG, TF);
Chris@42 141 STM2(&(xo[8]), T1b, ovs, &(xo[0]));
Chris@42 142 T1c = VFNMSI(TG, TF);
Chris@42 143 STM2(&(xo[24]), T1c, ovs, &(xo[0]));
Chris@42 144 T1d = VADD(TB, TE);
Chris@42 145 STM2(&(xo[0]), T1d, ovs, &(xo[0]));
Chris@42 146 T1e = VSUB(TB, TE);
Chris@42 147 STM2(&(xo[16]), T1e, ovs, &(xo[0]));
Chris@42 148 {
Chris@42 149 V Tw, Ty, Tn, Tx;
Chris@42 150 Tw = VFNMS(LDK(KP707106781), Tv, Tu);
Chris@42 151 Ty = VFMA(LDK(KP707106781), Tv, Tu);
Chris@42 152 Tn = VFNMS(LDK(KP707106781), Tm, T7);
Chris@42 153 Tx = VFMA(LDK(KP707106781), Tm, T7);
Chris@42 154 {
Chris@42 155 V T1f, T1g, T1h, T1i;
/* Outputs at offsets 6, 26, 22 and 10; pairs (24, 26) and (8, 10) complete here. */
Chris@42 156 T1f = VFMAI(T1a, T19);
Chris@42 157 STM2(&(xo[6]), T1f, ovs, &(xo[2]));
Chris@42 158 T1g = VFNMSI(T1a, T19);
Chris@42 159 STM2(&(xo[26]), T1g, ovs, &(xo[2]));
Chris@42 160 STN2(&(xo[24]), T1c, T1g, ovs);
Chris@42 161 T1h = VFMAI(T18, T15);
Chris@42 162 STM2(&(xo[22]), T1h, ovs, &(xo[2]));
Chris@42 163 T1i = VFNMSI(T18, T15);
Chris@42 164 STM2(&(xo[10]), T1i, ovs, &(xo[2]));
Chris@42 165 STN2(&(xo[8]), T1b, T1i, ovs);
Chris@42 166 {
Chris@42 167 V T1j, T1k, T1l, T1m;
/* Outputs at offsets 2, 30, 14 and 18; pairs (0, 2) and (16, 18) complete here. */
Chris@42 168 T1j = VFNMSI(T12, T11);
Chris@42 169 STM2(&(xo[2]), T1j, ovs, &(xo[2]));
Chris@42 170 STN2(&(xo[0]), T1d, T1j, ovs);
Chris@42 171 T1k = VFMAI(T12, T11);
Chris@42 172 STM2(&(xo[30]), T1k, ovs, &(xo[2]));
Chris@42 173 T1l = VFMAI(T10, TT);
Chris@42 174 STM2(&(xo[14]), T1l, ovs, &(xo[2]));
Chris@42 175 T1m = VFNMSI(T10, TT);
Chris@42 176 STM2(&(xo[18]), T1m, ovs, &(xo[2]));
Chris@42 177 STN2(&(xo[16]), T1e, T1m, ovs);
Chris@42 178 {
Chris@42 179 V T1n, T1o, T1p, T1q;
/* Outputs at offsets 28, 4, 20 and 12; each is paired at once with the
 * value already produced for the following even offset (30, 6, 22, 14). */
Chris@42 180 T1n = VFNMSI(Ty, Tx);
Chris@42 181 STM2(&(xo[28]), T1n, ovs, &(xo[0]));
Chris@42 182 STN2(&(xo[28]), T1n, T1k, ovs);
Chris@42 183 T1o = VFMAI(Ty, Tx);
Chris@42 184 STM2(&(xo[4]), T1o, ovs, &(xo[0]));
Chris@42 185 STN2(&(xo[4]), T1o, T1f, ovs);
Chris@42 186 T1p = VFMAI(Tw, Tn);
Chris@42 187 STM2(&(xo[20]), T1p, ovs, &(xo[0]));
Chris@42 188 STN2(&(xo[20]), T1p, T1h, ovs);
Chris@42 189 T1q = VFNMSI(Tw, Tn);
Chris@42 190 STM2(&(xo[12]), T1q, ovs, &(xo[0]));
Chris@42 191 STN2(&(xo[12]), T1q, T1l, ovs);
Chris@42 192 }
Chris@42 193 }
Chris@42 194 }
Chris@42 195 }
Chris@42 196 }
Chris@42 197 }
Chris@42 198 }
Chris@42 199 }
Chris@42 200 }
Chris@42 201 }
Chris@42 202 }
Chris@42 203 }
Chris@42 204 VLEAVE(); /* invoked once, after the loop has finished */
Chris@42 205 }
Chris@42 206
/*
 * Descriptor handed to X(kdft_register) together with n2fv_16 (HAVE_FMA
 * variant). Visible fields: 16, the name string "n2fv_16", the tuple
 * {38, 0, 34, 0} (same numbers as the generator's count comment: 38 additions,
 * 0 multiplications, 34 fused multiply/add), &GENUS, then 0, 2, 0, 0.
 * NOTE(review): the trailing four integers are presumably stride constraints,
 * with the 2 matching the generator's "-with-ostride 2" -- confirm against
 * the kdft_desc declaration.
 */
Chris@42 207 static const kdft_desc desc = { 16, XSIMD_STRING("n2fv_16"), {38, 0, 34, 0}, &GENUS, 0, 2, 0, 0 };
Chris@42 208
/*
 * Registration entry point (HAVE_FMA variant): passes the kernel n2fv_16 and
 * its descriptor desc to X(kdft_register) for planner p. Not static, so it is
 * the only symbol from this variant visible outside the file (its final name
 * is produced by the XSIMD macro).
 */
Chris@42 209 void XSIMD(codelet_n2fv_16) (planner *p) {
Chris@42 210 X(kdft_register) (p, n2fv_16, &desc);
Chris@42 211 }
Chris@42 212
Chris@42 213 #else /* HAVE_FMA */
Chris@42 214
Chris@42 215 /* Generated by: ../../../genfft/gen_notw_c.native -simd -compact -variables 4 -pipeline-latency 8 -n 16 -name n2fv_16 -with-ostride 2 -include n2f.h -store-multiple 2 */
Chris@42 216
Chris@42 217 /*
Chris@42 218 * This function contains 72 FP additions, 12 FP multiplications,
Chris@42 219 * (or, 68 additions, 8 multiplications, 4 fused multiply/add),
Chris@42 220 * 38 stack variables, 3 constants, and 40 memory accesses
Chris@42 221 */
Chris@42 222 #include "n2f.h"
Chris@42 223
/*
 * n2fv_16 (variant compiled when HAVE_FMA is not defined): generated SIMD
 * kernel that is registered below as a size-16 DFT codelet.
 *
 * Each loop iteration loads sixteen V values from xi (xi[0] and
 * xi[WS(is, k)] for k = 1..15) in four groups of four, combines them with
 * VADD/VSUB/VMUL/VBYI plus four VFMA/VFNMS uses, and hands sixteen results
 * to STM2 at the even offsets xo[0], xo[2], ..., xo[30].
 *
 * Parameters, as used by this body:
 *   ri       - input base pointer; copied to xi.
 *   ro       - output base pointer; copied to xo.
 *   ii, io   - not referenced in this body.
 *   is       - input stride, consumed through WS(is, k).
 *   os       - only passed to MAKE_VOLATILE_STRIDE; the output offsets are
 *              the literal indices 0..30.
 *   v        - loop count; reduced by VL on every iteration.
 *   ivs, ovs - xi advances by VL * ivs and xo by VL * ovs per iteration;
 *              they are also forwarded to LD / STM2 / STN2.
 *
 * NOTE(review): LD, LDK, DVK, WS, VL, STM2, STN2, VLEAVE and the V* arithmetic
 * macros are defined in headers not shown here. Presumably VBYI multiplies by
 * i, VFMA(a, b, c) is a*b + c and VFNMS(a, b, c) is c - a*b -- confirm against
 * the SIMD support headers.
 */
Chris@42 224 static void n2fv_16(const R *ri, const R *ii, R *ro, R *io, stride is, stride os, INT v, INT ivs, INT ovs)
Chris@42 225 {
Chris@42 226 DVK(KP923879532, +0.923879532511286756128183189396788286822416626); /* numerically cos(pi/8) */
Chris@42 227 DVK(KP382683432, +0.382683432365089771728459984030398866761344562); /* numerically sin(pi/8) */
Chris@42 228 DVK(KP707106781, +0.707106781186547524400844362104849039284835938); /* numerically sqrt(1/2) */
Chris@42 229 {
Chris@42 230 INT i;
Chris@42 231 const R *xi;
Chris@42 232 R *xo;
Chris@42 233 xi = ri;
Chris@42 234 xo = ro;
/* Walk the v count in steps of VL, advancing both pointers each time. */
Chris@42 235 for (i = v; i > 0; i = i - VL, xi = xi + (VL * ivs), xo = xo + (VL * ovs), MAKE_VOLATILE_STRIDE(32, is), MAKE_VOLATILE_STRIDE(32, os)) {
Chris@42 236 V Tp, T13, Tu, TN, Tm, T14, Tv, TY, T7, T17, Ty, TT, Te, T16, Tx;
Chris@42 237 V TQ;
Chris@42 238 {
/* Group 1: inputs 4, 12, 0, 8. */
Chris@42 239 V Tn, To, TM, Ts, Tt, TL;
Chris@42 240 Tn = LD(&(xi[WS(is, 4)]), ivs, &(xi[0]));
Chris@42 241 To = LD(&(xi[WS(is, 12)]), ivs, &(xi[0]));
Chris@42 242 TM = VADD(Tn, To);
Chris@42 243 Ts = LD(&(xi[0]), ivs, &(xi[0]));
Chris@42 244 Tt = LD(&(xi[WS(is, 8)]), ivs, &(xi[0]));
Chris@42 245 TL = VADD(Ts, Tt);
Chris@42 246 Tp = VSUB(Tn, To);
Chris@42 247 T13 = VADD(TL, TM);
Chris@42 248 Tu = VSUB(Ts, Tt);
Chris@42 249 TN = VSUB(TL, TM);
Chris@42 250 }
Chris@42 251 {
/* Group 2: inputs 14, 6, 2, 10; the differences are scaled by KP707106781. */
Chris@42 252 V Ti, TW, Tl, TX;
Chris@42 253 {
Chris@42 254 V Tg, Th, Tj, Tk;
Chris@42 255 Tg = LD(&(xi[WS(is, 14)]), ivs, &(xi[0]));
Chris@42 256 Th = LD(&(xi[WS(is, 6)]), ivs, &(xi[0]));
Chris@42 257 Ti = VSUB(Tg, Th);
Chris@42 258 TW = VADD(Tg, Th);
Chris@42 259 Tj = LD(&(xi[WS(is, 2)]), ivs, &(xi[0]));
Chris@42 260 Tk = LD(&(xi[WS(is, 10)]), ivs, &(xi[0]));
Chris@42 261 Tl = VSUB(Tj, Tk);
Chris@42 262 TX = VADD(Tj, Tk);
Chris@42 263 }
Chris@42 264 Tm = VMUL(LDK(KP707106781), VSUB(Ti, Tl));
Chris@42 265 T14 = VADD(TX, TW);
Chris@42 266 Tv = VMUL(LDK(KP707106781), VADD(Tl, Ti));
Chris@42 267 TY = VSUB(TW, TX);
Chris@42 268 }
Chris@42 269 {
/* Group 3: inputs 15, 7, 3, 11; the differences are mixed using
 * KP923879532 and KP382683432. */
Chris@42 270 V T3, TR, T6, TS;
Chris@42 271 {
Chris@42 272 V T1, T2, T4, T5;
Chris@42 273 T1 = LD(&(xi[WS(is, 15)]), ivs, &(xi[WS(is, 1)]));
Chris@42 274 T2 = LD(&(xi[WS(is, 7)]), ivs, &(xi[WS(is, 1)]));
Chris@42 275 T3 = VSUB(T1, T2);
Chris@42 276 TR = VADD(T1, T2);
Chris@42 277 T4 = LD(&(xi[WS(is, 3)]), ivs, &(xi[WS(is, 1)]));
Chris@42 278 T5 = LD(&(xi[WS(is, 11)]), ivs, &(xi[WS(is, 1)]));
Chris@42 279 T6 = VSUB(T4, T5);
Chris@42 280 TS = VADD(T4, T5);
Chris@42 281 }
Chris@42 282 T7 = VFNMS(LDK(KP923879532), T6, VMUL(LDK(KP382683432), T3));
Chris@42 283 T17 = VADD(TR, TS);
Chris@42 284 Ty = VFMA(LDK(KP923879532), T3, VMUL(LDK(KP382683432), T6));
Chris@42 285 TT = VSUB(TR, TS);
Chris@42 286 }
Chris@42 287 {
/* Group 4: inputs 1, 9, 5, 13; the differences are mixed using
 * KP382683432 and KP923879532. */
Chris@42 288 V Ta, TO, Td, TP;
Chris@42 289 {
Chris@42 290 V T8, T9, Tb, Tc;
Chris@42 291 T8 = LD(&(xi[WS(is, 1)]), ivs, &(xi[WS(is, 1)]));
Chris@42 292 T9 = LD(&(xi[WS(is, 9)]), ivs, &(xi[WS(is, 1)]));
Chris@42 293 Ta = VSUB(T8, T9);
Chris@42 294 TO = VADD(T8, T9);
Chris@42 295 Tb = LD(&(xi[WS(is, 5)]), ivs, &(xi[WS(is, 1)]));
Chris@42 296 Tc = LD(&(xi[WS(is, 13)]), ivs, &(xi[WS(is, 1)]));
Chris@42 297 Td = VSUB(Tb, Tc);
Chris@42 298 TP = VADD(Tb, Tc);
Chris@42 299 }
Chris@42 300 Te = VFMA(LDK(KP382683432), Ta, VMUL(LDK(KP923879532), Td));
Chris@42 301 T16 = VADD(TO, TP);
Chris@42 302 Tx = VFNMS(LDK(KP382683432), Td, VMUL(LDK(KP923879532), Ta));
Chris@42 303 TQ = VSUB(TO, TP);
Chris@42 304 }
Chris@42 305 {
Chris@42 306 V T1b, T1c, T1d, T1e;
Chris@42 307 {
/* Outputs at offsets 16, 0, 24 and 8. Every result is first passed to STM2;
 * later each STN2(&(xo[4k]), a, b, ovs) receives the two values that were
 * given to STM2 for xo[4k] and xo[4k + 2].
 * NOTE(review): presumably only one of STM2 / STN2 performs the real store
 * on a given SIMD backend -- confirm in the SIMD headers. */
Chris@42 308 V T15, T18, T19, T1a;
Chris@42 309 T15 = VADD(T13, T14);
Chris@42 310 T18 = VADD(T16, T17);
Chris@42 311 T1b = VSUB(T15, T18);
Chris@42 312 STM2(&(xo[16]), T1b, ovs, &(xo[0]));
Chris@42 313 T1c = VADD(T15, T18);
Chris@42 314 STM2(&(xo[0]), T1c, ovs, &(xo[0]));
Chris@42 315 T19 = VSUB(T13, T14);
Chris@42 316 T1a = VBYI(VSUB(T17, T16));
Chris@42 317 T1d = VSUB(T19, T1a);
Chris@42 318 STM2(&(xo[24]), T1d, ovs, &(xo[0]));
Chris@42 319 T1e = VADD(T19, T1a);
Chris@42 320 STM2(&(xo[8]), T1e, ovs, &(xo[0]));
Chris@42 321 }
Chris@42 322 {
Chris@42 323 V T1f, T1g, T1h, T1i;
Chris@42 324 {
/* Outputs at offsets 28, 12, 4 and 20. */
Chris@42 325 V TV, T11, T10, T12, TU, TZ;
Chris@42 326 TU = VMUL(LDK(KP707106781), VADD(TQ, TT));
Chris@42 327 TV = VADD(TN, TU);
Chris@42 328 T11 = VSUB(TN, TU);
Chris@42 329 TZ = VMUL(LDK(KP707106781), VSUB(TT, TQ));
Chris@42 330 T10 = VBYI(VADD(TY, TZ));
Chris@42 331 T12 = VBYI(VSUB(TZ, TY));
Chris@42 332 T1f = VSUB(TV, T10);
Chris@42 333 STM2(&(xo[28]), T1f, ovs, &(xo[0]));
Chris@42 334 T1g = VADD(T11, T12);
Chris@42 335 STM2(&(xo[12]), T1g, ovs, &(xo[0]));
Chris@42 336 T1h = VADD(TV, T10);
Chris@42 337 STM2(&(xo[4]), T1h, ovs, &(xo[0]));
Chris@42 338 T1i = VSUB(T11, T12);
Chris@42 339 STM2(&(xo[20]), T1i, ovs, &(xo[0]));
Chris@42 340 }
Chris@42 341 {
Chris@42 342 V Tr, TB, TA, TC;
Chris@42 343 {
Chris@42 344 V Tf, Tq, Tw, Tz;
Chris@42 345 Tf = VSUB(T7, Te);
Chris@42 346 Tq = VSUB(Tm, Tp);
Chris@42 347 Tr = VBYI(VSUB(Tf, Tq));
Chris@42 348 TB = VBYI(VADD(Tq, Tf));
Chris@42 349 Tw = VADD(Tu, Tv);
Chris@42 350 Tz = VADD(Tx, Ty);
Chris@42 351 TA = VSUB(Tw, Tz);
Chris@42 352 TC = VADD(Tw, Tz);
Chris@42 353 }
Chris@42 354 {
/* Outputs at offsets 14, 30, 18 and 2; each completes the pair that starts
 * at the preceding multiple of four (12, 28, 16, 0). */
Chris@42 355 V T1j, T1k, T1l, T1m;
Chris@42 356 T1j = VADD(Tr, TA);
Chris@42 357 STM2(&(xo[14]), T1j, ovs, &(xo[2]));
Chris@42 358 STN2(&(xo[12]), T1g, T1j, ovs);
Chris@42 359 T1k = VSUB(TC, TB);
Chris@42 360 STM2(&(xo[30]), T1k, ovs, &(xo[2]));
Chris@42 361 STN2(&(xo[28]), T1f, T1k, ovs);
Chris@42 362 T1l = VSUB(TA, Tr);
Chris@42 363 STM2(&(xo[18]), T1l, ovs, &(xo[2]));
Chris@42 364 STN2(&(xo[16]), T1b, T1l, ovs);
Chris@42 365 T1m = VADD(TB, TC);
Chris@42 366 STM2(&(xo[2]), T1m, ovs, &(xo[2]));
Chris@42 367 STN2(&(xo[0]), T1c, T1m, ovs);
Chris@42 368 }
Chris@42 369 }
Chris@42 370 {
Chris@42 371 V TF, TJ, TI, TK;
Chris@42 372 {
Chris@42 373 V TD, TE, TG, TH;
Chris@42 374 TD = VSUB(Tu, Tv);
Chris@42 375 TE = VADD(Te, T7);
Chris@42 376 TF = VADD(TD, TE);
Chris@42 377 TJ = VSUB(TD, TE);
Chris@42 378 TG = VADD(Tp, Tm);
Chris@42 379 TH = VSUB(Ty, Tx);
Chris@42 380 TI = VBYI(VADD(TG, TH));
Chris@42 381 TK = VBYI(VSUB(TH, TG));
Chris@42 382 }
Chris@42 383 {
/* Outputs at offsets 26, 10, 6 and 22; each completes the pair that starts
 * at the preceding multiple of four (24, 8, 4, 20). */
Chris@42 384 V T1n, T1o, T1p, T1q;
Chris@42 385 T1n = VSUB(TF, TI);
Chris@42 386 STM2(&(xo[26]), T1n, ovs, &(xo[2]));
Chris@42 387 STN2(&(xo[24]), T1d, T1n, ovs);
Chris@42 388 T1o = VADD(TJ, TK);
Chris@42 389 STM2(&(xo[10]), T1o, ovs, &(xo[2]));
Chris@42 390 STN2(&(xo[8]), T1e, T1o, ovs);
Chris@42 391 T1p = VADD(TF, TI);
Chris@42 392 STM2(&(xo[6]), T1p, ovs, &(xo[2]));
Chris@42 393 STN2(&(xo[4]), T1h, T1p, ovs);
Chris@42 394 T1q = VSUB(TJ, TK);
Chris@42 395 STM2(&(xo[22]), T1q, ovs, &(xo[2]));
Chris@42 396 STN2(&(xo[20]), T1i, T1q, ovs);
Chris@42 397 }
Chris@42 398 }
Chris@42 399 }
Chris@42 400 }
Chris@42 401 }
Chris@42 402 }
Chris@42 403 VLEAVE(); /* invoked once, after the loop has finished */
Chris@42 404 }
Chris@42 405
/*
 * Descriptor handed to X(kdft_register) together with n2fv_16 (variant used
 * when HAVE_FMA is not defined). Visible fields: 16, the name string
 * "n2fv_16", the tuple {68, 8, 4, 0} (same numbers as the generator's count
 * comment: 68 additions, 8 multiplications, 4 fused multiply/add), &GENUS,
 * then 0, 2, 0, 0.
 * NOTE(review): the trailing four integers are presumably stride constraints,
 * with the 2 matching the generator's "-with-ostride 2" -- confirm against
 * the kdft_desc declaration.
 */
Chris@42 406 static const kdft_desc desc = { 16, XSIMD_STRING("n2fv_16"), {68, 8, 4, 0}, &GENUS, 0, 2, 0, 0 };
Chris@42 407
/*
 * Registration entry point (variant used when HAVE_FMA is not defined):
 * passes the kernel n2fv_16 and its descriptor desc to X(kdft_register) for
 * planner p. Not static, so it is the only symbol from this variant visible
 * outside the file (its final name is produced by the XSIMD macro).
 */
Chris@42 408 void XSIMD(codelet_n2fv_16) (planner *p) {
Chris@42 409 X(kdft_register) (p, n2fv_16, &desc);
Chris@42 410 }
Chris@42 411
Chris@42 412 #endif /* HAVE_FMA */