annotate src/fftw-3.3.8/dft/simd/common/n2fv_20.c @ 83:ae30d91d2ffe

Replace these with versions built using an older toolset (so as to avoid ABI incompatibilities when linking on Ubuntu 14.04 for packaging purposes)
author Chris Cannam
date Fri, 07 Feb 2020 11:51:13 +0000
parents d0c2a83c1364
children
rev   line source
Chris@82 1 /*
Chris@82 2 * Copyright (c) 2003, 2007-14 Matteo Frigo
Chris@82 3 * Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
Chris@82 4 *
Chris@82 5 * This program is free software; you can redistribute it and/or modify
Chris@82 6 * it under the terms of the GNU General Public License as published by
Chris@82 7 * the Free Software Foundation; either version 2 of the License, or
Chris@82 8 * (at your option) any later version.
Chris@82 9 *
Chris@82 10 * This program is distributed in the hope that it will be useful,
Chris@82 11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
Chris@82 12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
Chris@82 13 * GNU General Public License for more details.
Chris@82 14 *
Chris@82 15 * You should have received a copy of the GNU General Public License
Chris@82 16 * along with this program; if not, write to the Free Software
Chris@82 17 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
Chris@82 18 *
Chris@82 19 */
Chris@82 20
Chris@82 21 /* This file was automatically generated --- DO NOT EDIT */
Chris@82 22 /* Generated on Thu May 24 08:05:11 EDT 2018 */
Chris@82 23
Chris@82 24 #include "dft/codelet-dft.h"
Chris@82 25
Chris@82 26 #if defined(ARCH_PREFERS_FMA) || defined(ISA_EXTENSION_PREFERS_FMA)
Chris@82 27
Chris@82 28 /* Generated by: ../../../genfft/gen_notw_c.native -fma -simd -compact -variables 4 -pipeline-latency 8 -n 20 -name n2fv_20 -with-ostride 2 -include dft/simd/n2f.h -store-multiple 2 */
Chris@82 29
Chris@82 30 /*
Chris@82 31 * This function contains 104 FP additions, 50 FP multiplications,
Chris@82 32 * (or, 58 additions, 4 multiplications, 46 fused multiply/add),
Chris@82 33 * 57 stack variables, 4 constants, and 50 memory accesses
Chris@82 34 */
Chris@82 35 #include "dft/simd/n2f.h"
Chris@82 36
/*
 * n2fv_20, FMA variant: this definition sits in the branch selected when
 * ARCH_PREFERS_FMA or ISA_EXTENSION_PREFERS_FMA is defined.  It is
 * machine-generated (genfft, -n 20 -with-ostride 2 -store-multiple 2), so
 * the statements below are left exactly as generated.
 *
 * What the visible code does:
 *  - xi starts at ri and xo starts at ro; ii and io are not referenced
 *    directly anywhere in this body.
 *  - Each loop iteration loads 20 vectors from xi[WS(is, 0)] through
 *    xi[WS(is, 19)] and passes 20 result vectors to STM2 at the literal
 *    even offsets xo[0], xo[2], ..., xo[38].
 *  - os is referenced only inside MAKE_VOLATILE_STRIDE; the output
 *    offsets themselves are compile-time constants.
 *  - i counts down from v in steps of VL, while xi and xo advance by
 *    VL * ivs and VL * ovs respectively.
 *
 * NOTE(review): LD, STM2, STN2, VFMA, VFNMS, VFMAI, VFNMSI, DVK, LDK, WS
 * and VLEAVE come from the included FFTW SIMD headers and are not visible
 * here.  Presumably VFMAI(b, c) is c + i*b and VFNMSI(b, c) is c - i*b
 * (the non-FMA variant of this codelet in the #else branch forms the
 * xo[10]/xo[30] outputs with VBYI plus VSUB/VADD) -- confirm against the
 * SIMD support headers.
 */
Chris@82 37 static void n2fv_20(const R *ri, const R *ii, R *ro, R *io, stride is, stride os, INT v, INT ivs, INT ovs)
Chris@82 38 {
/* Numeric constants: KP559016994 ~= sqrt(5)/4, KP618033988 ~= (sqrt(5)-1)/2,
 * KP951056516 ~= sin(2*pi/5), KP250000000 = 1/4. */
Chris@82 39 DVK(KP559016994, +0.559016994374947424102293417182819058860154590);
Chris@82 40 DVK(KP618033988, +0.618033988749894848204586834365638117720309180);
Chris@82 41 DVK(KP951056516, +0.951056516295153572116439333379382143405698634);
Chris@82 42 DVK(KP250000000, +0.250000000000000000000000000000000000000000000);
Chris@82 43 {
Chris@82 44 INT i;
Chris@82 45 const R *xi;
Chris@82 46 R *xo;
Chris@82 47 xi = ri;
Chris@82 48 xo = ro;
/* One pass per group of VL transforms; the pointer updates and the
 * MAKE_VOLATILE_STRIDE calls all live in the for-increment expression. */
Chris@82 49 for (i = v; i > 0; i = i - VL, xi = xi + (VL * ivs), xo = xo + (VL * ovs), MAKE_VOLATILE_STRIDE(40, is), MAKE_VOLATILE_STRIDE(40, os)) {
/* Intermediates that outlive the load blocks and feed the output stages. */
Chris@82 50 V T3, T1r, Tm, T13, TG, TN, TO, TH, T16, T19, T1a, T1v, T1w, T1x, T1s;
Chris@82 51 V T1t, T1u, T1d, T1g, T1h, Ti, TE, TB, TL;
/* Inputs 0/10 and 5/15: pairwise sums (T11, T12) and differences (T3, Tm),
 * then the sum (T1r) and difference (T13) of the two pair sums. */
Chris@82 52 {
Chris@82 53 V T1, T2, T11, Tk, Tl, T12;
Chris@82 54 T1 = LD(&(xi[0]), ivs, &(xi[0]));
Chris@82 55 T2 = LD(&(xi[WS(is, 10)]), ivs, &(xi[0]));
Chris@82 56 T11 = VADD(T1, T2);
Chris@82 57 Tk = LD(&(xi[WS(is, 5)]), ivs, &(xi[WS(is, 1)]));
Chris@82 58 Tl = LD(&(xi[WS(is, 15)]), ivs, &(xi[WS(is, 1)]));
Chris@82 59 T12 = VADD(Tk, Tl);
Chris@82 60 T3 = VSUB(T1, T2);
Chris@82 61 T1r = VADD(T11, T12);
Chris@82 62 Tm = VSUB(Tk, Tl);
Chris@82 63 T13 = VSUB(T11, T12);
Chris@82 64 }
/* Remaining 16 inputs, loaded as eight pairs whose indices differ by 10
 * (4/14, 13/3, 17/7, 16/6, 8/18, 9/19, 1/11, 12/2).  Each pair produces
 * one difference and one sum. */
Chris@82 65 {
Chris@82 66 V T6, T14, Tw, T1c, Tz, T1f, T9, T17, Td, T1b, Tp, T15, Ts, T18, Tg;
Chris@82 67 V T1e;
Chris@82 68 {
Chris@82 69 V T4, T5, Tu, Tv;
Chris@82 70 T4 = LD(&(xi[WS(is, 4)]), ivs, &(xi[0]));
Chris@82 71 T5 = LD(&(xi[WS(is, 14)]), ivs, &(xi[0]));
Chris@82 72 T6 = VSUB(T4, T5);
Chris@82 73 T14 = VADD(T4, T5);
Chris@82 74 Tu = LD(&(xi[WS(is, 13)]), ivs, &(xi[WS(is, 1)]));
Chris@82 75 Tv = LD(&(xi[WS(is, 3)]), ivs, &(xi[WS(is, 1)]));
Chris@82 76 Tw = VSUB(Tu, Tv);
Chris@82 77 T1c = VADD(Tu, Tv);
Chris@82 78 }
Chris@82 79 {
Chris@82 80 V Tx, Ty, T7, T8;
Chris@82 81 Tx = LD(&(xi[WS(is, 17)]), ivs, &(xi[WS(is, 1)]));
Chris@82 82 Ty = LD(&(xi[WS(is, 7)]), ivs, &(xi[WS(is, 1)]));
Chris@82 83 Tz = VSUB(Tx, Ty);
Chris@82 84 T1f = VADD(Tx, Ty);
Chris@82 85 T7 = LD(&(xi[WS(is, 16)]), ivs, &(xi[0]));
Chris@82 86 T8 = LD(&(xi[WS(is, 6)]), ivs, &(xi[0]));
Chris@82 87 T9 = VSUB(T7, T8);
Chris@82 88 T17 = VADD(T7, T8);
Chris@82 89 }
Chris@82 90 {
Chris@82 91 V Tb, Tc, Tn, To;
Chris@82 92 Tb = LD(&(xi[WS(is, 8)]), ivs, &(xi[0]));
Chris@82 93 Tc = LD(&(xi[WS(is, 18)]), ivs, &(xi[0]));
Chris@82 94 Td = VSUB(Tb, Tc);
Chris@82 95 T1b = VADD(Tb, Tc);
Chris@82 96 Tn = LD(&(xi[WS(is, 9)]), ivs, &(xi[WS(is, 1)]));
Chris@82 97 To = LD(&(xi[WS(is, 19)]), ivs, &(xi[WS(is, 1)]));
Chris@82 98 Tp = VSUB(Tn, To);
Chris@82 99 T15 = VADD(Tn, To);
Chris@82 100 }
Chris@82 101 {
Chris@82 102 V Tq, Tr, Te, Tf;
Chris@82 103 Tq = LD(&(xi[WS(is, 1)]), ivs, &(xi[WS(is, 1)]));
Chris@82 104 Tr = LD(&(xi[WS(is, 11)]), ivs, &(xi[WS(is, 1)]));
Chris@82 105 Ts = VSUB(Tq, Tr);
Chris@82 106 T18 = VADD(Tq, Tr);
Chris@82 107 Te = LD(&(xi[WS(is, 12)]), ivs, &(xi[0]));
Chris@82 108 Tf = LD(&(xi[WS(is, 2)]), ivs, &(xi[0]));
Chris@82 109 Tg = VSUB(Te, Tf);
Chris@82 110 T1e = VADD(Te, Tf);
Chris@82 111 }
/* Second-level combinations of the pair sums and differences; these are
 * the values consumed by the output stages after this block closes. */
Chris@82 112 TG = VSUB(Ts, Tp);
Chris@82 113 TN = VSUB(T6, T9);
Chris@82 114 TO = VSUB(Td, Tg);
Chris@82 115 TH = VSUB(Tz, Tw);
Chris@82 116 T16 = VSUB(T14, T15);
Chris@82 117 T19 = VSUB(T17, T18);
Chris@82 118 T1a = VADD(T16, T19);
Chris@82 119 T1v = VADD(T1b, T1c);
Chris@82 120 T1w = VADD(T1e, T1f);
Chris@82 121 T1x = VADD(T1v, T1w);
Chris@82 122 T1s = VADD(T14, T15);
Chris@82 123 T1t = VADD(T17, T18);
Chris@82 124 T1u = VADD(T1s, T1t);
Chris@82 125 T1d = VSUB(T1b, T1c);
Chris@82 126 T1g = VSUB(T1e, T1f);
Chris@82 127 T1h = VADD(T1d, T1g);
Chris@82 128 {
Chris@82 129 V Ta, Th, Tt, TA;
Chris@82 130 Ta = VADD(T6, T9);
Chris@82 131 Th = VADD(Td, Tg);
Chris@82 132 Ti = VADD(Ta, Th);
Chris@82 133 TE = VSUB(Ta, Th);
Chris@82 134 Tt = VADD(Tp, Ts);
Chris@82 135 TA = VADD(Tw, Tz);
Chris@82 136 TB = VADD(Tt, TA);
Chris@82 137 TL = VSUB(TA, Tt);
Chris@82 138 }
Chris@82 139 }
/* Output stage.  Every result is passed to STM2 at its own offset, and
 * each STN2 call below receives the base offset of an adjacent pair plus
 * the two values already passed to STM2 for offsets base and base + 2.
 * The T1H..T1S temporaries are kept alive until their partner is ready. */
Chris@82 140 {
Chris@82 141 V T1I, T1J, T1K, T1L, T1N, T1H, Tj, TC;
/* Outputs at xo[10] and xo[30], built from T3 + Ti and Tm + TB. */
Chris@82 142 Tj = VADD(T3, Ti);
Chris@82 143 TC = VADD(Tm, TB);
Chris@82 144 T1H = VFNMSI(TC, Tj);
Chris@82 145 STM2(&(xo[10]), T1H, ovs, &(xo[2]));
Chris@82 146 T1I = VFMAI(TC, Tj);
Chris@82 147 STM2(&(xo[30]), T1I, ovs, &(xo[2]));
/* Outputs at xo[0], xo[16], xo[24], xo[8], xo[32], built from T1r and the
 * T1s/T1t/T1v/T1w sums. */
Chris@82 148 {
Chris@82 149 V T1A, T1y, T1z, T1E, T1G, T1C, T1D, T1F, T1B, T1M;
Chris@82 150 T1A = VSUB(T1u, T1x);
Chris@82 151 T1y = VADD(T1u, T1x);
Chris@82 152 T1z = VFNMS(LDK(KP250000000), T1y, T1r);
Chris@82 153 T1C = VSUB(T1s, T1t);
Chris@82 154 T1D = VSUB(T1v, T1w);
Chris@82 155 T1E = VMUL(LDK(KP951056516), VFMA(LDK(KP618033988), T1D, T1C));
Chris@82 156 T1G = VMUL(LDK(KP951056516), VFNMS(LDK(KP618033988), T1C, T1D));
Chris@82 157 T1J = VADD(T1r, T1y);
Chris@82 158 STM2(&(xo[0]), T1J, ovs, &(xo[0]));
Chris@82 159 T1F = VFNMS(LDK(KP559016994), T1A, T1z);
Chris@82 160 T1K = VFNMSI(T1G, T1F);
Chris@82 161 STM2(&(xo[16]), T1K, ovs, &(xo[0]));
Chris@82 162 T1L = VFMAI(T1G, T1F);
Chris@82 163 STM2(&(xo[24]), T1L, ovs, &(xo[0]));
Chris@82 164 T1B = VFMA(LDK(KP559016994), T1A, T1z);
Chris@82 165 T1M = VFMAI(T1E, T1B);
Chris@82 166 STM2(&(xo[8]), T1M, ovs, &(xo[0]));
Chris@82 167 STN2(&(xo[8]), T1M, T1H, ovs);
Chris@82 168 T1N = VFNMSI(T1E, T1B);
Chris@82 169 STM2(&(xo[32]), T1N, ovs, &(xo[0]));
Chris@82 170 }
Chris@82 171 {
Chris@82 172 V T1O, T1P, T1R, T1S;
/* Outputs at xo[20], xo[12], xo[28], xo[4], xo[36], built from T13 and the
 * T16/T19/T1d/T1g differences. */
Chris@82 173 {
Chris@82 174 V T1k, T1i, T1j, T1o, T1q, T1m, T1n, T1p, T1Q, T1l;
Chris@82 175 T1k = VSUB(T1a, T1h);
Chris@82 176 T1i = VADD(T1a, T1h);
Chris@82 177 T1j = VFNMS(LDK(KP250000000), T1i, T13);
Chris@82 178 T1m = VSUB(T1d, T1g);
Chris@82 179 T1n = VSUB(T16, T19);
Chris@82 180 T1o = VMUL(LDK(KP951056516), VFNMS(LDK(KP618033988), T1n, T1m));
Chris@82 181 T1q = VMUL(LDK(KP951056516), VFMA(LDK(KP618033988), T1m, T1n));
Chris@82 182 T1O = VADD(T13, T1i);
Chris@82 183 STM2(&(xo[20]), T1O, ovs, &(xo[0]));
Chris@82 184 T1p = VFMA(LDK(KP559016994), T1k, T1j);
Chris@82 185 T1P = VFNMSI(T1q, T1p);
Chris@82 186 STM2(&(xo[12]), T1P, ovs, &(xo[0]));
Chris@82 187 T1Q = VFMAI(T1q, T1p);
Chris@82 188 STM2(&(xo[28]), T1Q, ovs, &(xo[0]));
Chris@82 189 STN2(&(xo[28]), T1Q, T1I, ovs);
Chris@82 190 T1l = VFNMS(LDK(KP559016994), T1k, T1j);
Chris@82 191 T1R = VFMAI(T1o, T1l);
Chris@82 192 STM2(&(xo[4]), T1R, ovs, &(xo[0]));
Chris@82 193 T1S = VFNMSI(T1o, T1l);
Chris@82 194 STM2(&(xo[36]), T1S, ovs, &(xo[0]));
Chris@82 195 }
/* Remaining eight outputs (xo[2], xo[38], xo[26], xo[14], xo[18], xo[22],
 * xo[34], xo[6]), built from T3, Tm, Ti, TB, TE, TL and TG/TH/TN/TO.
 * Each one completes a pair, so every STM2 here is followed by an STN2. */
Chris@82 196 {
Chris@82 197 V TI, TP, TX, TU, TM, TW, TF, TT, TK, TD;
Chris@82 198 TI = VFMA(LDK(KP618033988), TH, TG);
Chris@82 199 TP = VFMA(LDK(KP618033988), TO, TN);
Chris@82 200 TX = VFNMS(LDK(KP618033988), TN, TO);
Chris@82 201 TU = VFNMS(LDK(KP618033988), TG, TH);
Chris@82 202 TK = VFNMS(LDK(KP250000000), TB, Tm);
Chris@82 203 TM = VFNMS(LDK(KP559016994), TL, TK);
Chris@82 204 TW = VFMA(LDK(KP559016994), TL, TK);
Chris@82 205 TD = VFNMS(LDK(KP250000000), Ti, T3);
Chris@82 206 TF = VFMA(LDK(KP559016994), TE, TD);
Chris@82 207 TT = VFNMS(LDK(KP559016994), TE, TD);
Chris@82 208 {
Chris@82 209 V TJ, TQ, T1T, T1U;
Chris@82 210 TJ = VFMA(LDK(KP951056516), TI, TF);
Chris@82 211 TQ = VFMA(LDK(KP951056516), TP, TM);
Chris@82 212 T1T = VFNMSI(TQ, TJ);
Chris@82 213 STM2(&(xo[2]), T1T, ovs, &(xo[2]));
Chris@82 214 STN2(&(xo[0]), T1J, T1T, ovs);
Chris@82 215 T1U = VFMAI(TQ, TJ);
Chris@82 216 STM2(&(xo[38]), T1U, ovs, &(xo[2]));
Chris@82 217 STN2(&(xo[36]), T1S, T1U, ovs);
Chris@82 218 }
Chris@82 219 {
Chris@82 220 V TZ, T10, T1V, T1W;
Chris@82 221 TZ = VFMA(LDK(KP951056516), TU, TT);
Chris@82 222 T10 = VFMA(LDK(KP951056516), TX, TW);
Chris@82 223 T1V = VFNMSI(T10, TZ);
Chris@82 224 STM2(&(xo[26]), T1V, ovs, &(xo[2]));
Chris@82 225 STN2(&(xo[24]), T1L, T1V, ovs);
Chris@82 226 T1W = VFMAI(T10, TZ);
Chris@82 227 STM2(&(xo[14]), T1W, ovs, &(xo[2]));
Chris@82 228 STN2(&(xo[12]), T1P, T1W, ovs);
Chris@82 229 }
Chris@82 230 {
Chris@82 231 V TR, TS, T1X, T1Y;
Chris@82 232 TR = VFNMS(LDK(KP951056516), TI, TF);
Chris@82 233 TS = VFNMS(LDK(KP951056516), TP, TM);
Chris@82 234 T1X = VFNMSI(TS, TR);
Chris@82 235 STM2(&(xo[18]), T1X, ovs, &(xo[2]));
Chris@82 236 STN2(&(xo[16]), T1K, T1X, ovs);
Chris@82 237 T1Y = VFMAI(TS, TR);
Chris@82 238 STM2(&(xo[22]), T1Y, ovs, &(xo[2]));
Chris@82 239 STN2(&(xo[20]), T1O, T1Y, ovs);
Chris@82 240 }
Chris@82 241 {
Chris@82 242 V TV, TY, T1Z, T20;
Chris@82 243 TV = VFNMS(LDK(KP951056516), TU, TT);
Chris@82 244 TY = VFNMS(LDK(KP951056516), TX, TW);
Chris@82 245 T1Z = VFNMSI(TY, TV);
Chris@82 246 STM2(&(xo[34]), T1Z, ovs, &(xo[2]));
Chris@82 247 STN2(&(xo[32]), T1N, T1Z, ovs);
Chris@82 248 T20 = VFMAI(TY, TV);
Chris@82 249 STM2(&(xo[6]), T20, ovs, &(xo[2]));
Chris@82 250 STN2(&(xo[4]), T1R, T20, ovs);
Chris@82 251 }
Chris@82 252 }
Chris@82 253 }
Chris@82 254 }
Chris@82 255 }
Chris@82 256 }
Chris@82 257 VLEAVE();
Chris@82 258 }
Chris@82 259
/*
 * Descriptor handed to X(kdft_register) for the FMA variant of n2fv_20.
 * The leading 20 agrees with the -n 20 generator option, and {58, 4, 46, 0}
 * agrees with the counts in this variant's header comment (58 additions,
 * 4 multiplications, 46 fused multiply/adds).
 * NOTE(review): kdft_desc is declared outside this file; the trailing
 * 0, 2, 0, 0 presumably are stride-related fields, with the 2 matching
 * -with-ostride 2 -- confirm against dft/codelet-dft.h.
 */
Chris@82 260 static const kdft_desc desc = { 20, XSIMD_STRING("n2fv_20"), {58, 4, 46, 0}, &GENUS, 0, 2, 0, 0 };
Chris@82 261
/*
 * Registration entry point (FMA variant): passes the planner p, the
 * n2fv_20 function and the address of desc to X(kdft_register).
 */
Chris@82 262 void XSIMD(codelet_n2fv_20) (planner *p) {
Chris@82 263 X(kdft_register) (p, n2fv_20, &desc);
Chris@82 264 }
Chris@82 265
Chris@82 266 #else
Chris@82 267
Chris@82 268 /* Generated by: ../../../genfft/gen_notw_c.native -simd -compact -variables 4 -pipeline-latency 8 -n 20 -name n2fv_20 -with-ostride 2 -include dft/simd/n2f.h -store-multiple 2 */
Chris@82 269
Chris@82 270 /*
Chris@82 271 * This function contains 104 FP additions, 24 FP multiplications,
Chris@82 272 * (or, 92 additions, 12 multiplications, 12 fused multiply/add),
Chris@82 273 * 57 stack variables, 4 constants, and 50 memory accesses
Chris@82 274 */
Chris@82 275 #include "dft/simd/n2f.h"
Chris@82 276
/*
 * n2fv_20, non-FMA variant: this definition sits in the #else branch, used
 * when neither ARCH_PREFERS_FMA nor ISA_EXTENSION_PREFERS_FMA is defined.
 * It is machine-generated (genfft, -n 20 -with-ostride 2
 * -store-multiple 2), so the statements below are left exactly as generated.
 *
 * What the visible code does:
 *  - xi starts at ri and xo starts at ro; ii and io are not referenced
 *    directly anywhere in this body.
 *  - Each loop iteration loads 20 vectors from xi[WS(is, 0)] through
 *    xi[WS(is, 19)] and passes 20 result vectors to STM2 at the literal
 *    even offsets xo[0], xo[2], ..., xo[38].
 *  - os is referenced only inside MAKE_VOLATILE_STRIDE; the output
 *    offsets themselves are compile-time constants.
 *  - i counts down from v in steps of VL, while xi and xo advance by
 *    VL * ivs and VL * ovs respectively.
 *
 * NOTE(review): LD, STM2, STN2, VFMA, VFNMS, VFMS, VBYI, DVK, LDK, WS and
 * VLEAVE come from the included FFTW SIMD headers and are not visible
 * here.  VBYI presumably multiplies its argument by the imaginary unit --
 * confirm against the SIMD support headers.
 */
Chris@82 277 static void n2fv_20(const R *ri, const R *ii, R *ro, R *io, stride is, stride os, INT v, INT ivs, INT ovs)
Chris@82 278 {
/* Numeric constants: KP587785252 ~= sin(pi/5), KP951056516 ~= sin(2*pi/5),
 * KP250000000 = 1/4, KP559016994 ~= sqrt(5)/4. */
Chris@82 279 DVK(KP587785252, +0.587785252292473129168705954639072768597652438);
Chris@82 280 DVK(KP951056516, +0.951056516295153572116439333379382143405698634);
Chris@82 281 DVK(KP250000000, +0.250000000000000000000000000000000000000000000);
Chris@82 282 DVK(KP559016994, +0.559016994374947424102293417182819058860154590);
Chris@82 283 {
Chris@82 284 INT i;
Chris@82 285 const R *xi;
Chris@82 286 R *xo;
Chris@82 287 xi = ri;
Chris@82 288 xo = ro;
/* One pass per group of VL transforms; the pointer updates and the
 * MAKE_VOLATILE_STRIDE calls all live in the for-increment expression. */
Chris@82 289 for (i = v; i > 0; i = i - VL, xi = xi + (VL * ivs), xo = xo + (VL * ovs), MAKE_VOLATILE_STRIDE(40, is), MAKE_VOLATILE_STRIDE(40, os)) {
/* Intermediates that outlive the load blocks and feed the output stages. */
Chris@82 290 V T3, T1B, Tm, T1i, TG, TN, TO, TH, T13, T16, T1k, T1u, T1v, T1z, T1r;
Chris@82 291 V T1s, T1y, T1a, T1d, T1j, Ti, TD, TB, TL;
/* Inputs 0/10 and 5/15: pairwise sums (T1g, T1h) and differences (T3, Tm),
 * then the sum (T1B) and difference (T1i) of the two pair sums. */
Chris@82 292 {
Chris@82 293 V T1, T2, T1g, Tk, Tl, T1h;
Chris@82 294 T1 = LD(&(xi[0]), ivs, &(xi[0]));
Chris@82 295 T2 = LD(&(xi[WS(is, 10)]), ivs, &(xi[0]));
Chris@82 296 T1g = VADD(T1, T2);
Chris@82 297 Tk = LD(&(xi[WS(is, 5)]), ivs, &(xi[WS(is, 1)]));
Chris@82 298 Tl = LD(&(xi[WS(is, 15)]), ivs, &(xi[WS(is, 1)]));
Chris@82 299 T1h = VADD(Tk, Tl);
Chris@82 300 T3 = VSUB(T1, T2);
Chris@82 301 T1B = VADD(T1g, T1h);
Chris@82 302 Tm = VSUB(Tk, Tl);
Chris@82 303 T1i = VSUB(T1g, T1h);
Chris@82 304 }
/* Remaining 16 inputs, loaded as eight pairs whose indices differ by 10
 * (4/14, 13/3, 17/7, 16/6, 8/18, 9/19, 1/11, 12/2).  Each pair produces
 * one difference and one sum. */
Chris@82 305 {
Chris@82 306 V T6, T18, Tw, T12, Tz, T15, T9, T1b, Td, T11, Tp, T19, Ts, T1c, Tg;
Chris@82 307 V T14;
Chris@82 308 {
Chris@82 309 V T4, T5, Tu, Tv;
Chris@82 310 T4 = LD(&(xi[WS(is, 4)]), ivs, &(xi[0]));
Chris@82 311 T5 = LD(&(xi[WS(is, 14)]), ivs, &(xi[0]));
Chris@82 312 T6 = VSUB(T4, T5);
Chris@82 313 T18 = VADD(T4, T5);
Chris@82 314 Tu = LD(&(xi[WS(is, 13)]), ivs, &(xi[WS(is, 1)]));
Chris@82 315 Tv = LD(&(xi[WS(is, 3)]), ivs, &(xi[WS(is, 1)]));
Chris@82 316 Tw = VSUB(Tu, Tv);
Chris@82 317 T12 = VADD(Tu, Tv);
Chris@82 318 }
Chris@82 319 {
Chris@82 320 V Tx, Ty, T7, T8;
Chris@82 321 Tx = LD(&(xi[WS(is, 17)]), ivs, &(xi[WS(is, 1)]));
Chris@82 322 Ty = LD(&(xi[WS(is, 7)]), ivs, &(xi[WS(is, 1)]));
Chris@82 323 Tz = VSUB(Tx, Ty);
Chris@82 324 T15 = VADD(Tx, Ty);
Chris@82 325 T7 = LD(&(xi[WS(is, 16)]), ivs, &(xi[0]));
Chris@82 326 T8 = LD(&(xi[WS(is, 6)]), ivs, &(xi[0]));
Chris@82 327 T9 = VSUB(T7, T8);
Chris@82 328 T1b = VADD(T7, T8);
Chris@82 329 }
Chris@82 330 {
Chris@82 331 V Tb, Tc, Tn, To;
Chris@82 332 Tb = LD(&(xi[WS(is, 8)]), ivs, &(xi[0]));
Chris@82 333 Tc = LD(&(xi[WS(is, 18)]), ivs, &(xi[0]));
Chris@82 334 Td = VSUB(Tb, Tc);
Chris@82 335 T11 = VADD(Tb, Tc);
Chris@82 336 Tn = LD(&(xi[WS(is, 9)]), ivs, &(xi[WS(is, 1)]));
Chris@82 337 To = LD(&(xi[WS(is, 19)]), ivs, &(xi[WS(is, 1)]));
Chris@82 338 Tp = VSUB(Tn, To);
Chris@82 339 T19 = VADD(Tn, To);
Chris@82 340 }
Chris@82 341 {
Chris@82 342 V Tq, Tr, Te, Tf;
Chris@82 343 Tq = LD(&(xi[WS(is, 1)]), ivs, &(xi[WS(is, 1)]));
Chris@82 344 Tr = LD(&(xi[WS(is, 11)]), ivs, &(xi[WS(is, 1)]));
Chris@82 345 Ts = VSUB(Tq, Tr);
Chris@82 346 T1c = VADD(Tq, Tr);
Chris@82 347 Te = LD(&(xi[WS(is, 12)]), ivs, &(xi[0]));
Chris@82 348 Tf = LD(&(xi[WS(is, 2)]), ivs, &(xi[0]));
Chris@82 349 Tg = VSUB(Te, Tf);
Chris@82 350 T14 = VADD(Te, Tf);
Chris@82 351 }
/* Second-level combinations of the pair sums and differences; these are
 * the values consumed by the output stages after this block closes. */
Chris@82 352 TG = VSUB(Ts, Tp);
Chris@82 353 TN = VSUB(T6, T9);
Chris@82 354 TO = VSUB(Td, Tg);
Chris@82 355 TH = VSUB(Tz, Tw);
Chris@82 356 T13 = VSUB(T11, T12);
Chris@82 357 T16 = VSUB(T14, T15);
Chris@82 358 T1k = VADD(T13, T16);
Chris@82 359 T1u = VADD(T11, T12);
Chris@82 360 T1v = VADD(T14, T15);
Chris@82 361 T1z = VADD(T1u, T1v);
Chris@82 362 T1r = VADD(T18, T19);
Chris@82 363 T1s = VADD(T1b, T1c);
Chris@82 364 T1y = VADD(T1r, T1s);
Chris@82 365 T1a = VSUB(T18, T19);
Chris@82 366 T1d = VSUB(T1b, T1c);
Chris@82 367 T1j = VADD(T1a, T1d);
/* In this variant TD and TL are already scaled by KP559016994 here, so
 * the later stages combine them with plain VADD/VSUB. */
Chris@82 368 {
Chris@82 369 V Ta, Th, Tt, TA;
Chris@82 370 Ta = VADD(T6, T9);
Chris@82 371 Th = VADD(Td, Tg);
Chris@82 372 Ti = VADD(Ta, Th);
Chris@82 373 TD = VMUL(LDK(KP559016994), VSUB(Ta, Th));
Chris@82 374 Tt = VADD(Tp, Ts);
Chris@82 375 TA = VADD(Tw, Tz);
Chris@82 376 TB = VADD(Tt, TA);
Chris@82 377 TL = VMUL(LDK(KP559016994), VSUB(TA, Tt));
Chris@82 378 }
Chris@82 379 }
/* Output stage.  Every result is passed to STM2 at its own offset, and
 * each STN2 call below receives the base offset of an adjacent pair plus
 * the two values already passed to STM2 for offsets base and base + 2.
 * The T1H..T1S temporaries are kept alive until their partner is ready. */
Chris@82 380 {
Chris@82 381 V T1I, T1J, T1K, T1L, T1N, T1H, Tj, TC;
/* Outputs at xo[10] and xo[30], built from T3 + Ti and VBYI(Tm + TB). */
Chris@82 382 Tj = VADD(T3, Ti);
Chris@82 383 TC = VBYI(VADD(Tm, TB));
Chris@82 384 T1H = VSUB(Tj, TC);
Chris@82 385 STM2(&(xo[10]), T1H, ovs, &(xo[2]));
Chris@82 386 T1I = VADD(Tj, TC);
Chris@82 387 STM2(&(xo[30]), T1I, ovs, &(xo[2]));
/* Outputs at xo[0], xo[16], xo[24], xo[8], xo[32], built from T1B and the
 * T1r/T1s/T1u/T1v sums. */
Chris@82 388 {
Chris@82 389 V T1A, T1C, T1D, T1x, T1G, T1t, T1w, T1F, T1E, T1M;
Chris@82 390 T1A = VMUL(LDK(KP559016994), VSUB(T1y, T1z));
Chris@82 391 T1C = VADD(T1y, T1z);
Chris@82 392 T1D = VFNMS(LDK(KP250000000), T1C, T1B);
Chris@82 393 T1t = VSUB(T1r, T1s);
Chris@82 394 T1w = VSUB(T1u, T1v);
Chris@82 395 T1x = VBYI(VFMA(LDK(KP951056516), T1t, VMUL(LDK(KP587785252), T1w)));
Chris@82 396 T1G = VBYI(VFNMS(LDK(KP587785252), T1t, VMUL(LDK(KP951056516), T1w)));
Chris@82 397 T1J = VADD(T1B, T1C);
Chris@82 398 STM2(&(xo[0]), T1J, ovs, &(xo[0]));
Chris@82 399 T1F = VSUB(T1D, T1A);
Chris@82 400 T1K = VSUB(T1F, T1G);
Chris@82 401 STM2(&(xo[16]), T1K, ovs, &(xo[0]));
Chris@82 402 T1L = VADD(T1G, T1F);
Chris@82 403 STM2(&(xo[24]), T1L, ovs, &(xo[0]));
Chris@82 404 T1E = VADD(T1A, T1D);
Chris@82 405 T1M = VADD(T1x, T1E);
Chris@82 406 STM2(&(xo[8]), T1M, ovs, &(xo[0]));
Chris@82 407 STN2(&(xo[8]), T1M, T1H, ovs);
Chris@82 408 T1N = VSUB(T1E, T1x);
Chris@82 409 STM2(&(xo[32]), T1N, ovs, &(xo[0]));
Chris@82 410 }
Chris@82 411 {
Chris@82 412 V T1O, T1P, T1R, T1S;
/* Outputs at xo[20], xo[12], xo[28], xo[4], xo[36], built from T1i and the
 * T13/T16/T1a/T1d differences. */
Chris@82 413 {
Chris@82 414 V T1n, T1l, T1m, T1f, T1q, T17, T1e, T1p, T1Q, T1o;
Chris@82 415 T1n = VMUL(LDK(KP559016994), VSUB(T1j, T1k));
Chris@82 416 T1l = VADD(T1j, T1k);
Chris@82 417 T1m = VFNMS(LDK(KP250000000), T1l, T1i);
Chris@82 418 T17 = VSUB(T13, T16);
Chris@82 419 T1e = VSUB(T1a, T1d);
Chris@82 420 T1f = VBYI(VFNMS(LDK(KP587785252), T1e, VMUL(LDK(KP951056516), T17)));
Chris@82 421 T1q = VBYI(VFMA(LDK(KP951056516), T1e, VMUL(LDK(KP587785252), T17)));
Chris@82 422 T1O = VADD(T1i, T1l);
Chris@82 423 STM2(&(xo[20]), T1O, ovs, &(xo[0]));
Chris@82 424 T1p = VADD(T1n, T1m);
Chris@82 425 T1P = VSUB(T1p, T1q);
Chris@82 426 STM2(&(xo[12]), T1P, ovs, &(xo[0]));
Chris@82 427 T1Q = VADD(T1q, T1p);
Chris@82 428 STM2(&(xo[28]), T1Q, ovs, &(xo[0]));
Chris@82 429 STN2(&(xo[28]), T1Q, T1I, ovs);
Chris@82 430 T1o = VSUB(T1m, T1n);
Chris@82 431 T1R = VADD(T1f, T1o);
Chris@82 432 STM2(&(xo[4]), T1R, ovs, &(xo[0]));
Chris@82 433 T1S = VSUB(T1o, T1f);
Chris@82 434 STM2(&(xo[36]), T1S, ovs, &(xo[0]));
Chris@82 435 }
/* Remaining eight outputs (xo[38], xo[2], xo[26], xo[14], xo[22], xo[18],
 * xo[34], xo[6]), built from T3, Tm, Ti, TB, TD, TL and TG/TH/TN/TO.
 * Each one completes a pair, so every STM2 here is followed by an STN2. */
Chris@82 436 {
Chris@82 437 V TI, TP, TX, TU, TM, TW, TF, TT, TK, TE;
Chris@82 438 TI = VFMA(LDK(KP951056516), TG, VMUL(LDK(KP587785252), TH));
Chris@82 439 TP = VFMA(LDK(KP951056516), TN, VMUL(LDK(KP587785252), TO));
Chris@82 440 TX = VFNMS(LDK(KP587785252), TN, VMUL(LDK(KP951056516), TO));
Chris@82 441 TU = VFNMS(LDK(KP587785252), TG, VMUL(LDK(KP951056516), TH));
Chris@82 442 TK = VFMS(LDK(KP250000000), TB, Tm);
Chris@82 443 TM = VADD(TK, TL);
Chris@82 444 TW = VSUB(TL, TK);
Chris@82 445 TE = VFNMS(LDK(KP250000000), Ti, T3);
Chris@82 446 TF = VADD(TD, TE);
Chris@82 447 TT = VSUB(TE, TD);
Chris@82 448 {
Chris@82 449 V TJ, TQ, T1T, T1U;
Chris@82 450 TJ = VADD(TF, TI);
Chris@82 451 TQ = VBYI(VSUB(TM, TP));
Chris@82 452 T1T = VSUB(TJ, TQ);
Chris@82 453 STM2(&(xo[38]), T1T, ovs, &(xo[2]));
Chris@82 454 STN2(&(xo[36]), T1S, T1T, ovs);
Chris@82 455 T1U = VADD(TJ, TQ);
Chris@82 456 STM2(&(xo[2]), T1U, ovs, &(xo[2]));
Chris@82 457 STN2(&(xo[0]), T1J, T1U, ovs);
Chris@82 458 }
Chris@82 459 {
Chris@82 460 V TZ, T10, T1V, T1W;
Chris@82 461 TZ = VADD(TT, TU);
Chris@82 462 T10 = VBYI(VADD(TX, TW));
Chris@82 463 T1V = VSUB(TZ, T10);
Chris@82 464 STM2(&(xo[26]), T1V, ovs, &(xo[2]));
Chris@82 465 STN2(&(xo[24]), T1L, T1V, ovs);
Chris@82 466 T1W = VADD(TZ, T10);
Chris@82 467 STM2(&(xo[14]), T1W, ovs, &(xo[2]));
Chris@82 468 STN2(&(xo[12]), T1P, T1W, ovs);
Chris@82 469 }
Chris@82 470 {
Chris@82 471 V TR, TS, T1X, T1Y;
Chris@82 472 TR = VSUB(TF, TI);
Chris@82 473 TS = VBYI(VADD(TP, TM));
Chris@82 474 T1X = VSUB(TR, TS);
Chris@82 475 STM2(&(xo[22]), T1X, ovs, &(xo[2]));
Chris@82 476 STN2(&(xo[20]), T1O, T1X, ovs);
Chris@82 477 T1Y = VADD(TR, TS);
Chris@82 478 STM2(&(xo[18]), T1Y, ovs, &(xo[2]));
Chris@82 479 STN2(&(xo[16]), T1K, T1Y, ovs);
Chris@82 480 }
Chris@82 481 {
Chris@82 482 V TV, TY, T1Z, T20;
Chris@82 483 TV = VSUB(TT, TU);
Chris@82 484 TY = VBYI(VSUB(TW, TX));
Chris@82 485 T1Z = VSUB(TV, TY);
Chris@82 486 STM2(&(xo[34]), T1Z, ovs, &(xo[2]));
Chris@82 487 STN2(&(xo[32]), T1N, T1Z, ovs);
Chris@82 488 T20 = VADD(TV, TY);
Chris@82 489 STM2(&(xo[6]), T20, ovs, &(xo[2]));
Chris@82 490 STN2(&(xo[4]), T1R, T20, ovs);
Chris@82 491 }
Chris@82 492 }
Chris@82 493 }
Chris@82 494 }
Chris@82 495 }
Chris@82 496 }
Chris@82 497 VLEAVE();
Chris@82 498 }
Chris@82 499
/*
 * Descriptor handed to X(kdft_register) for the non-FMA variant of n2fv_20.
 * The leading 20 agrees with the -n 20 generator option, and {92, 12, 12, 0}
 * agrees with the counts in this variant's header comment (92 additions,
 * 12 multiplications, 12 fused multiply/adds).
 * NOTE(review): kdft_desc is declared outside this file; the trailing
 * 0, 2, 0, 0 presumably are stride-related fields, with the 2 matching
 * -with-ostride 2 -- confirm against dft/codelet-dft.h.
 */
Chris@82 500 static const kdft_desc desc = { 20, XSIMD_STRING("n2fv_20"), {92, 12, 12, 0}, &GENUS, 0, 2, 0, 0 };
Chris@82 501
/*
 * Registration entry point (non-FMA variant): passes the planner p, the
 * n2fv_20 function and the address of desc to X(kdft_register).
 */
Chris@82 502 void XSIMD(codelet_n2fv_20) (planner *p) {
Chris@82 503 X(kdft_register) (p, n2fv_20, &desc);
Chris@82 504 }
Chris@82 505
Chris@82 506 #endif