annotate src/fftw-3.3.8/dft/simd/common/n2bv_20.c @ 83:ae30d91d2ffe

Replace these with versions built using an older toolset (so as to avoid ABI incompatibilities when linking on Ubuntu 14.04 for packaging purposes)
author Chris Cannam
date Fri, 07 Feb 2020 11:51:13 +0000
parents d0c2a83c1364
children
rev   line source
Chris@82 1 /*
Chris@82 2 * Copyright (c) 2003, 2007-14 Matteo Frigo
Chris@82 3 * Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
Chris@82 4 *
Chris@82 5 * This program is free software; you can redistribute it and/or modify
Chris@82 6 * it under the terms of the GNU General Public License as published by
Chris@82 7 * the Free Software Foundation; either version 2 of the License, or
Chris@82 8 * (at your option) any later version.
Chris@82 9 *
Chris@82 10 * This program is distributed in the hope that it will be useful,
Chris@82 11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
Chris@82 12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
Chris@82 13 * GNU General Public License for more details.
Chris@82 14 *
Chris@82 15 * You should have received a copy of the GNU General Public License
Chris@82 16 * along with this program; if not, write to the Free Software
Chris@82 17 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
Chris@82 18 *
Chris@82 19 */
Chris@82 20
Chris@82 21 /* This file was automatically generated --- DO NOT EDIT */
Chris@82 22 /* Generated on Thu May 24 08:05:19 EDT 2018 */
Chris@82 23
Chris@82 24 #include "dft/codelet-dft.h"
Chris@82 25
Chris@82 26 #if defined(ARCH_PREFERS_FMA) || defined(ISA_EXTENSION_PREFERS_FMA)
Chris@82 27
Chris@82 28 /* Generated by: ../../../genfft/gen_notw_c.native -fma -simd -compact -variables 4 -pipeline-latency 8 -sign 1 -n 20 -name n2bv_20 -with-ostride 2 -include dft/simd/n2b.h -store-multiple 2 */
Chris@82 29
Chris@82 30 /*
Chris@82 31 * This function contains 104 FP additions, 50 FP multiplications,
Chris@82 32 * (or, 58 additions, 4 multiplications, 46 fused multiply/add),
Chris@82 33 * 57 stack variables, 4 constants, and 50 memory accesses
Chris@82 34 */
Chris@82 35 #include "dft/simd/n2b.h"
Chris@82 36
/*
 * n2bv_20, FMA variant (compiled when ARCH_PREFERS_FMA or
 * ISA_EXTENSION_PREFERS_FMA is defined): genfft-generated SIMD kernel for a
 * size-20 transform (generator flags: -n 20 -sign 1 -with-ostride 2).
 * NOTE(review): the "b" in the name and -sign 1 presumably denote the
 * backward-sign DFT -- confirm against the genfft documentation.
 *
 * Parameters as used by this body:
 *   ri, ro  - not referenced.
 *   ii      - input base; each iteration loads 20 vectors from
 *             xi[WS(is, k)], k = 0..19, where xi starts at ii.
 *   io      - output base; results go to the fixed offsets xo[0], xo[2],
 *             ..., xo[38] (xo starts at io), so `os` is only passed to
 *             MAKE_VOLATILE_STRIDE and never used for addressing.
 *   is      - input stride, applied through WS(is, k).
 *   v       - loop count; the loop runs while i > 0, decrementing by VL.
 *   ivs/ovs - xi/xo advance by VL * ivs and VL * ovs per iteration.
 *
 * Every output is written with STM2, and each adjacent pair
 * (xo[4j], xo[4j + 2]) is also handed to STN2.
 * NOTE(review): STM2/STN2 come from dft/simd/n2b.h (not visible here);
 * presumably only one of the two performs the real store on a given SIMD
 * target -- confirm.
 *
 * Constants: KP559016994 ~ sqrt(5)/4, KP618033988 ~ (sqrt(5) - 1)/2,
 * KP951056516 ~ sin(2*pi/5), KP250000000 = 1/4.
 */
Chris@82 37 static void n2bv_20(const R *ri, const R *ii, R *ro, R *io, stride is, stride os, INT v, INT ivs, INT ovs)
Chris@82 38 {
Chris@82 39 DVK(KP559016994, +0.559016994374947424102293417182819058860154590);
Chris@82 40 DVK(KP618033988, +0.618033988749894848204586834365638117720309180);
Chris@82 41 DVK(KP951056516, +0.951056516295153572116439333379382143405698634);
Chris@82 42 DVK(KP250000000, +0.250000000000000000000000000000000000000000000);
Chris@82 43 {
Chris@82 44 INT i;
Chris@82 45 const R *xi;
Chris@82 46 R *xo;
Chris@82 47 xi = ii;
Chris@82 48 xo = io;
Chris@82 49 for (i = v; i > 0; i = i - VL, xi = xi + (VL * ivs), xo = xo + (VL * ovs), MAKE_VOLATILE_STRIDE(40, is), MAKE_VOLATILE_STRIDE(40, os)) {
Chris@82 50 V T3, T1r, TE, T13, Ts, TL, TM, Tz, T16, T19, T1a, T1v, T1w, T1x, T1s;
Chris@82 51 V T1t, T1u, T1d, T1g, T1h, Ti, Tk, TH, TJ;
/* Inputs 0/10 and 5/15: sum and difference of each pair, then combine. */
Chris@82 52 {
Chris@82 53 V T1, T2, T11, TC, TD, T12;
Chris@82 54 T1 = LD(&(xi[0]), ivs, &(xi[0]));
Chris@82 55 T2 = LD(&(xi[WS(is, 10)]), ivs, &(xi[0]));
Chris@82 56 T11 = VADD(T1, T2);
Chris@82 57 TC = LD(&(xi[WS(is, 5)]), ivs, &(xi[WS(is, 1)]));
Chris@82 58 TD = LD(&(xi[WS(is, 15)]), ivs, &(xi[WS(is, 1)]));
Chris@82 59 T12 = VADD(TC, TD);
Chris@82 60 T3 = VSUB(T1, T2);
Chris@82 61 T1r = VADD(T11, T12);
Chris@82 62 TE = VSUB(TC, TD);
Chris@82 63 T13 = VSUB(T11, T12);
Chris@82 64 }
/*
 * Remaining 16 inputs, loaded in pairs whose indices differ by 10
 * (4/14, 13/3, 17/7, 16/6, 8/18, 9/19, 1/11, 12/2); each pair yields one
 * sum and one difference that feed the combinations below.
 */
Chris@82 65 {
Chris@82 66 V T6, T14, Tv, T1c, Ty, T1f, T9, T17, Td, T1b, To, T15, Tr, T18, Tg;
Chris@82 67 V T1e;
Chris@82 68 {
Chris@82 69 V T4, T5, Tt, Tu;
Chris@82 70 T4 = LD(&(xi[WS(is, 4)]), ivs, &(xi[0]));
Chris@82 71 T5 = LD(&(xi[WS(is, 14)]), ivs, &(xi[0]));
Chris@82 72 T6 = VSUB(T4, T5);
Chris@82 73 T14 = VADD(T4, T5);
Chris@82 74 Tt = LD(&(xi[WS(is, 13)]), ivs, &(xi[WS(is, 1)]));
Chris@82 75 Tu = LD(&(xi[WS(is, 3)]), ivs, &(xi[WS(is, 1)]));
Chris@82 76 Tv = VSUB(Tt, Tu);
Chris@82 77 T1c = VADD(Tt, Tu);
Chris@82 78 }
Chris@82 79 {
Chris@82 80 V Tw, Tx, T7, T8;
Chris@82 81 Tw = LD(&(xi[WS(is, 17)]), ivs, &(xi[WS(is, 1)]));
Chris@82 82 Tx = LD(&(xi[WS(is, 7)]), ivs, &(xi[WS(is, 1)]));
Chris@82 83 Ty = VSUB(Tw, Tx);
Chris@82 84 T1f = VADD(Tw, Tx);
Chris@82 85 T7 = LD(&(xi[WS(is, 16)]), ivs, &(xi[0]));
Chris@82 86 T8 = LD(&(xi[WS(is, 6)]), ivs, &(xi[0]));
Chris@82 87 T9 = VSUB(T7, T8);
Chris@82 88 T17 = VADD(T7, T8);
Chris@82 89 }
Chris@82 90 {
Chris@82 91 V Tb, Tc, Tm, Tn;
Chris@82 92 Tb = LD(&(xi[WS(is, 8)]), ivs, &(xi[0]));
Chris@82 93 Tc = LD(&(xi[WS(is, 18)]), ivs, &(xi[0]));
Chris@82 94 Td = VSUB(Tb, Tc);
Chris@82 95 T1b = VADD(Tb, Tc);
Chris@82 96 Tm = LD(&(xi[WS(is, 9)]), ivs, &(xi[WS(is, 1)]));
Chris@82 97 Tn = LD(&(xi[WS(is, 19)]), ivs, &(xi[WS(is, 1)]));
Chris@82 98 To = VSUB(Tm, Tn);
Chris@82 99 T15 = VADD(Tm, Tn);
Chris@82 100 }
Chris@82 101 {
Chris@82 102 V Tp, Tq, Te, Tf;
Chris@82 103 Tp = LD(&(xi[WS(is, 1)]), ivs, &(xi[WS(is, 1)]));
Chris@82 104 Tq = LD(&(xi[WS(is, 11)]), ivs, &(xi[WS(is, 1)]));
Chris@82 105 Tr = VSUB(Tp, Tq);
Chris@82 106 T18 = VADD(Tp, Tq);
Chris@82 107 Te = LD(&(xi[WS(is, 12)]), ivs, &(xi[0]));
Chris@82 108 Tf = LD(&(xi[WS(is, 2)]), ivs, &(xi[0]));
Chris@82 109 Tg = VSUB(Te, Tf);
Chris@82 110 T1e = VADD(Te, Tf);
Chris@82 111 }
Chris@82 112 Ts = VSUB(To, Tr);
Chris@82 113 TL = VSUB(T6, T9);
Chris@82 114 TM = VSUB(Td, Tg);
Chris@82 115 Tz = VSUB(Tv, Ty);
Chris@82 116 T16 = VSUB(T14, T15);
Chris@82 117 T19 = VSUB(T17, T18);
Chris@82 118 T1a = VADD(T16, T19);
Chris@82 119 T1v = VADD(T1b, T1c);
Chris@82 120 T1w = VADD(T1e, T1f);
Chris@82 121 T1x = VADD(T1v, T1w);
Chris@82 122 T1s = VADD(T14, T15);
Chris@82 123 T1t = VADD(T17, T18);
Chris@82 124 T1u = VADD(T1s, T1t);
Chris@82 125 T1d = VSUB(T1b, T1c);
Chris@82 126 T1g = VSUB(T1e, T1f);
Chris@82 127 T1h = VADD(T1d, T1g);
Chris@82 128 {
Chris@82 129 V Ta, Th, TF, TG;
Chris@82 130 Ta = VADD(T6, T9);
Chris@82 131 Th = VADD(Td, Tg);
Chris@82 132 Ti = VADD(Ta, Th);
Chris@82 133 Tk = VSUB(Ta, Th);
Chris@82 134 TF = VADD(To, Tr);
Chris@82 135 TG = VADD(Tv, Ty);
Chris@82 136 TH = VADD(TF, TG);
Chris@82 137 TJ = VSUB(TF, TG);
Chris@82 138 }
Chris@82 139 }
/*
 * Output stage. Values stored early (T1H, T1I, T1J, ...) are kept in
 * variables of this scope so they can be paired with their neighbour in a
 * later STN2 call.
 */
Chris@82 140 {
Chris@82 141 V T1H, T1J, T1K, T1L, T1N, T1I, TZ, T10;
Chris@82 142 TZ = VADD(T3, Ti);
Chris@82 143 T10 = VADD(TE, TH);
Chris@82 144 T1H = VFNMSI(T10, TZ);
Chris@82 145 STM2(&(xo[30]), T1H, ovs, &(xo[2]));
Chris@82 146 T1I = VFMAI(T10, TZ);
Chris@82 147 STM2(&(xo[10]), T1I, ovs, &(xo[2]));
/* Outputs at xo[0], xo[8], xo[16], xo[24], xo[32]. */
Chris@82 148 {
Chris@82 149 V T1A, T1y, T1z, T1E, T1G, T1C, T1D, T1F, T1B, T1M;
Chris@82 150 T1A = VSUB(T1u, T1x);
Chris@82 151 T1y = VADD(T1u, T1x);
Chris@82 152 T1z = VFNMS(LDK(KP250000000), T1y, T1r);
Chris@82 153 T1C = VSUB(T1s, T1t);
Chris@82 154 T1D = VSUB(T1v, T1w);
Chris@82 155 T1E = VMUL(LDK(KP951056516), VFMA(LDK(KP618033988), T1D, T1C));
Chris@82 156 T1G = VMUL(LDK(KP951056516), VFNMS(LDK(KP618033988), T1C, T1D));
Chris@82 157 T1J = VADD(T1r, T1y);
Chris@82 158 STM2(&(xo[0]), T1J, ovs, &(xo[0]));
Chris@82 159 T1F = VFNMS(LDK(KP559016994), T1A, T1z);
Chris@82 160 T1K = VFMAI(T1G, T1F);
Chris@82 161 STM2(&(xo[16]), T1K, ovs, &(xo[0]));
Chris@82 162 T1L = VFNMSI(T1G, T1F);
Chris@82 163 STM2(&(xo[24]), T1L, ovs, &(xo[0]));
Chris@82 164 T1B = VFMA(LDK(KP559016994), T1A, T1z);
Chris@82 165 T1M = VFNMSI(T1E, T1B);
Chris@82 166 STM2(&(xo[8]), T1M, ovs, &(xo[0]));
Chris@82 167 STN2(&(xo[8]), T1M, T1I, ovs);
Chris@82 168 T1N = VFMAI(T1E, T1B);
Chris@82 169 STM2(&(xo[32]), T1N, ovs, &(xo[0]));
Chris@82 170 }
Chris@82 171 {
Chris@82 172 V T1O, T1P, T1R, T1S;
/* Outputs at xo[4], xo[12], xo[20], xo[28], xo[36]. */
Chris@82 173 {
Chris@82 174 V T1k, T1i, T1j, T1o, T1q, T1m, T1n, T1p, T1Q, T1l;
Chris@82 175 T1k = VSUB(T1a, T1h);
Chris@82 176 T1i = VADD(T1a, T1h);
Chris@82 177 T1j = VFNMS(LDK(KP250000000), T1i, T13);
Chris@82 178 T1m = VSUB(T1d, T1g);
Chris@82 179 T1n = VSUB(T16, T19);
Chris@82 180 T1o = VMUL(LDK(KP951056516), VFNMS(LDK(KP618033988), T1n, T1m));
Chris@82 181 T1q = VMUL(LDK(KP951056516), VFMA(LDK(KP618033988), T1m, T1n));
Chris@82 182 T1O = VADD(T13, T1i);
Chris@82 183 STM2(&(xo[20]), T1O, ovs, &(xo[0]));
Chris@82 184 T1p = VFMA(LDK(KP559016994), T1k, T1j);
Chris@82 185 T1P = VFMAI(T1q, T1p);
Chris@82 186 STM2(&(xo[12]), T1P, ovs, &(xo[0]));
Chris@82 187 T1Q = VFNMSI(T1q, T1p);
Chris@82 188 STM2(&(xo[28]), T1Q, ovs, &(xo[0]));
Chris@82 189 STN2(&(xo[28]), T1Q, T1H, ovs);
Chris@82 190 T1l = VFNMS(LDK(KP559016994), T1k, T1j);
Chris@82 191 T1R = VFNMSI(T1o, T1l);
Chris@82 192 STM2(&(xo[4]), T1R, ovs, &(xo[0]));
Chris@82 193 T1S = VFMAI(T1o, T1l);
Chris@82 194 STM2(&(xo[36]), T1S, ovs, &(xo[0]));
Chris@82 195 }
/*
 * Outputs at xo[2], xo[6], xo[14], xo[18], xo[22], xo[26], xo[34], xo[38];
 * each one completes an STN2 pair with a value stored earlier.
 */
Chris@82 196 {
Chris@82 197 V TA, TN, TV, TS, TK, TU, Tl, TR, TI, Tj;
Chris@82 198 TA = VFMA(LDK(KP618033988), Tz, Ts);
Chris@82 199 TN = VFMA(LDK(KP618033988), TM, TL);
Chris@82 200 TV = VFNMS(LDK(KP618033988), TL, TM);
Chris@82 201 TS = VFNMS(LDK(KP618033988), Ts, Tz);
Chris@82 202 TI = VFNMS(LDK(KP250000000), TH, TE);
Chris@82 203 TK = VFMA(LDK(KP559016994), TJ, TI);
Chris@82 204 TU = VFNMS(LDK(KP559016994), TJ, TI);
Chris@82 205 Tj = VFNMS(LDK(KP250000000), Ti, T3);
Chris@82 206 Tl = VFMA(LDK(KP559016994), Tk, Tj);
Chris@82 207 TR = VFNMS(LDK(KP559016994), Tk, Tj);
Chris@82 208 {
Chris@82 209 V TB, TO, T1T, T1U;
Chris@82 210 TB = VFNMS(LDK(KP951056516), TA, Tl);
Chris@82 211 TO = VFMA(LDK(KP951056516), TN, TK);
Chris@82 212 T1T = VFNMSI(TO, TB);
Chris@82 213 STM2(&(xo[38]), T1T, ovs, &(xo[2]));
Chris@82 214 STN2(&(xo[36]), T1S, T1T, ovs);
Chris@82 215 T1U = VFMAI(TO, TB);
Chris@82 216 STM2(&(xo[2]), T1U, ovs, &(xo[2]));
Chris@82 217 STN2(&(xo[0]), T1J, T1U, ovs);
Chris@82 218 }
Chris@82 219 {
Chris@82 220 V TX, TY, T1V, T1W;
Chris@82 221 TX = VFNMS(LDK(KP951056516), TS, TR);
Chris@82 222 TY = VFMA(LDK(KP951056516), TV, TU);
Chris@82 223 T1V = VFNMSI(TY, TX);
Chris@82 224 STM2(&(xo[14]), T1V, ovs, &(xo[2]));
Chris@82 225 STN2(&(xo[12]), T1P, T1V, ovs);
Chris@82 226 T1W = VFMAI(TY, TX);
Chris@82 227 STM2(&(xo[26]), T1W, ovs, &(xo[2]));
Chris@82 228 STN2(&(xo[24]), T1L, T1W, ovs);
Chris@82 229 }
Chris@82 230 {
Chris@82 231 V TP, TQ, T1X, T1Y;
Chris@82 232 TP = VFMA(LDK(KP951056516), TA, Tl);
Chris@82 233 TQ = VFNMS(LDK(KP951056516), TN, TK);
Chris@82 234 T1X = VFNMSI(TQ, TP);
Chris@82 235 STM2(&(xo[22]), T1X, ovs, &(xo[2]));
Chris@82 236 STN2(&(xo[20]), T1O, T1X, ovs);
Chris@82 237 T1Y = VFMAI(TQ, TP);
Chris@82 238 STM2(&(xo[18]), T1Y, ovs, &(xo[2]));
Chris@82 239 STN2(&(xo[16]), T1K, T1Y, ovs);
Chris@82 240 }
Chris@82 241 {
Chris@82 242 V TT, TW, T1Z, T20;
Chris@82 243 TT = VFMA(LDK(KP951056516), TS, TR);
Chris@82 244 TW = VFNMS(LDK(KP951056516), TV, TU);
Chris@82 245 T1Z = VFNMSI(TW, TT);
Chris@82 246 STM2(&(xo[6]), T1Z, ovs, &(xo[2]));
Chris@82 247 STN2(&(xo[4]), T1R, T1Z, ovs);
Chris@82 248 T20 = VFMAI(TW, TT);
Chris@82 249 STM2(&(xo[34]), T20, ovs, &(xo[2]));
Chris@82 250 STN2(&(xo[32]), T1N, T20, ovs);
Chris@82 251 }
Chris@82 252 }
Chris@82 253 }
Chris@82 254 }
Chris@82 255 }
Chris@82 256 }
Chris@82 257 VLEAVE();
Chris@82 258 }
Chris@82 259
/*
 * Descriptor handed to the planner for the FMA variant of n2bv_20:
 * size 20, the codelet name string, and the operation counts
 * {58, 4, 46, 0}, which match the "58 additions, 4 multiplications,
 * 46 fused multiply/add" figures in the generated header comment.
 * NOTE(review): the trailing 0, 2, 0, 0 presumably are stride constraints
 * (the 2 matching -with-ostride 2) -- confirm against the kdft_desc
 * definition in dft/codelet-dft.h.
 */
Chris@82 260 static const kdft_desc desc = { 20, XSIMD_STRING("n2bv_20"), {58, 4, 46, 0}, &GENUS, 0, 2, 0, 0 };
Chris@82 261
/*
 * Registration entry point (FMA variant): passes the n2bv_20 kernel and its
 * descriptor `desc` to X(kdft_register) for planner `p`.
 */
Chris@82 262 void XSIMD(codelet_n2bv_20) (planner *p) {
Chris@82 263 X(kdft_register) (p, n2bv_20, &desc);
Chris@82 264 }
Chris@82 265
Chris@82 266 #else
Chris@82 267
Chris@82 268 /* Generated by: ../../../genfft/gen_notw_c.native -simd -compact -variables 4 -pipeline-latency 8 -sign 1 -n 20 -name n2bv_20 -with-ostride 2 -include dft/simd/n2b.h -store-multiple 2 */
Chris@82 269
Chris@82 270 /*
Chris@82 271 * This function contains 104 FP additions, 24 FP multiplications,
Chris@82 272 * (or, 92 additions, 12 multiplications, 12 fused multiply/add),
Chris@82 273 * 57 stack variables, 4 constants, and 50 memory accesses
Chris@82 274 */
Chris@82 275 #include "dft/simd/n2b.h"
Chris@82 276
/*
 * n2bv_20, non-FMA variant (compiled when neither ARCH_PREFERS_FMA nor
 * ISA_EXTENSION_PREFERS_FMA is defined): genfft-generated SIMD kernel for a
 * size-20 transform (generator flags: -n 20 -sign 1 -with-ostride 2).
 * NOTE(review): the "b" in the name and -sign 1 presumably denote the
 * backward-sign DFT -- confirm against the genfft documentation.
 *
 * Parameters as used by this body:
 *   ri, ro  - not referenced.
 *   ii      - input base; each iteration loads 20 vectors from
 *             xi[WS(is, k)], k = 0..19, where xi starts at ii.
 *   io      - output base; results go to the fixed offsets xo[0], xo[2],
 *             ..., xo[38] (xo starts at io), so `os` is only passed to
 *             MAKE_VOLATILE_STRIDE and never used for addressing.
 *   is      - input stride, applied through WS(is, k).
 *   v       - loop count; the loop runs while i > 0, decrementing by VL.
 *   ivs/ovs - xi/xo advance by VL * ivs and VL * ovs per iteration.
 *
 * Every output is written with STM2, and each adjacent pair
 * (xo[4j], xo[4j + 2]) is also handed to STN2.
 * NOTE(review): STM2/STN2 come from dft/simd/n2b.h (not visible here);
 * presumably only one of the two performs the real store on a given SIMD
 * target -- confirm.
 *
 * Constants: KP587785252 ~ sin(pi/5), KP951056516 ~ sin(2*pi/5),
 * KP250000000 = 1/4, KP559016994 ~ sqrt(5)/4.
 */
Chris@82 277 static void n2bv_20(const R *ri, const R *ii, R *ro, R *io, stride is, stride os, INT v, INT ivs, INT ovs)
Chris@82 278 {
Chris@82 279 DVK(KP587785252, +0.587785252292473129168705954639072768597652438);
Chris@82 280 DVK(KP951056516, +0.951056516295153572116439333379382143405698634);
Chris@82 281 DVK(KP250000000, +0.250000000000000000000000000000000000000000000);
Chris@82 282 DVK(KP559016994, +0.559016994374947424102293417182819058860154590);
Chris@82 283 {
Chris@82 284 INT i;
Chris@82 285 const R *xi;
Chris@82 286 R *xo;
Chris@82 287 xi = ii;
Chris@82 288 xo = io;
Chris@82 289 for (i = v; i > 0; i = i - VL, xi = xi + (VL * ivs), xo = xo + (VL * ovs), MAKE_VOLATILE_STRIDE(40, is), MAKE_VOLATILE_STRIDE(40, os)) {
Chris@82 290 V T3, T1y, TH, T1i, Ts, TL, TM, Tz, T13, T16, T1j, T1u, T1v, T1w, T1r;
Chris@82 291 V T1s, T1t, T1a, T1d, T1k, Ti, Tk, TE, TI;
/* Inputs 0/10 and 5/15: sum and difference of each pair, then combine. */
Chris@82 292 {
Chris@82 293 V T1, T2, T1g, TF, TG, T1h;
Chris@82 294 T1 = LD(&(xi[0]), ivs, &(xi[0]));
Chris@82 295 T2 = LD(&(xi[WS(is, 10)]), ivs, &(xi[0]));
Chris@82 296 T1g = VADD(T1, T2);
Chris@82 297 TF = LD(&(xi[WS(is, 5)]), ivs, &(xi[WS(is, 1)]));
Chris@82 298 TG = LD(&(xi[WS(is, 15)]), ivs, &(xi[WS(is, 1)]));
Chris@82 299 T1h = VADD(TF, TG);
Chris@82 300 T3 = VSUB(T1, T2);
Chris@82 301 T1y = VADD(T1g, T1h);
Chris@82 302 TH = VSUB(TF, TG);
Chris@82 303 T1i = VSUB(T1g, T1h);
Chris@82 304 }
/*
 * Remaining 16 inputs, loaded in pairs whose indices differ by 10
 * (4/14, 13/3, 17/7, 16/6, 8/18, 9/19, 1/11, 12/2); each pair yields one
 * sum and one difference that feed the combinations below.
 */
Chris@82 305 {
Chris@82 306 V T6, T11, Tv, T19, Ty, T1c, T9, T14, Td, T18, To, T12, Tr, T15, Tg;
Chris@82 307 V T1b;
Chris@82 308 {
Chris@82 309 V T4, T5, Tt, Tu;
Chris@82 310 T4 = LD(&(xi[WS(is, 4)]), ivs, &(xi[0]));
Chris@82 311 T5 = LD(&(xi[WS(is, 14)]), ivs, &(xi[0]));
Chris@82 312 T6 = VSUB(T4, T5);
Chris@82 313 T11 = VADD(T4, T5);
Chris@82 314 Tt = LD(&(xi[WS(is, 13)]), ivs, &(xi[WS(is, 1)]));
Chris@82 315 Tu = LD(&(xi[WS(is, 3)]), ivs, &(xi[WS(is, 1)]));
Chris@82 316 Tv = VSUB(Tt, Tu);
Chris@82 317 T19 = VADD(Tt, Tu);
Chris@82 318 }
Chris@82 319 {
Chris@82 320 V Tw, Tx, T7, T8;
Chris@82 321 Tw = LD(&(xi[WS(is, 17)]), ivs, &(xi[WS(is, 1)]));
Chris@82 322 Tx = LD(&(xi[WS(is, 7)]), ivs, &(xi[WS(is, 1)]));
Chris@82 323 Ty = VSUB(Tw, Tx);
Chris@82 324 T1c = VADD(Tw, Tx);
Chris@82 325 T7 = LD(&(xi[WS(is, 16)]), ivs, &(xi[0]));
Chris@82 326 T8 = LD(&(xi[WS(is, 6)]), ivs, &(xi[0]));
Chris@82 327 T9 = VSUB(T7, T8);
Chris@82 328 T14 = VADD(T7, T8);
Chris@82 329 }
Chris@82 330 {
Chris@82 331 V Tb, Tc, Tm, Tn;
Chris@82 332 Tb = LD(&(xi[WS(is, 8)]), ivs, &(xi[0]));
Chris@82 333 Tc = LD(&(xi[WS(is, 18)]), ivs, &(xi[0]));
Chris@82 334 Td = VSUB(Tb, Tc);
Chris@82 335 T18 = VADD(Tb, Tc);
Chris@82 336 Tm = LD(&(xi[WS(is, 9)]), ivs, &(xi[WS(is, 1)]));
Chris@82 337 Tn = LD(&(xi[WS(is, 19)]), ivs, &(xi[WS(is, 1)]));
Chris@82 338 To = VSUB(Tm, Tn);
Chris@82 339 T12 = VADD(Tm, Tn);
Chris@82 340 }
Chris@82 341 {
Chris@82 342 V Tp, Tq, Te, Tf;
Chris@82 343 Tp = LD(&(xi[WS(is, 1)]), ivs, &(xi[WS(is, 1)]));
Chris@82 344 Tq = LD(&(xi[WS(is, 11)]), ivs, &(xi[WS(is, 1)]));
Chris@82 345 Tr = VSUB(Tp, Tq);
Chris@82 346 T15 = VADD(Tp, Tq);
Chris@82 347 Te = LD(&(xi[WS(is, 12)]), ivs, &(xi[0]));
Chris@82 348 Tf = LD(&(xi[WS(is, 2)]), ivs, &(xi[0]));
Chris@82 349 Tg = VSUB(Te, Tf);
Chris@82 350 T1b = VADD(Te, Tf);
Chris@82 351 }
Chris@82 352 Ts = VSUB(To, Tr);
Chris@82 353 TL = VSUB(T6, T9);
Chris@82 354 TM = VSUB(Td, Tg);
Chris@82 355 Tz = VSUB(Tv, Ty);
Chris@82 356 T13 = VSUB(T11, T12);
Chris@82 357 T16 = VSUB(T14, T15);
Chris@82 358 T1j = VADD(T13, T16);
Chris@82 359 T1u = VADD(T18, T19);
Chris@82 360 T1v = VADD(T1b, T1c);
Chris@82 361 T1w = VADD(T1u, T1v);
Chris@82 362 T1r = VADD(T11, T12);
Chris@82 363 T1s = VADD(T14, T15);
Chris@82 364 T1t = VADD(T1r, T1s);
Chris@82 365 T1a = VSUB(T18, T19);
Chris@82 366 T1d = VSUB(T1b, T1c);
Chris@82 367 T1k = VADD(T1a, T1d);
Chris@82 368 {
Chris@82 369 V Ta, Th, TC, TD;
Chris@82 370 Ta = VADD(T6, T9);
Chris@82 371 Th = VADD(Td, Tg);
Chris@82 372 Ti = VADD(Ta, Th);
Chris@82 373 Tk = VMUL(LDK(KP559016994), VSUB(Ta, Th));
Chris@82 374 TC = VADD(To, Tr);
Chris@82 375 TD = VADD(Tv, Ty);
Chris@82 376 TE = VMUL(LDK(KP559016994), VSUB(TC, TD));
Chris@82 377 TI = VADD(TC, TD);
Chris@82 378 }
Chris@82 379 }
/*
 * Output stage. Values stored early (T1H, T1I, T1J, ...) are kept in
 * variables of this scope so they can be paired with their neighbour in a
 * later STN2 call.
 */
Chris@82 380 {
Chris@82 381 V T1H, T1J, T1K, T1L, T1N, T1I, TZ, T10;
Chris@82 382 TZ = VADD(T3, Ti);
Chris@82 383 T10 = VBYI(VADD(TH, TI));
Chris@82 384 T1H = VSUB(TZ, T10);
Chris@82 385 STM2(&(xo[30]), T1H, ovs, &(xo[2]));
Chris@82 386 T1I = VADD(TZ, T10);
Chris@82 387 STM2(&(xo[10]), T1I, ovs, &(xo[2]));
/* Outputs at xo[0], xo[8], xo[16], xo[24], xo[32]. */
Chris@82 388 {
Chris@82 389 V T1x, T1z, T1A, T1E, T1G, T1C, T1D, T1F, T1B, T1M;
Chris@82 390 T1x = VMUL(LDK(KP559016994), VSUB(T1t, T1w));
Chris@82 391 T1z = VADD(T1t, T1w);
Chris@82 392 T1A = VFNMS(LDK(KP250000000), T1z, T1y);
Chris@82 393 T1C = VSUB(T1r, T1s);
Chris@82 394 T1D = VSUB(T1u, T1v);
Chris@82 395 T1E = VBYI(VFMA(LDK(KP951056516), T1C, VMUL(LDK(KP587785252), T1D)));
Chris@82 396 T1G = VBYI(VFNMS(LDK(KP951056516), T1D, VMUL(LDK(KP587785252), T1C)));
Chris@82 397 T1J = VADD(T1y, T1z);
Chris@82 398 STM2(&(xo[0]), T1J, ovs, &(xo[0]));
Chris@82 399 T1F = VSUB(T1A, T1x);
Chris@82 400 T1K = VSUB(T1F, T1G);
Chris@82 401 STM2(&(xo[16]), T1K, ovs, &(xo[0]));
Chris@82 402 T1L = VADD(T1G, T1F);
Chris@82 403 STM2(&(xo[24]), T1L, ovs, &(xo[0]));
Chris@82 404 T1B = VADD(T1x, T1A);
Chris@82 405 T1M = VSUB(T1B, T1E);
Chris@82 406 STM2(&(xo[8]), T1M, ovs, &(xo[0]));
Chris@82 407 STN2(&(xo[8]), T1M, T1I, ovs);
Chris@82 408 T1N = VADD(T1E, T1B);
Chris@82 409 STM2(&(xo[32]), T1N, ovs, &(xo[0]));
Chris@82 410 }
Chris@82 411 {
Chris@82 412 V T1O, T1P, T1R, T1S;
/* Outputs at xo[4], xo[12], xo[20], xo[28], xo[36]. */
Chris@82 413 {
Chris@82 414 V T1n, T1l, T1m, T1f, T1p, T17, T1e, T1q, T1Q, T1o;
Chris@82 415 T1n = VMUL(LDK(KP559016994), VSUB(T1j, T1k));
Chris@82 416 T1l = VADD(T1j, T1k);
Chris@82 417 T1m = VFNMS(LDK(KP250000000), T1l, T1i);
Chris@82 418 T17 = VSUB(T13, T16);
Chris@82 419 T1e = VSUB(T1a, T1d);
Chris@82 420 T1f = VBYI(VFNMS(LDK(KP951056516), T1e, VMUL(LDK(KP587785252), T17)));
Chris@82 421 T1p = VBYI(VFMA(LDK(KP951056516), T17, VMUL(LDK(KP587785252), T1e)));
Chris@82 422 T1O = VADD(T1i, T1l);
Chris@82 423 STM2(&(xo[20]), T1O, ovs, &(xo[0]));
Chris@82 424 T1q = VADD(T1n, T1m);
Chris@82 425 T1P = VADD(T1p, T1q);
Chris@82 426 STM2(&(xo[12]), T1P, ovs, &(xo[0]));
Chris@82 427 T1Q = VSUB(T1q, T1p);
Chris@82 428 STM2(&(xo[28]), T1Q, ovs, &(xo[0]));
Chris@82 429 STN2(&(xo[28]), T1Q, T1H, ovs);
Chris@82 430 T1o = VSUB(T1m, T1n);
Chris@82 431 T1R = VADD(T1f, T1o);
Chris@82 432 STM2(&(xo[4]), T1R, ovs, &(xo[0]));
Chris@82 433 T1S = VSUB(T1o, T1f);
Chris@82 434 STM2(&(xo[36]), T1S, ovs, &(xo[0]));
Chris@82 435 }
/*
 * Outputs at xo[2], xo[6], xo[14], xo[18], xo[22], xo[26], xo[34], xo[38];
 * each one completes an STN2 pair with a value stored earlier.
 */
Chris@82 436 {
Chris@82 437 V TA, TN, TU, TS, TK, TV, Tl, TR, TJ, Tj;
Chris@82 438 TA = VFNMS(LDK(KP951056516), Tz, VMUL(LDK(KP587785252), Ts));
Chris@82 439 TN = VFNMS(LDK(KP951056516), TM, VMUL(LDK(KP587785252), TL));
Chris@82 440 TU = VFMA(LDK(KP951056516), TL, VMUL(LDK(KP587785252), TM));
Chris@82 441 TS = VFMA(LDK(KP951056516), Ts, VMUL(LDK(KP587785252), Tz));
Chris@82 442 TJ = VFNMS(LDK(KP250000000), TI, TH);
Chris@82 443 TK = VSUB(TE, TJ);
Chris@82 444 TV = VADD(TE, TJ);
Chris@82 445 Tj = VFNMS(LDK(KP250000000), Ti, T3);
Chris@82 446 Tl = VSUB(Tj, Tk);
Chris@82 447 TR = VADD(Tk, Tj);
Chris@82 448 {
Chris@82 449 V TB, TO, T1T, T1U;
Chris@82 450 TB = VSUB(Tl, TA);
Chris@82 451 TO = VBYI(VSUB(TK, TN));
Chris@82 452 T1T = VSUB(TB, TO);
Chris@82 453 STM2(&(xo[34]), T1T, ovs, &(xo[2]));
Chris@82 454 STN2(&(xo[32]), T1N, T1T, ovs);
Chris@82 455 T1U = VADD(TB, TO);
Chris@82 456 STM2(&(xo[6]), T1U, ovs, &(xo[2]));
Chris@82 457 STN2(&(xo[4]), T1R, T1U, ovs);
Chris@82 458 }
Chris@82 459 {
Chris@82 460 V TX, TY, T1V, T1W;
Chris@82 461 TX = VADD(TR, TS);
Chris@82 462 TY = VBYI(VSUB(TV, TU));
Chris@82 463 T1V = VSUB(TX, TY);
Chris@82 464 STM2(&(xo[22]), T1V, ovs, &(xo[2]));
Chris@82 465 STN2(&(xo[20]), T1O, T1V, ovs);
Chris@82 466 T1W = VADD(TX, TY);
Chris@82 467 STM2(&(xo[18]), T1W, ovs, &(xo[2]));
Chris@82 468 STN2(&(xo[16]), T1K, T1W, ovs);
Chris@82 469 }
Chris@82 470 {
Chris@82 471 V TP, TQ, T1X, T1Y;
Chris@82 472 TP = VADD(Tl, TA);
Chris@82 473 TQ = VBYI(VADD(TN, TK));
Chris@82 474 T1X = VSUB(TP, TQ);
Chris@82 475 STM2(&(xo[26]), T1X, ovs, &(xo[2]));
Chris@82 476 STN2(&(xo[24]), T1L, T1X, ovs);
Chris@82 477 T1Y = VADD(TP, TQ);
Chris@82 478 STM2(&(xo[14]), T1Y, ovs, &(xo[2]));
Chris@82 479 STN2(&(xo[12]), T1P, T1Y, ovs);
Chris@82 480 }
Chris@82 481 {
Chris@82 482 V TT, TW, T1Z, T20;
Chris@82 483 TT = VSUB(TR, TS);
Chris@82 484 TW = VBYI(VADD(TU, TV));
Chris@82 485 T1Z = VSUB(TT, TW);
Chris@82 486 STM2(&(xo[38]), T1Z, ovs, &(xo[2]));
Chris@82 487 STN2(&(xo[36]), T1S, T1Z, ovs);
Chris@82 488 T20 = VADD(TT, TW);
Chris@82 489 STM2(&(xo[2]), T20, ovs, &(xo[2]));
Chris@82 490 STN2(&(xo[0]), T1J, T20, ovs);
Chris@82 491 }
Chris@82 492 }
Chris@82 493 }
Chris@82 494 }
Chris@82 495 }
Chris@82 496 }
Chris@82 497 VLEAVE();
Chris@82 498 }
Chris@82 499
/*
 * Descriptor handed to the planner for the non-FMA variant of n2bv_20:
 * size 20, the codelet name string, and the operation counts
 * {92, 12, 12, 0}, which match the "92 additions, 12 multiplications,
 * 12 fused multiply/add" figures in the generated header comment.
 * NOTE(review): the trailing 0, 2, 0, 0 presumably are stride constraints
 * (the 2 matching -with-ostride 2) -- confirm against the kdft_desc
 * definition in dft/codelet-dft.h.
 */
Chris@82 500 static const kdft_desc desc = { 20, XSIMD_STRING("n2bv_20"), {92, 12, 12, 0}, &GENUS, 0, 2, 0, 0 };
Chris@82 501
/*
 * Registration entry point (non-FMA variant): passes the n2bv_20 kernel and
 * its descriptor `desc` to X(kdft_register) for planner `p`.
 */
Chris@82 502 void XSIMD(codelet_n2bv_20) (planner *p) {
Chris@82 503 X(kdft_register) (p, n2bv_20, &desc);
Chris@82 504 }
Chris@82 505
Chris@82 506 #endif