annotate src/fftw-3.3.3/dft/simd/common/n2bv_20.c @ 83:ae30d91d2ffe

Replace these with versions built using an older toolset (so as to avoid ABI incompatibilities when linking on Ubuntu 14.04 for packaging purposes)
author Chris Cannam
date Fri, 07 Feb 2020 11:51:13 +0000
parents 37bf6b4a2645
children
rev   line source
Chris@10 1 /*
Chris@10 2 * Copyright (c) 2003, 2007-11 Matteo Frigo
Chris@10 3 * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
Chris@10 4 *
Chris@10 5 * This program is free software; you can redistribute it and/or modify
Chris@10 6 * it under the terms of the GNU General Public License as published by
Chris@10 7 * the Free Software Foundation; either version 2 of the License, or
Chris@10 8 * (at your option) any later version.
Chris@10 9 *
Chris@10 10 * This program is distributed in the hope that it will be useful,
Chris@10 11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
Chris@10 12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
Chris@10 13 * GNU General Public License for more details.
Chris@10 14 *
Chris@10 15 * You should have received a copy of the GNU General Public License
Chris@10 16 * along with this program; if not, write to the Free Software
Chris@10 17 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
Chris@10 18 *
Chris@10 19 */
Chris@10 20
Chris@10 21 /* This file was automatically generated --- DO NOT EDIT */
Chris@10 22 /* Generated on Sun Nov 25 07:37:46 EST 2012 */
Chris@10 23
Chris@10 24 #include "codelet-dft.h"
Chris@10 25
Chris@10 26 #ifdef HAVE_FMA
Chris@10 27
Chris@10 28 /* Generated by: ../../../genfft/gen_notw_c.native -fma -reorder-insns -schedule-for-pipeline -simd -compact -variables 4 -pipeline-latency 8 -sign 1 -n 20 -name n2bv_20 -with-ostride 2 -include n2b.h -store-multiple 2 */
Chris@10 29
Chris@10 30 /*
Chris@10 31 * This function contains 104 FP additions, 50 FP multiplications,
Chris@10 32 * (or, 58 additions, 4 multiplications, 46 fused multiply/add),
Chris@10 33 * 79 stack variables, 4 constants, and 50 memory accesses
Chris@10 34 */
Chris@10 35 #include "n2b.h"
Chris@10 36
/*
 * n2bv_20, HAVE_FMA variant.
 *
 * Each pass of the loop reads 20 vector inputs xi[WS(is, 0)] .. xi[WS(is, 19)]
 * and hands 20 vector results to STM2 at the even offsets xo[0], xo[2], ...,
 * xo[38].
 *
 * Only ii and io are accessed (through xi and xo); ri and ro are not
 * referenced in the body, and os appears only in MAKE_VOLATILE_STRIDE.
 * The output offsets are literal constants rather than WS(os, k).
 *
 * The loop counts i down from v in steps of VL, advancing xi by VL * ivs and
 * xo by VL * ovs on every iteration.
 *
 * NOTE(review): V, DVK, LDK, LD, STM2, STN2, VADD, VSUB, VMUL, VFMA, VFNMS,
 * VFMAI, VFNMSI, WS, MAKE_VOLATILE_STRIDE and VLEAVE are defined outside this
 * file; presumably this is a size-20 backward complex DFT kernel on
 * interleaved data -- confirm against the SIMD headers and n2b.h.
 *
 * The statement order below is kept exactly as found; no statement has been
 * moved or changed.
 */
Chris@10 37 static void n2bv_20(const R *ri, const R *ii, R *ro, R *io, stride is, stride os, INT v, INT ivs, INT ovs)
Chris@10 38 {
/*
 * Numerically: 0.559016994... = sqrt(5)/4, 0.618033988... = (sqrt(5)-1)/2,
 * 0.951056516... = sin(2*pi/5).
 */
Chris@10 39 DVK(KP559016994, +0.559016994374947424102293417182819058860154590);
Chris@10 40 DVK(KP618033988, +0.618033988749894848204586834365638117720309180);
Chris@10 41 DVK(KP951056516, +0.951056516295153572116439333379382143405698634);
Chris@10 42 DVK(KP250000000, +0.250000000000000000000000000000000000000000000);
Chris@10 43 {
Chris@10 44 INT i;
Chris@10 45 const R *xi;
Chris@10 46 R *xo;
Chris@10 47 xi = ii;
Chris@10 48 xo = io;
Chris@10 49 for (i = v; i > 0; i = i - VL, xi = xi + (VL * ivs), xo = xo + (VL * ovs), MAKE_VOLATILE_STRIDE(40, is), MAKE_VOLATILE_STRIDE(40, os)) {
/*
 * Declared at loop scope because they are assigned in the first inner block
 * and read again later (by the final combination block and/or by STN2).
 */
Chris@10 50 V T1H, T1I, TS, TA, TN, TV, T1M, T1N, T1O, T1P, T1R, T1S, TK, TU, TR;
Chris@10 51 V Tl;
Chris@10 52 {
Chris@10 53 V T3, TE, T1r, T13, Ta, TL, Tz, TG, Ts, TF, Th, TM, T1u, T1C, T1n;
Chris@10 54 V T1a, T1m, T1h, T1x, T1D, Tk, Ti;
Chris@10 55 {
Chris@10 56 V T1, T2, TC, TD;
/*
 * Loads are interleaved with the first arithmetic stage: every input k is
 * paired with input k + 10 and the pair is combined with both VADD and VSUB.
 */
Chris@10 57 T1 = LD(&(xi[0]), ivs, &(xi[0]));
Chris@10 58 T2 = LD(&(xi[WS(is, 10)]), ivs, &(xi[0]));
Chris@10 59 TC = LD(&(xi[WS(is, 5)]), ivs, &(xi[WS(is, 1)]));
Chris@10 60 TD = LD(&(xi[WS(is, 15)]), ivs, &(xi[WS(is, 1)]));
Chris@10 61 {
Chris@10 62 V T14, T6, T1c, Tv, Tm, T1f, Ty, T17, T9, Tn, Tp, T1b, Td, Tq, Te;
Chris@10 63 V Tf, T15, To;
Chris@10 64 {
Chris@10 65 V Tw, Tx, T7, T8, Tb, Tc;
Chris@10 66 {
Chris@10 67 V T4, T5, Tt, Tu, T11, T12;
Chris@10 68 T4 = LD(&(xi[WS(is, 4)]), ivs, &(xi[0]));
Chris@10 69 T5 = LD(&(xi[WS(is, 14)]), ivs, &(xi[0]));
Chris@10 70 Tt = LD(&(xi[WS(is, 13)]), ivs, &(xi[WS(is, 1)]));
Chris@10 71 Tu = LD(&(xi[WS(is, 3)]), ivs, &(xi[WS(is, 1)]));
Chris@10 72 Tw = LD(&(xi[WS(is, 17)]), ivs, &(xi[WS(is, 1)]));
Chris@10 73 T3 = VSUB(T1, T2);
Chris@10 74 T11 = VADD(T1, T2);
Chris@10 75 TE = VSUB(TC, TD);
Chris@10 76 T12 = VADD(TC, TD);
Chris@10 77 T14 = VADD(T4, T5);
Chris@10 78 T6 = VSUB(T4, T5);
Chris@10 79 T1c = VADD(Tt, Tu);
Chris@10 80 Tv = VSUB(Tt, Tu);
Chris@10 81 Tx = LD(&(xi[WS(is, 7)]), ivs, &(xi[WS(is, 1)]));
Chris@10 82 T7 = LD(&(xi[WS(is, 16)]), ivs, &(xi[0]));
Chris@10 83 T8 = LD(&(xi[WS(is, 6)]), ivs, &(xi[0]));
Chris@10 84 T1r = VADD(T11, T12);
Chris@10 85 T13 = VSUB(T11, T12);
Chris@10 86 }
Chris@10 87 Tb = LD(&(xi[WS(is, 8)]), ivs, &(xi[0]));
Chris@10 88 Tc = LD(&(xi[WS(is, 18)]), ivs, &(xi[0]));
Chris@10 89 Tm = LD(&(xi[WS(is, 9)]), ivs, &(xi[WS(is, 1)]));
Chris@10 90 T1f = VADD(Tw, Tx);
Chris@10 91 Ty = VSUB(Tw, Tx);
Chris@10 92 T17 = VADD(T7, T8);
Chris@10 93 T9 = VSUB(T7, T8);
Chris@10 94 Tn = LD(&(xi[WS(is, 19)]), ivs, &(xi[WS(is, 1)]));
Chris@10 95 Tp = LD(&(xi[WS(is, 1)]), ivs, &(xi[WS(is, 1)]));
Chris@10 96 T1b = VADD(Tb, Tc);
Chris@10 97 Td = VSUB(Tb, Tc);
Chris@10 98 Tq = LD(&(xi[WS(is, 11)]), ivs, &(xi[WS(is, 1)]));
Chris@10 99 Te = LD(&(xi[WS(is, 12)]), ivs, &(xi[0]));
Chris@10 100 Tf = LD(&(xi[WS(is, 2)]), ivs, &(xi[0]));
Chris@10 101 }
Chris@10 102 Ta = VADD(T6, T9);
Chris@10 103 TL = VSUB(T6, T9);
Chris@10 104 T15 = VADD(Tm, Tn);
Chris@10 105 To = VSUB(Tm, Tn);
Chris@10 106 Tz = VSUB(Tv, Ty);
Chris@10 107 TG = VADD(Tv, Ty);
Chris@10 108 {
Chris@10 109 V T1d, T1v, T18, Tr, T1e, Tg, T16, T1s;
Chris@10 110 T1d = VSUB(T1b, T1c);
Chris@10 111 T1v = VADD(T1b, T1c);
Chris@10 112 T18 = VADD(Tp, Tq);
Chris@10 113 Tr = VSUB(Tp, Tq);
Chris@10 114 T1e = VADD(Te, Tf);
Chris@10 115 Tg = VSUB(Te, Tf);
Chris@10 116 T16 = VSUB(T14, T15);
Chris@10 117 T1s = VADD(T14, T15);
Chris@10 118 {
Chris@10 119 V T1t, T19, T1w, T1g;
Chris@10 120 T1t = VADD(T17, T18);
Chris@10 121 T19 = VSUB(T17, T18);
Chris@10 122 Ts = VSUB(To, Tr);
Chris@10 123 TF = VADD(To, Tr);
Chris@10 124 T1w = VADD(T1e, T1f);
Chris@10 125 T1g = VSUB(T1e, T1f);
Chris@10 126 Th = VADD(Td, Tg);
Chris@10 127 TM = VSUB(Td, Tg);
Chris@10 128 T1u = VADD(T1s, T1t);
Chris@10 129 T1C = VSUB(T1s, T1t);
Chris@10 130 T1n = VSUB(T16, T19);
Chris@10 131 T1a = VADD(T16, T19);
Chris@10 132 T1m = VSUB(T1d, T1g);
Chris@10 133 T1h = VADD(T1d, T1g);
Chris@10 134 T1x = VADD(T1v, T1w);
Chris@10 135 T1D = VSUB(T1v, T1w);
Chris@10 136 }
Chris@10 137 }
Chris@10 138 }
Chris@10 139 }
Chris@10 140 Tk = VSUB(Ta, Th);
Chris@10 141 Ti = VADD(Ta, Th);
Chris@10 142 {
Chris@10 143 V TJ, T1k, T1A, TZ, Tj, T1E, T1G, TI, T10, T1j, T1z, T1i, T1y, TH;
Chris@10 144 TJ = VSUB(TF, TG);
Chris@10 145 TH = VADD(TF, TG);
Chris@10 146 T1i = VADD(T1a, T1h);
Chris@10 147 T1k = VSUB(T1a, T1h);
Chris@10 148 T1y = VADD(T1u, T1x);
Chris@10 149 T1A = VSUB(T1u, T1x);
Chris@10 150 TZ = VADD(T3, Ti);
Chris@10 151 Tj = VFNMS(LDK(KP250000000), Ti, T3);
Chris@10 152 T1E = VMUL(LDK(KP951056516), VFMA(LDK(KP618033988), T1D, T1C));
Chris@10 153 T1G = VMUL(LDK(KP951056516), VFNMS(LDK(KP618033988), T1C, T1D));
Chris@10 154 TI = VFNMS(LDK(KP250000000), TH, TE);
Chris@10 155 T10 = VADD(TE, TH);
Chris@10 156 T1j = VFNMS(LDK(KP250000000), T1i, T13);
/*
 * T1H is the sum of all 20 loaded inputs (T1r covers inputs 0, 5, 10, 15;
 * T1y covers the remaining sixteen); it is stored at xo[0].
 *
 * Store pattern from here on: every result goes to STM2 at its own even
 * offset, and once the values for both xo[4j] and xo[4j + 2] exist they are
 * also passed together to STN2 at &xo[4j].
 */
Chris@10 157 T1H = VADD(T1r, T1y);
Chris@10 158 STM2(&(xo[0]), T1H, ovs, &(xo[0]));
Chris@10 159 T1z = VFNMS(LDK(KP250000000), T1y, T1r);
Chris@10 160 T1I = VADD(T13, T1i);
Chris@10 161 STM2(&(xo[20]), T1I, ovs, &(xo[0]));
Chris@10 162 {
Chris@10 163 V T1J, T1K, T1p, T1l, T1o, T1q, T1F, T1B, T1L, T1Q;
Chris@10 164 TS = VFNMS(LDK(KP618033988), Ts, Tz);
Chris@10 165 TA = VFMA(LDK(KP618033988), Tz, Ts);
Chris@10 166 TN = VFMA(LDK(KP618033988), TM, TL);
Chris@10 167 TV = VFNMS(LDK(KP618033988), TL, TM);
Chris@10 168 T1J = VFMAI(T10, TZ);
Chris@10 169 STM2(&(xo[10]), T1J, ovs, &(xo[2]));
Chris@10 170 T1K = VFNMSI(T10, TZ);
Chris@10 171 STM2(&(xo[30]), T1K, ovs, &(xo[2]));
Chris@10 172 T1p = VFMA(LDK(KP559016994), T1k, T1j);
Chris@10 173 T1l = VFNMS(LDK(KP559016994), T1k, T1j);
Chris@10 174 T1o = VMUL(LDK(KP951056516), VFNMS(LDK(KP618033988), T1n, T1m));
Chris@10 175 T1q = VMUL(LDK(KP951056516), VFMA(LDK(KP618033988), T1m, T1n));
Chris@10 176 T1F = VFNMS(LDK(KP559016994), T1A, T1z);
Chris@10 177 T1B = VFMA(LDK(KP559016994), T1A, T1z);
Chris@10 178 T1L = VFNMSI(T1q, T1p);
Chris@10 179 STM2(&(xo[28]), T1L, ovs, &(xo[0]));
Chris@10 180 STN2(&(xo[28]), T1L, T1K, ovs);
Chris@10 181 T1M = VFMAI(T1q, T1p);
Chris@10 182 STM2(&(xo[12]), T1M, ovs, &(xo[0]));
Chris@10 183 T1N = VFMAI(T1o, T1l);
Chris@10 184 STM2(&(xo[36]), T1N, ovs, &(xo[0]));
Chris@10 185 T1O = VFNMSI(T1o, T1l);
Chris@10 186 STM2(&(xo[4]), T1O, ovs, &(xo[0]));
Chris@10 187 T1P = VFMAI(T1E, T1B);
Chris@10 188 STM2(&(xo[32]), T1P, ovs, &(xo[0]));
Chris@10 189 T1Q = VFNMSI(T1E, T1B);
Chris@10 190 STM2(&(xo[8]), T1Q, ovs, &(xo[0]));
Chris@10 191 STN2(&(xo[8]), T1Q, T1J, ovs);
Chris@10 192 T1R = VFNMSI(T1G, T1F);
Chris@10 193 STM2(&(xo[24]), T1R, ovs, &(xo[0]));
Chris@10 194 T1S = VFMAI(T1G, T1F);
Chris@10 195 STM2(&(xo[16]), T1S, ovs, &(xo[0]));
Chris@10 196 TK = VFMA(LDK(KP559016994), TJ, TI);
Chris@10 197 TU = VFNMS(LDK(KP559016994), TJ, TI);
Chris@10 198 TR = VFNMS(LDK(KP559016994), Tk, Tj);
Chris@10 199 Tl = VFMA(LDK(KP559016994), Tk, Tj);
Chris@10 200 }
Chris@10 201 }
Chris@10 202 }
/*
 * Final stage: produces the eight results stored at xo[2], xo[6], xo[14],
 * xo[18], xo[22], xo[26], xo[34] and xo[38], and issues the matching STN2
 * calls that pair them with values computed in the block above.
 */
Chris@10 203 {
Chris@10 204 V TY, TW, TO, TQ, TB, TP, TX, TT;
Chris@10 205 TY = VFMA(LDK(KP951056516), TV, TU);
Chris@10 206 TW = VFNMS(LDK(KP951056516), TV, TU);
Chris@10 207 TO = VFMA(LDK(KP951056516), TN, TK);
Chris@10 208 TQ = VFNMS(LDK(KP951056516), TN, TK);
Chris@10 209 TB = VFNMS(LDK(KP951056516), TA, Tl);
Chris@10 210 TP = VFMA(LDK(KP951056516), TA, Tl);
Chris@10 211 TX = VFNMS(LDK(KP951056516), TS, TR);
Chris@10 212 TT = VFMA(LDK(KP951056516), TS, TR);
Chris@10 213 {
Chris@10 214 V T1T, T1U, T1V, T1W;
Chris@10 215 T1T = VFMAI(TQ, TP);
Chris@10 216 STM2(&(xo[18]), T1T, ovs, &(xo[2]));
Chris@10 217 STN2(&(xo[16]), T1S, T1T, ovs);
Chris@10 218 T1U = VFNMSI(TQ, TP);
Chris@10 219 STM2(&(xo[22]), T1U, ovs, &(xo[2]));
Chris@10 220 STN2(&(xo[20]), T1I, T1U, ovs);
Chris@10 221 T1V = VFMAI(TO, TB);
Chris@10 222 STM2(&(xo[2]), T1V, ovs, &(xo[2]));
Chris@10 223 STN2(&(xo[0]), T1H, T1V, ovs);
Chris@10 224 T1W = VFNMSI(TO, TB);
Chris@10 225 STM2(&(xo[38]), T1W, ovs, &(xo[2]));
Chris@10 226 STN2(&(xo[36]), T1N, T1W, ovs);
Chris@10 227 {
Chris@10 228 V T1X, T1Y, T1Z, T20;
Chris@10 229 T1X = VFMAI(TW, TT);
Chris@10 230 STM2(&(xo[34]), T1X, ovs, &(xo[2]));
Chris@10 231 STN2(&(xo[32]), T1P, T1X, ovs);
Chris@10 232 T1Y = VFNMSI(TW, TT);
Chris@10 233 STM2(&(xo[6]), T1Y, ovs, &(xo[2]));
Chris@10 234 STN2(&(xo[4]), T1O, T1Y, ovs);
Chris@10 235 T1Z = VFMAI(TY, TX);
Chris@10 236 STM2(&(xo[26]), T1Z, ovs, &(xo[2]));
Chris@10 237 STN2(&(xo[24]), T1R, T1Z, ovs);
Chris@10 238 T20 = VFNMSI(TY, TX);
Chris@10 239 STM2(&(xo[14]), T20, ovs, &(xo[2]));
Chris@10 240 STN2(&(xo[12]), T1M, T20, ovs);
Chris@10 241 }
Chris@10 242 }
Chris@10 243 }
Chris@10 244 }
Chris@10 245 }
Chris@10 246 VLEAVE();
Chris@10 247 }
Chris@10 248
/*
 * Descriptor handed to X(kdft_register) together with n2bv_20 (HAVE_FMA
 * build). It carries the value 20, the codelet name string, the tuple
 * {58, 4, 46, 0}, a pointer to GENUS and four trailing integers 0, 2, 0, 0.
 * NOTE(review): kdft_desc is declared elsewhere; presumably 20 is the
 * transform size, the tuple is an operation-count summary and the 2 is the
 * fixed output stride -- confirm against the kdft_desc declaration.
 */
Chris@10 249 static const kdft_desc desc = { 20, XSIMD_STRING("n2bv_20"), {58, 4, 46, 0}, &GENUS, 0, 2, 0, 0 };
Chris@10 250
/*
 * Registration entry point (HAVE_FMA build): passes the n2bv_20 kernel and
 * its descriptor to X(kdft_register) for the given planner p.
 * The exported symbol name is whatever XSIMD(codelet_n2bv_20) expands to.
 */
Chris@10 251 void XSIMD(codelet_n2bv_20) (planner *p) {
Chris@10 252 X(kdft_register) (p, n2bv_20, &desc);
Chris@10 253 }
Chris@10 254
Chris@10 255 #else /* HAVE_FMA */
Chris@10 256
Chris@10 257 /* Generated by: ../../../genfft/gen_notw_c.native -simd -compact -variables 4 -pipeline-latency 8 -sign 1 -n 20 -name n2bv_20 -with-ostride 2 -include n2b.h -store-multiple 2 */
Chris@10 258
Chris@10 259 /*
Chris@10 260 * This function contains 104 FP additions, 24 FP multiplications,
Chris@10 261 * (or, 92 additions, 12 multiplications, 12 fused multiply/add),
Chris@10 262 * 57 stack variables, 4 constants, and 50 memory accesses
Chris@10 263 */
Chris@10 264 #include "n2b.h"
Chris@10 265
/*
 * n2bv_20, variant compiled when HAVE_FMA is not defined.
 *
 * Each pass of the loop reads 20 vector inputs xi[WS(is, 0)] .. xi[WS(is, 19)]
 * and hands 20 vector results to STM2 at the even offsets xo[0], xo[2], ...,
 * xo[38].
 *
 * Only ii and io are accessed (through xi and xo); ri and ro are not
 * referenced in the body, and os appears only in MAKE_VOLATILE_STRIDE.
 * The output offsets are literal constants rather than WS(os, k).
 *
 * The loop counts i down from v in steps of VL, advancing xi by VL * ivs and
 * xo by VL * ovs on every iteration.
 *
 * NOTE(review): V, DVK, LDK, LD, STM2, STN2, VADD, VSUB, VMUL, VFMA, VFNMS,
 * VBYI, WS, MAKE_VOLATILE_STRIDE and VLEAVE are defined outside this file;
 * presumably this is a size-20 backward complex DFT kernel on interleaved
 * data -- confirm against the SIMD headers and n2b.h.
 *
 * The statement order below is kept exactly as found; no statement has been
 * moved or changed.
 */
Chris@10 266 static void n2bv_20(const R *ri, const R *ii, R *ro, R *io, stride is, stride os, INT v, INT ivs, INT ovs)
Chris@10 267 {
/*
 * Numerically: 0.587785252... = sin(pi/5), 0.951056516... = sin(2*pi/5),
 * 0.559016994... = sqrt(5)/4.
 */
Chris@10 268 DVK(KP587785252, +0.587785252292473129168705954639072768597652438);
Chris@10 269 DVK(KP951056516, +0.951056516295153572116439333379382143405698634);
Chris@10 270 DVK(KP250000000, +0.250000000000000000000000000000000000000000000);
Chris@10 271 DVK(KP559016994, +0.559016994374947424102293417182819058860154590);
Chris@10 272 {
Chris@10 273 INT i;
Chris@10 274 const R *xi;
Chris@10 275 R *xo;
Chris@10 276 xi = ii;
Chris@10 277 xo = io;
Chris@10 278 for (i = v; i > 0; i = i - VL, xi = xi + (VL * ivs), xo = xo + (VL * ovs), MAKE_VOLATILE_STRIDE(40, is), MAKE_VOLATILE_STRIDE(40, os)) {
/* Intermediates assigned in the two load blocks and consumed by the store block. */
Chris@10 279 V T3, T1y, TH, T1i, Ts, TL, TM, Tz, T13, T16, T1j, T1u, T1v, T1w, T1r;
Chris@10 280 V T1s, T1t, T1a, T1d, T1k, Ti, Tk, TE, TI;
/*
 * Inputs 0, 10, 5 and 15: each pair (k, k + 10) is combined with VADD and
 * VSUB, then the two sums are combined again.
 */
Chris@10 281 {
Chris@10 282 V T1, T2, T1g, TF, TG, T1h;
Chris@10 283 T1 = LD(&(xi[0]), ivs, &(xi[0]));
Chris@10 284 T2 = LD(&(xi[WS(is, 10)]), ivs, &(xi[0]));
Chris@10 285 T1g = VADD(T1, T2);
Chris@10 286 TF = LD(&(xi[WS(is, 5)]), ivs, &(xi[WS(is, 1)]));
Chris@10 287 TG = LD(&(xi[WS(is, 15)]), ivs, &(xi[WS(is, 1)]));
Chris@10 288 T1h = VADD(TF, TG);
Chris@10 289 T3 = VSUB(T1, T2);
Chris@10 290 T1y = VADD(T1g, T1h);
Chris@10 291 TH = VSUB(TF, TG);
Chris@10 292 T1i = VSUB(T1g, T1h);
Chris@10 293 }
/*
 * Remaining sixteen inputs, again loaded as pairs (k, k + 10) and combined
 * with VSUB/VADD before the cross-pair sums and differences are formed.
 */
Chris@10 294 {
Chris@10 295 V T6, T11, Tv, T19, Ty, T1c, T9, T14, Td, T18, To, T12, Tr, T15, Tg;
Chris@10 296 V T1b;
Chris@10 297 {
Chris@10 298 V T4, T5, Tt, Tu;
Chris@10 299 T4 = LD(&(xi[WS(is, 4)]), ivs, &(xi[0]));
Chris@10 300 T5 = LD(&(xi[WS(is, 14)]), ivs, &(xi[0]));
Chris@10 301 T6 = VSUB(T4, T5);
Chris@10 302 T11 = VADD(T4, T5);
Chris@10 303 Tt = LD(&(xi[WS(is, 13)]), ivs, &(xi[WS(is, 1)]));
Chris@10 304 Tu = LD(&(xi[WS(is, 3)]), ivs, &(xi[WS(is, 1)]));
Chris@10 305 Tv = VSUB(Tt, Tu);
Chris@10 306 T19 = VADD(Tt, Tu);
Chris@10 307 }
Chris@10 308 {
Chris@10 309 V Tw, Tx, T7, T8;
Chris@10 310 Tw = LD(&(xi[WS(is, 17)]), ivs, &(xi[WS(is, 1)]));
Chris@10 311 Tx = LD(&(xi[WS(is, 7)]), ivs, &(xi[WS(is, 1)]));
Chris@10 312 Ty = VSUB(Tw, Tx);
Chris@10 313 T1c = VADD(Tw, Tx);
Chris@10 314 T7 = LD(&(xi[WS(is, 16)]), ivs, &(xi[0]));
Chris@10 315 T8 = LD(&(xi[WS(is, 6)]), ivs, &(xi[0]));
Chris@10 316 T9 = VSUB(T7, T8);
Chris@10 317 T14 = VADD(T7, T8);
Chris@10 318 }
Chris@10 319 {
Chris@10 320 V Tb, Tc, Tm, Tn;
Chris@10 321 Tb = LD(&(xi[WS(is, 8)]), ivs, &(xi[0]));
Chris@10 322 Tc = LD(&(xi[WS(is, 18)]), ivs, &(xi[0]));
Chris@10 323 Td = VSUB(Tb, Tc);
Chris@10 324 T18 = VADD(Tb, Tc);
Chris@10 325 Tm = LD(&(xi[WS(is, 9)]), ivs, &(xi[WS(is, 1)]));
Chris@10 326 Tn = LD(&(xi[WS(is, 19)]), ivs, &(xi[WS(is, 1)]));
Chris@10 327 To = VSUB(Tm, Tn);
Chris@10 328 T12 = VADD(Tm, Tn);
Chris@10 329 }
Chris@10 330 {
Chris@10 331 V Tp, Tq, Te, Tf;
Chris@10 332 Tp = LD(&(xi[WS(is, 1)]), ivs, &(xi[WS(is, 1)]));
Chris@10 333 Tq = LD(&(xi[WS(is, 11)]), ivs, &(xi[WS(is, 1)]));
Chris@10 334 Tr = VSUB(Tp, Tq);
Chris@10 335 T15 = VADD(Tp, Tq);
Chris@10 336 Te = LD(&(xi[WS(is, 12)]), ivs, &(xi[0]));
Chris@10 337 Tf = LD(&(xi[WS(is, 2)]), ivs, &(xi[0]));
Chris@10 338 Tg = VSUB(Te, Tf);
Chris@10 339 T1b = VADD(Te, Tf);
Chris@10 340 }
Chris@10 341 Ts = VSUB(To, Tr);
Chris@10 342 TL = VSUB(T6, T9);
Chris@10 343 TM = VSUB(Td, Tg);
Chris@10 344 Tz = VSUB(Tv, Ty);
Chris@10 345 T13 = VSUB(T11, T12);
Chris@10 346 T16 = VSUB(T14, T15);
Chris@10 347 T1j = VADD(T13, T16);
Chris@10 348 T1u = VADD(T18, T19);
Chris@10 349 T1v = VADD(T1b, T1c);
Chris@10 350 T1w = VADD(T1u, T1v);
Chris@10 351 T1r = VADD(T11, T12);
Chris@10 352 T1s = VADD(T14, T15);
Chris@10 353 T1t = VADD(T1r, T1s);
Chris@10 354 T1a = VSUB(T18, T19);
Chris@10 355 T1d = VSUB(T1b, T1c);
Chris@10 356 T1k = VADD(T1a, T1d);
Chris@10 357 {
Chris@10 358 V Ta, Th, TC, TD;
Chris@10 359 Ta = VADD(T6, T9);
Chris@10 360 Th = VADD(Td, Tg);
Chris@10 361 Ti = VADD(Ta, Th);
Chris@10 362 Tk = VMUL(LDK(KP559016994), VSUB(Ta, Th));
Chris@10 363 TC = VADD(To, Tr);
Chris@10 364 TD = VADD(Tv, Ty);
Chris@10 365 TE = VMUL(LDK(KP559016994), VSUB(TC, TD));
Chris@10 366 TI = VADD(TC, TD);
Chris@10 367 }
Chris@10 368 }
/*
 * Store pattern: every result goes to STM2 at its own even offset, and once
 * the values for both xo[4j] and xo[4j + 2] exist they are also passed
 * together to STN2 at &xo[4j]. The variables declared at this level are the
 * ones whose STN2 partner is only produced in a later nested block.
 */
Chris@10 369 {
Chris@10 370 V T1H, T1J, T1K, T1L, T1N, T1I, TZ, T10;
Chris@10 371 TZ = VADD(T3, Ti);
Chris@10 372 T10 = VBYI(VADD(TH, TI));
Chris@10 373 T1H = VSUB(TZ, T10);
Chris@10 374 STM2(&(xo[30]), T1H, ovs, &(xo[2]));
Chris@10 375 T1I = VADD(TZ, T10);
Chris@10 376 STM2(&(xo[10]), T1I, ovs, &(xo[2]));
Chris@10 377 {
Chris@10 378 V T1x, T1z, T1A, T1E, T1G, T1C, T1D, T1F, T1B, T1M;
Chris@10 379 T1x = VMUL(LDK(KP559016994), VSUB(T1t, T1w));
Chris@10 380 T1z = VADD(T1t, T1w);
Chris@10 381 T1A = VFNMS(LDK(KP250000000), T1z, T1y);
Chris@10 382 T1C = VSUB(T1r, T1s);
Chris@10 383 T1D = VSUB(T1u, T1v);
Chris@10 384 T1E = VBYI(VFMA(LDK(KP951056516), T1C, VMUL(LDK(KP587785252), T1D)));
Chris@10 385 T1G = VBYI(VFNMS(LDK(KP951056516), T1D, VMUL(LDK(KP587785252), T1C)));
/*
 * T1J is the sum of all 20 loaded inputs (T1y covers inputs 0, 5, 10, 15;
 * T1z covers the remaining sixteen); it is stored at xo[0].
 */
Chris@10 386 T1J = VADD(T1y, T1z);
Chris@10 387 STM2(&(xo[0]), T1J, ovs, &(xo[0]));
Chris@10 388 T1F = VSUB(T1A, T1x);
Chris@10 389 T1K = VSUB(T1F, T1G);
Chris@10 390 STM2(&(xo[16]), T1K, ovs, &(xo[0]));
Chris@10 391 T1L = VADD(T1G, T1F);
Chris@10 392 STM2(&(xo[24]), T1L, ovs, &(xo[0]));
Chris@10 393 T1B = VADD(T1x, T1A);
Chris@10 394 T1M = VSUB(T1B, T1E);
Chris@10 395 STM2(&(xo[8]), T1M, ovs, &(xo[0]));
Chris@10 396 STN2(&(xo[8]), T1M, T1I, ovs);
Chris@10 397 T1N = VADD(T1E, T1B);
Chris@10 398 STM2(&(xo[32]), T1N, ovs, &(xo[0]));
Chris@10 399 }
Chris@10 400 {
Chris@10 401 V T1O, T1P, T1R, T1S;
Chris@10 402 {
Chris@10 403 V T1n, T1l, T1m, T1f, T1p, T17, T1e, T1q, T1Q, T1o;
Chris@10 404 T1n = VMUL(LDK(KP559016994), VSUB(T1j, T1k));
Chris@10 405 T1l = VADD(T1j, T1k);
Chris@10 406 T1m = VFNMS(LDK(KP250000000), T1l, T1i);
Chris@10 407 T17 = VSUB(T13, T16);
Chris@10 408 T1e = VSUB(T1a, T1d);
Chris@10 409 T1f = VBYI(VFNMS(LDK(KP951056516), T1e, VMUL(LDK(KP587785252), T17)));
Chris@10 410 T1p = VBYI(VFMA(LDK(KP951056516), T17, VMUL(LDK(KP587785252), T1e)));
Chris@10 411 T1O = VADD(T1i, T1l);
Chris@10 412 STM2(&(xo[20]), T1O, ovs, &(xo[0]));
Chris@10 413 T1q = VADD(T1n, T1m);
Chris@10 414 T1P = VADD(T1p, T1q);
Chris@10 415 STM2(&(xo[12]), T1P, ovs, &(xo[0]));
Chris@10 416 T1Q = VSUB(T1q, T1p);
Chris@10 417 STM2(&(xo[28]), T1Q, ovs, &(xo[0]));
Chris@10 418 STN2(&(xo[28]), T1Q, T1H, ovs);
Chris@10 419 T1o = VSUB(T1m, T1n);
Chris@10 420 T1R = VADD(T1f, T1o);
Chris@10 421 STM2(&(xo[4]), T1R, ovs, &(xo[0]));
Chris@10 422 T1S = VSUB(T1o, T1f);
Chris@10 423 STM2(&(xo[36]), T1S, ovs, &(xo[0]));
Chris@10 424 }
/*
 * Final stage: produces the eight results stored at xo[2], xo[6], xo[14],
 * xo[18], xo[22], xo[26], xo[34] and xo[38], and issues the matching STN2
 * calls that pair them with values computed earlier.
 */
Chris@10 425 {
Chris@10 426 V TA, TN, TU, TS, TK, TV, Tl, TR, TJ, Tj;
Chris@10 427 TA = VFNMS(LDK(KP951056516), Tz, VMUL(LDK(KP587785252), Ts));
Chris@10 428 TN = VFNMS(LDK(KP951056516), TM, VMUL(LDK(KP587785252), TL));
Chris@10 429 TU = VFMA(LDK(KP951056516), TL, VMUL(LDK(KP587785252), TM));
Chris@10 430 TS = VFMA(LDK(KP951056516), Ts, VMUL(LDK(KP587785252), Tz));
Chris@10 431 TJ = VFNMS(LDK(KP250000000), TI, TH);
Chris@10 432 TK = VSUB(TE, TJ);
Chris@10 433 TV = VADD(TE, TJ);
Chris@10 434 Tj = VFNMS(LDK(KP250000000), Ti, T3);
Chris@10 435 Tl = VSUB(Tj, Tk);
Chris@10 436 TR = VADD(Tk, Tj);
Chris@10 437 {
Chris@10 438 V TB, TO, T1T, T1U;
Chris@10 439 TB = VSUB(Tl, TA);
Chris@10 440 TO = VBYI(VSUB(TK, TN));
Chris@10 441 T1T = VSUB(TB, TO);
Chris@10 442 STM2(&(xo[34]), T1T, ovs, &(xo[2]));
Chris@10 443 STN2(&(xo[32]), T1N, T1T, ovs);
Chris@10 444 T1U = VADD(TB, TO);
Chris@10 445 STM2(&(xo[6]), T1U, ovs, &(xo[2]));
Chris@10 446 STN2(&(xo[4]), T1R, T1U, ovs);
Chris@10 447 }
Chris@10 448 {
Chris@10 449 V TX, TY, T1V, T1W;
Chris@10 450 TX = VADD(TR, TS);
Chris@10 451 TY = VBYI(VSUB(TV, TU));
Chris@10 452 T1V = VSUB(TX, TY);
Chris@10 453 STM2(&(xo[22]), T1V, ovs, &(xo[2]));
Chris@10 454 STN2(&(xo[20]), T1O, T1V, ovs);
Chris@10 455 T1W = VADD(TX, TY);
Chris@10 456 STM2(&(xo[18]), T1W, ovs, &(xo[2]));
Chris@10 457 STN2(&(xo[16]), T1K, T1W, ovs);
Chris@10 458 }
Chris@10 459 {
Chris@10 460 V TP, TQ, T1X, T1Y;
Chris@10 461 TP = VADD(Tl, TA);
Chris@10 462 TQ = VBYI(VADD(TN, TK));
Chris@10 463 T1X = VSUB(TP, TQ);
Chris@10 464 STM2(&(xo[26]), T1X, ovs, &(xo[2]));
Chris@10 465 STN2(&(xo[24]), T1L, T1X, ovs);
Chris@10 466 T1Y = VADD(TP, TQ);
Chris@10 467 STM2(&(xo[14]), T1Y, ovs, &(xo[2]));
Chris@10 468 STN2(&(xo[12]), T1P, T1Y, ovs);
Chris@10 469 }
Chris@10 470 {
Chris@10 471 V TT, TW, T1Z, T20;
Chris@10 472 TT = VSUB(TR, TS);
Chris@10 473 TW = VBYI(VADD(TU, TV));
Chris@10 474 T1Z = VSUB(TT, TW);
Chris@10 475 STM2(&(xo[38]), T1Z, ovs, &(xo[2]));
Chris@10 476 STN2(&(xo[36]), T1S, T1Z, ovs);
Chris@10 477 T20 = VADD(TT, TW);
Chris@10 478 STM2(&(xo[2]), T20, ovs, &(xo[2]));
Chris@10 479 STN2(&(xo[0]), T1J, T20, ovs);
Chris@10 480 }
Chris@10 481 }
Chris@10 482 }
Chris@10 483 }
Chris@10 484 }
Chris@10 485 }
Chris@10 486 VLEAVE();
Chris@10 487 }
Chris@10 488
/*
 * Descriptor handed to X(kdft_register) together with n2bv_20 (build without
 * HAVE_FMA). It carries the value 20, the codelet name string, the tuple
 * {92, 12, 12, 0}, a pointer to GENUS and four trailing integers 0, 2, 0, 0.
 * NOTE(review): kdft_desc is declared elsewhere; presumably 20 is the
 * transform size, the tuple is an operation-count summary and the 2 is the
 * fixed output stride -- confirm against the kdft_desc declaration.
 */
Chris@10 489 static const kdft_desc desc = { 20, XSIMD_STRING("n2bv_20"), {92, 12, 12, 0}, &GENUS, 0, 2, 0, 0 };
Chris@10 490
/*
 * Registration entry point (build without HAVE_FMA): passes the n2bv_20
 * kernel and its descriptor to X(kdft_register) for the given planner p.
 * The exported symbol name is whatever XSIMD(codelet_n2bv_20) expands to.
 */
Chris@10 491 void XSIMD(codelet_n2bv_20) (planner *p) {
Chris@10 492 X(kdft_register) (p, n2bv_20, &desc);
Chris@10 493 }
Chris@10 494
Chris@10 495 #endif /* HAVE_FMA */