annotate src/fftw-3.3.5/dft/simd/common/n2fv_20.c @ 83:ae30d91d2ffe

Replace these with versions built using an older toolset (so as to avoid ABI incompatibilities when linking on Ubuntu 14.04 for packaging purposes)
author Chris Cannam
date Fri, 07 Feb 2020 11:51:13 +0000
parents 2cd0e3b3e1fd
children
rev   line source
Chris@42 1 /*
Chris@42 2 * Copyright (c) 2003, 2007-14 Matteo Frigo
Chris@42 3 * Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
Chris@42 4 *
Chris@42 5 * This program is free software; you can redistribute it and/or modify
Chris@42 6 * it under the terms of the GNU General Public License as published by
Chris@42 7 * the Free Software Foundation; either version 2 of the License, or
Chris@42 8 * (at your option) any later version.
Chris@42 9 *
Chris@42 10 * This program is distributed in the hope that it will be useful,
Chris@42 11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
Chris@42 12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
Chris@42 13 * GNU General Public License for more details.
Chris@42 14 *
Chris@42 15 * You should have received a copy of the GNU General Public License
Chris@42 16 * along with this program; if not, write to the Free Software
Chris@42 17 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
Chris@42 18 *
Chris@42 19 */
Chris@42 20
Chris@42 21 /* This file was automatically generated --- DO NOT EDIT */
Chris@42 22 /* Generated on Sat Jul 30 16:40:30 EDT 2016 */
Chris@42 23
Chris@42 24 #include "codelet-dft.h"
Chris@42 25
Chris@42 26 #ifdef HAVE_FMA
Chris@42 27
Chris@42 28 /* Generated by: ../../../genfft/gen_notw_c.native -fma -reorder-insns -schedule-for-pipeline -simd -compact -variables 4 -pipeline-latency 8 -n 20 -name n2fv_20 -with-ostride 2 -include n2f.h -store-multiple 2 */
Chris@42 29
Chris@42 30 /*
Chris@42 31 * This function contains 104 FP additions, 50 FP multiplications,
Chris@42 32 * (or, 58 additions, 4 multiplications, 46 fused multiply/add),
Chris@42 33 * 79 stack variables, 4 constants, and 50 memory accesses
Chris@42 34 */
Chris@42 35 #include "n2f.h"
Chris@42 36
Chris@42 37 static void n2fv_20(const R *ri, const R *ii, R *ro, R *io, stride is, stride os, INT v, INT ivs, INT ovs) /* Size-20 SIMD DFT codelet, HAVE_FMA variant (generator flags are in the "Generated by" comment above). Each loop pass loads 20 inputs through ri and stores 20 results through ro; ii and io are not referenced in the body. */
Chris@42 38 {
Chris@42 39 DVK(KP559016994, +0.559016994374947424102293417182819058860154590); /* sqrt(5)/4 */
Chris@42 40 DVK(KP618033988, +0.618033988749894848204586834365638117720309180); /* (sqrt(5) - 1)/2 */
Chris@42 41 DVK(KP951056516, +0.951056516295153572116439333379382143405698634); /* sin(2*pi/5) */
Chris@42 42 DVK(KP250000000, +0.250000000000000000000000000000000000000000000); /* 1/4 */
Chris@42 43 {
Chris@42 44 INT i;
Chris@42 45 const R *xi;
Chris@42 46 R *xo;
Chris@42 47 xi = ri; /* every load below goes through xi */
Chris@42 48 xo = ro; /* every store below goes through xo */
Chris@42 49 for (i = v; i > 0; i = i - VL, xi = xi + (VL * ivs), xo = xo + (VL * ovs), MAKE_VOLATILE_STRIDE(40, is), MAKE_VOLATILE_STRIDE(40, os)) { /* i counts down from v in steps of VL; xi and xo advance by VL*ivs and VL*ovs per pass. os is only mentioned here: output offsets below are literal constants. */
Chris@42 50 V T1H, T1I, TU, TI, TP, TX, T1M, T1N, T1O, T1P, T1R, T1S, TM, TW, TT; /* declared at loop scope: written in the first nested block, read again in the second */
Chris@42 51 V TF;
Chris@42 52 {
Chris@42 53 V T3, Tm, T1r, T13, Ta, TN, TH, TA, TG, Tt, Th, TO, T1u, T1C, T1n;
Chris@42 54 V T1a, T1m, T1h, T1x, T1D, TE, Ti;
Chris@42 55 {
Chris@42 56 V T1, T2, Tk, Tl;
Chris@42 57 T1 = LD(&(xi[0]), ivs, &(xi[0])); /* load phase: inputs are fetched in pairs whose indices differ by 10 (0/10, 5/15, 4/14, ...); each pair feeds one VADD and one VSUB */
Chris@42 58 T2 = LD(&(xi[WS(is, 10)]), ivs, &(xi[0]));
Chris@42 59 Tk = LD(&(xi[WS(is, 5)]), ivs, &(xi[WS(is, 1)]));
Chris@42 60 Tl = LD(&(xi[WS(is, 15)]), ivs, &(xi[WS(is, 1)]));
Chris@42 61 {
Chris@42 62 V T14, T6, T1c, Tw, Tn, T1f, Tz, T17, T9, To, Tq, T1b, Td, Tr, Te;
Chris@42 63 V Tf, T15, Tp;
Chris@42 64 {
Chris@42 65 V Tx, Ty, T7, T8, Tb, Tc;
Chris@42 66 {
Chris@42 67 V T4, T5, Tu, Tv, T11, T12;
Chris@42 68 T4 = LD(&(xi[WS(is, 4)]), ivs, &(xi[0]));
Chris@42 69 T5 = LD(&(xi[WS(is, 14)]), ivs, &(xi[0]));
Chris@42 70 Tu = LD(&(xi[WS(is, 13)]), ivs, &(xi[WS(is, 1)]));
Chris@42 71 Tv = LD(&(xi[WS(is, 3)]), ivs, &(xi[WS(is, 1)]));
Chris@42 72 Tx = LD(&(xi[WS(is, 17)]), ivs, &(xi[WS(is, 1)]));
Chris@42 73 T3 = VSUB(T1, T2);
Chris@42 74 T11 = VADD(T1, T2);
Chris@42 75 Tm = VSUB(Tk, Tl);
Chris@42 76 T12 = VADD(Tk, Tl);
Chris@42 77 T14 = VADD(T4, T5);
Chris@42 78 T6 = VSUB(T4, T5);
Chris@42 79 T1c = VADD(Tu, Tv);
Chris@42 80 Tw = VSUB(Tu, Tv);
Chris@42 81 Ty = LD(&(xi[WS(is, 7)]), ivs, &(xi[WS(is, 1)]));
Chris@42 82 T7 = LD(&(xi[WS(is, 16)]), ivs, &(xi[0]));
Chris@42 83 T8 = LD(&(xi[WS(is, 6)]), ivs, &(xi[0]));
Chris@42 84 T1r = VADD(T11, T12);
Chris@42 85 T13 = VSUB(T11, T12);
Chris@42 86 }
Chris@42 87 Tb = LD(&(xi[WS(is, 8)]), ivs, &(xi[0]));
Chris@42 88 Tc = LD(&(xi[WS(is, 18)]), ivs, &(xi[0]));
Chris@42 89 Tn = LD(&(xi[WS(is, 9)]), ivs, &(xi[WS(is, 1)]));
Chris@42 90 T1f = VADD(Tx, Ty);
Chris@42 91 Tz = VSUB(Tx, Ty);
Chris@42 92 T17 = VADD(T7, T8);
Chris@42 93 T9 = VSUB(T7, T8);
Chris@42 94 To = LD(&(xi[WS(is, 19)]), ivs, &(xi[WS(is, 1)]));
Chris@42 95 Tq = LD(&(xi[WS(is, 1)]), ivs, &(xi[WS(is, 1)]));
Chris@42 96 T1b = VADD(Tb, Tc);
Chris@42 97 Td = VSUB(Tb, Tc);
Chris@42 98 Tr = LD(&(xi[WS(is, 11)]), ivs, &(xi[WS(is, 1)]));
Chris@42 99 Te = LD(&(xi[WS(is, 12)]), ivs, &(xi[0]));
Chris@42 100 Tf = LD(&(xi[WS(is, 2)]), ivs, &(xi[0])); /* last of the 20 loads; everything after this is arithmetic and stores */
Chris@42 101 }
Chris@42 102 Ta = VADD(T6, T9);
Chris@42 103 TN = VSUB(T6, T9);
Chris@42 104 T15 = VADD(Tn, To);
Chris@42 105 Tp = VSUB(Tn, To);
Chris@42 106 TH = VSUB(Tz, Tw);
Chris@42 107 TA = VADD(Tw, Tz);
Chris@42 108 {
Chris@42 109 V T1d, T1v, T18, Ts, T1e, Tg, T16, T1s;
Chris@42 110 T1d = VSUB(T1b, T1c);
Chris@42 111 T1v = VADD(T1b, T1c);
Chris@42 112 T18 = VADD(Tq, Tr);
Chris@42 113 Ts = VSUB(Tq, Tr);
Chris@42 114 T1e = VADD(Te, Tf);
Chris@42 115 Tg = VSUB(Te, Tf);
Chris@42 116 T16 = VSUB(T14, T15);
Chris@42 117 T1s = VADD(T14, T15);
Chris@42 118 {
Chris@42 119 V T1t, T19, T1w, T1g;
Chris@42 120 T1t = VADD(T17, T18);
Chris@42 121 T19 = VSUB(T17, T18);
Chris@42 122 TG = VSUB(Ts, Tp);
Chris@42 123 Tt = VADD(Tp, Ts);
Chris@42 124 T1w = VADD(T1e, T1f);
Chris@42 125 T1g = VSUB(T1e, T1f);
Chris@42 126 Th = VADD(Td, Tg);
Chris@42 127 TO = VSUB(Td, Tg);
Chris@42 128 T1u = VADD(T1s, T1t);
Chris@42 129 T1C = VSUB(T1s, T1t);
Chris@42 130 T1n = VSUB(T16, T19);
Chris@42 131 T1a = VADD(T16, T19);
Chris@42 132 T1m = VSUB(T1d, T1g);
Chris@42 133 T1h = VADD(T1d, T1g);
Chris@42 134 T1x = VADD(T1v, T1w);
Chris@42 135 T1D = VSUB(T1v, T1w);
Chris@42 136 }
Chris@42 137 }
Chris@42 138 }
Chris@42 139 }
Chris@42 140 TE = VSUB(Ta, Th);
Chris@42 141 Ti = VADD(Ta, Th);
Chris@42 142 {
Chris@42 143 V TL, T1k, T1A, Tj, TD, T1E, T1G, TK, TC, T1j, T1z, T1i, T1y, TB;
Chris@42 144 TL = VSUB(TA, Tt);
Chris@42 145 TB = VADD(Tt, TA);
Chris@42 146 T1i = VADD(T1a, T1h);
Chris@42 147 T1k = VSUB(T1a, T1h);
Chris@42 148 T1y = VADD(T1u, T1x);
Chris@42 149 T1A = VSUB(T1u, T1x);
Chris@42 150 Tj = VADD(T3, Ti);
Chris@42 151 TD = VFNMS(LDK(KP250000000), Ti, T3);
Chris@42 152 T1E = VMUL(LDK(KP951056516), VFMA(LDK(KP618033988), T1D, T1C));
Chris@42 153 T1G = VMUL(LDK(KP951056516), VFNMS(LDK(KP618033988), T1C, T1D));
Chris@42 154 TK = VFNMS(LDK(KP250000000), TB, Tm);
Chris@42 155 TC = VADD(Tm, TB);
Chris@42 156 T1j = VFNMS(LDK(KP250000000), T1i, T13);
Chris@42 157 T1H = VADD(T1r, T1y);
Chris@42 158 STM2(&(xo[0]), T1H, ovs, &(xo[0])); /* store phase begins. Every result is handed to STM2 at an even offset of xo; each later STN2 names two results whose STM2 offsets are adjacent (k, k+2). Exact store semantics live in the SIMD headers, not visible here. */
Chris@42 159 T1z = VFNMS(LDK(KP250000000), T1y, T1r);
Chris@42 160 T1I = VADD(T13, T1i);
Chris@42 161 STM2(&(xo[20]), T1I, ovs, &(xo[0]));
Chris@42 162 {
Chris@42 163 V T1J, T1K, T1p, T1l, T1o, T1q, T1F, T1B, T1L, T1Q;
Chris@42 164 TU = VFNMS(LDK(KP618033988), TG, TH);
Chris@42 165 TI = VFMA(LDK(KP618033988), TH, TG);
Chris@42 166 TP = VFMA(LDK(KP618033988), TO, TN);
Chris@42 167 TX = VFNMS(LDK(KP618033988), TN, TO);
Chris@42 168 T1J = VFMAI(TC, Tj); /* VFMAI/VFNMSI always appear as a +/- pair on the same operands; presumably add/subtract i times the first operand -- confirm in the SIMD headers */
Chris@42 169 STM2(&(xo[30]), T1J, ovs, &(xo[2]));
Chris@42 170 T1K = VFNMSI(TC, Tj);
Chris@42 171 STM2(&(xo[10]), T1K, ovs, &(xo[2]));
Chris@42 172 T1p = VFMA(LDK(KP559016994), T1k, T1j);
Chris@42 173 T1l = VFNMS(LDK(KP559016994), T1k, T1j);
Chris@42 174 T1o = VMUL(LDK(KP951056516), VFNMS(LDK(KP618033988), T1n, T1m));
Chris@42 175 T1q = VMUL(LDK(KP951056516), VFMA(LDK(KP618033988), T1m, T1n));
Chris@42 176 T1F = VFNMS(LDK(KP559016994), T1A, T1z);
Chris@42 177 T1B = VFMA(LDK(KP559016994), T1A, T1z);
Chris@42 178 T1L = VFMAI(T1q, T1p);
Chris@42 179 STM2(&(xo[28]), T1L, ovs, &(xo[0]));
Chris@42 180 STN2(&(xo[28]), T1L, T1J, ovs); /* pairs offsets 28 and 30 */
Chris@42 181 T1M = VFNMSI(T1q, T1p);
Chris@42 182 STM2(&(xo[12]), T1M, ovs, &(xo[0]));
Chris@42 183 T1N = VFNMSI(T1o, T1l);
Chris@42 184 STM2(&(xo[36]), T1N, ovs, &(xo[0]));
Chris@42 185 T1O = VFMAI(T1o, T1l);
Chris@42 186 STM2(&(xo[4]), T1O, ovs, &(xo[0]));
Chris@42 187 T1P = VFNMSI(T1E, T1B);
Chris@42 188 STM2(&(xo[32]), T1P, ovs, &(xo[0]));
Chris@42 189 T1Q = VFMAI(T1E, T1B);
Chris@42 190 STM2(&(xo[8]), T1Q, ovs, &(xo[0]));
Chris@42 191 STN2(&(xo[8]), T1Q, T1K, ovs); /* pairs offsets 8 and 10 */
Chris@42 192 T1R = VFMAI(T1G, T1F);
Chris@42 193 STM2(&(xo[24]), T1R, ovs, &(xo[0]));
Chris@42 194 T1S = VFNMSI(T1G, T1F);
Chris@42 195 STM2(&(xo[16]), T1S, ovs, &(xo[0]));
Chris@42 196 TM = VFNMS(LDK(KP559016994), TL, TK);
Chris@42 197 TW = VFMA(LDK(KP559016994), TL, TK);
Chris@42 198 TT = VFNMS(LDK(KP559016994), TE, TD);
Chris@42 199 TF = VFMA(LDK(KP559016994), TE, TD);
Chris@42 200 }
Chris@42 201 }
Chris@42 202 }
Chris@42 203 { /* second block: forms the remaining eight results (offsets 2, 6, 14, 18, 22, 26, 34, 38) and issues the STN2 that pairs each with its already-stored neighbour */
Chris@42 204 V T10, TY, TQ, TS, TJ, TR, TZ, TV;
Chris@42 205 T10 = VFMA(LDK(KP951056516), TX, TW);
Chris@42 206 TY = VFNMS(LDK(KP951056516), TX, TW);
Chris@42 207 TQ = VFMA(LDK(KP951056516), TP, TM);
Chris@42 208 TS = VFNMS(LDK(KP951056516), TP, TM);
Chris@42 209 TJ = VFMA(LDK(KP951056516), TI, TF);
Chris@42 210 TR = VFNMS(LDK(KP951056516), TI, TF);
Chris@42 211 TZ = VFMA(LDK(KP951056516), TU, TT);
Chris@42 212 TV = VFNMS(LDK(KP951056516), TU, TT);
Chris@42 213 {
Chris@42 214 V T1T, T1U, T1V, T1W;
Chris@42 215 T1T = VFMAI(TS, TR);
Chris@42 216 STM2(&(xo[22]), T1T, ovs, &(xo[2]));
Chris@42 217 STN2(&(xo[20]), T1I, T1T, ovs);
Chris@42 218 T1U = VFNMSI(TS, TR);
Chris@42 219 STM2(&(xo[18]), T1U, ovs, &(xo[2]));
Chris@42 220 STN2(&(xo[16]), T1S, T1U, ovs);
Chris@42 221 T1V = VFMAI(TQ, TJ);
Chris@42 222 STM2(&(xo[38]), T1V, ovs, &(xo[2]));
Chris@42 223 STN2(&(xo[36]), T1N, T1V, ovs);
Chris@42 224 T1W = VFNMSI(TQ, TJ);
Chris@42 225 STM2(&(xo[2]), T1W, ovs, &(xo[2]));
Chris@42 226 STN2(&(xo[0]), T1H, T1W, ovs);
Chris@42 227 {
Chris@42 228 V T1X, T1Y, T1Z, T20;
Chris@42 229 T1X = VFMAI(TY, TV);
Chris@42 230 STM2(&(xo[6]), T1X, ovs, &(xo[2]));
Chris@42 231 STN2(&(xo[4]), T1O, T1X, ovs);
Chris@42 232 T1Y = VFNMSI(TY, TV);
Chris@42 233 STM2(&(xo[34]), T1Y, ovs, &(xo[2]));
Chris@42 234 STN2(&(xo[32]), T1P, T1Y, ovs);
Chris@42 235 T1Z = VFMAI(T10, TZ);
Chris@42 236 STM2(&(xo[14]), T1Z, ovs, &(xo[2]));
Chris@42 237 STN2(&(xo[12]), T1M, T1Z, ovs);
Chris@42 238 T20 = VFNMSI(T10, TZ);
Chris@42 239 STM2(&(xo[26]), T20, ovs, &(xo[2]));
Chris@42 240 STN2(&(xo[24]), T1R, T20, ovs);
Chris@42 241 }
Chris@42 242 }
Chris@42 243 }
Chris@42 244 }
Chris@42 245 }
Chris@42 246 VLEAVE(); /* macro supplied by the SIMD support headers; its expansion is not visible in this file */
Chris@42 247 }
Chris@42 248
Chris@42 249 static const kdft_desc desc = { 20, XSIMD_STRING("n2fv_20"), {58, 4, 46, 0}, &GENUS, 0, 2, 0, 0 }; /* Descriptor passed to kdft_register below: 20 is the transform size and {58, 4, 46, 0} repeats the add / mul / fused-multiply-add counts from the header comment of this variant. NOTE(review): the trailing "2" looks like the -with-ostride 2 generator option; the field layout of kdft_desc is not visible here -- confirm. */
Chris@42 250
Chris@42 251 void XSIMD(codelet_n2fv_20) (planner *p) { /* Registration entry point for the HAVE_FMA build of this codelet. */
Chris@42 252 X(kdft_register) (p, n2fv_20, &desc); /* hands the kernel function and its descriptor to planner p */
Chris@42 253 }
Chris@42 254
Chris@42 255 #else /* HAVE_FMA */
Chris@42 256
Chris@42 257 /* Generated by: ../../../genfft/gen_notw_c.native -simd -compact -variables 4 -pipeline-latency 8 -n 20 -name n2fv_20 -with-ostride 2 -include n2f.h -store-multiple 2 */
Chris@42 258
Chris@42 259 /*
Chris@42 260 * This function contains 104 FP additions, 24 FP multiplications,
Chris@42 261 * (or, 92 additions, 12 multiplications, 12 fused multiply/add),
Chris@42 262 * 57 stack variables, 4 constants, and 50 memory accesses
Chris@42 263 */
Chris@42 264 #include "n2f.h"
Chris@42 265
Chris@42 266 static void n2fv_20(const R *ri, const R *ii, R *ro, R *io, stride is, stride os, INT v, INT ivs, INT ovs) /* Size-20 SIMD DFT codelet, variant compiled when HAVE_FMA is not defined (generator flags are in the "Generated by" comment above). Each loop pass loads 20 inputs through ri and stores 20 results through ro; ii and io are not referenced in the body. */
Chris@42 267 {
Chris@42 268 DVK(KP587785252, +0.587785252292473129168705954639072768597652438); /* sin(pi/5) */
Chris@42 269 DVK(KP951056516, +0.951056516295153572116439333379382143405698634); /* sin(2*pi/5) */
Chris@42 270 DVK(KP250000000, +0.250000000000000000000000000000000000000000000); /* 1/4 */
Chris@42 271 DVK(KP559016994, +0.559016994374947424102293417182819058860154590); /* sqrt(5)/4 */
Chris@42 272 {
Chris@42 273 INT i;
Chris@42 274 const R *xi;
Chris@42 275 R *xo;
Chris@42 276 xi = ri; /* every load below goes through xi */
Chris@42 277 xo = ro; /* every store below goes through xo */
Chris@42 278 for (i = v; i > 0; i = i - VL, xi = xi + (VL * ivs), xo = xo + (VL * ovs), MAKE_VOLATILE_STRIDE(40, is), MAKE_VOLATILE_STRIDE(40, os)) { /* i counts down from v in steps of VL; xi and xo advance by VL*ivs and VL*ovs per pass. os is only mentioned here: output offsets below are literal constants. */
Chris@42 279 V T3, T1B, Tm, T1i, TG, TN, TO, TH, T13, T16, T1k, T1u, T1v, T1z, T1r; /* declared at loop scope: written in the load blocks, read again in the output block further down */
Chris@42 280 V T1s, T1y, T1a, T1d, T1j, Ti, TD, TB, TL;
Chris@42 281 {
Chris@42 282 V T1, T2, T1g, Tk, Tl, T1h;
Chris@42 283 T1 = LD(&(xi[0]), ivs, &(xi[0])); /* load phase: inputs are fetched in pairs whose indices differ by 10 (0/10, 5/15, 4/14, ...); each pair feeds one VADD and one VSUB */
Chris@42 284 T2 = LD(&(xi[WS(is, 10)]), ivs, &(xi[0]));
Chris@42 285 T1g = VADD(T1, T2);
Chris@42 286 Tk = LD(&(xi[WS(is, 5)]), ivs, &(xi[WS(is, 1)]));
Chris@42 287 Tl = LD(&(xi[WS(is, 15)]), ivs, &(xi[WS(is, 1)]));
Chris@42 288 T1h = VADD(Tk, Tl);
Chris@42 289 T3 = VSUB(T1, T2);
Chris@42 290 T1B = VADD(T1g, T1h);
Chris@42 291 Tm = VSUB(Tk, Tl);
Chris@42 292 T1i = VSUB(T1g, T1h);
Chris@42 293 }
Chris@42 294 {
Chris@42 295 V T6, T18, Tw, T12, Tz, T15, T9, T1b, Td, T11, Tp, T19, Ts, T1c, Tg;
Chris@42 296 V T14;
Chris@42 297 {
Chris@42 298 V T4, T5, Tu, Tv;
Chris@42 299 T4 = LD(&(xi[WS(is, 4)]), ivs, &(xi[0]));
Chris@42 300 T5 = LD(&(xi[WS(is, 14)]), ivs, &(xi[0]));
Chris@42 301 T6 = VSUB(T4, T5);
Chris@42 302 T18 = VADD(T4, T5);
Chris@42 303 Tu = LD(&(xi[WS(is, 13)]), ivs, &(xi[WS(is, 1)]));
Chris@42 304 Tv = LD(&(xi[WS(is, 3)]), ivs, &(xi[WS(is, 1)]));
Chris@42 305 Tw = VSUB(Tu, Tv);
Chris@42 306 T12 = VADD(Tu, Tv);
Chris@42 307 }
Chris@42 308 {
Chris@42 309 V Tx, Ty, T7, T8;
Chris@42 310 Tx = LD(&(xi[WS(is, 17)]), ivs, &(xi[WS(is, 1)]));
Chris@42 311 Ty = LD(&(xi[WS(is, 7)]), ivs, &(xi[WS(is, 1)]));
Chris@42 312 Tz = VSUB(Tx, Ty);
Chris@42 313 T15 = VADD(Tx, Ty);
Chris@42 314 T7 = LD(&(xi[WS(is, 16)]), ivs, &(xi[0]));
Chris@42 315 T8 = LD(&(xi[WS(is, 6)]), ivs, &(xi[0]));
Chris@42 316 T9 = VSUB(T7, T8);
Chris@42 317 T1b = VADD(T7, T8);
Chris@42 318 }
Chris@42 319 {
Chris@42 320 V Tb, Tc, Tn, To;
Chris@42 321 Tb = LD(&(xi[WS(is, 8)]), ivs, &(xi[0]));
Chris@42 322 Tc = LD(&(xi[WS(is, 18)]), ivs, &(xi[0]));
Chris@42 323 Td = VSUB(Tb, Tc);
Chris@42 324 T11 = VADD(Tb, Tc);
Chris@42 325 Tn = LD(&(xi[WS(is, 9)]), ivs, &(xi[WS(is, 1)]));
Chris@42 326 To = LD(&(xi[WS(is, 19)]), ivs, &(xi[WS(is, 1)]));
Chris@42 327 Tp = VSUB(Tn, To);
Chris@42 328 T19 = VADD(Tn, To);
Chris@42 329 }
Chris@42 330 {
Chris@42 331 V Tq, Tr, Te, Tf;
Chris@42 332 Tq = LD(&(xi[WS(is, 1)]), ivs, &(xi[WS(is, 1)]));
Chris@42 333 Tr = LD(&(xi[WS(is, 11)]), ivs, &(xi[WS(is, 1)]));
Chris@42 334 Ts = VSUB(Tq, Tr);
Chris@42 335 T1c = VADD(Tq, Tr);
Chris@42 336 Te = LD(&(xi[WS(is, 12)]), ivs, &(xi[0]));
Chris@42 337 Tf = LD(&(xi[WS(is, 2)]), ivs, &(xi[0])); /* last of the 20 loads; everything after this is arithmetic and stores */
Chris@42 338 Tg = VSUB(Te, Tf);
Chris@42 339 T14 = VADD(Te, Tf);
Chris@42 340 }
Chris@42 341 TG = VSUB(Ts, Tp);
Chris@42 342 TN = VSUB(T6, T9);
Chris@42 343 TO = VSUB(Td, Tg);
Chris@42 344 TH = VSUB(Tz, Tw);
Chris@42 345 T13 = VSUB(T11, T12);
Chris@42 346 T16 = VSUB(T14, T15);
Chris@42 347 T1k = VADD(T13, T16);
Chris@42 348 T1u = VADD(T11, T12);
Chris@42 349 T1v = VADD(T14, T15);
Chris@42 350 T1z = VADD(T1u, T1v);
Chris@42 351 T1r = VADD(T18, T19);
Chris@42 352 T1s = VADD(T1b, T1c);
Chris@42 353 T1y = VADD(T1r, T1s);
Chris@42 354 T1a = VSUB(T18, T19);
Chris@42 355 T1d = VSUB(T1b, T1c);
Chris@42 356 T1j = VADD(T1a, T1d);
Chris@42 357 {
Chris@42 358 V Ta, Th, Tt, TA;
Chris@42 359 Ta = VADD(T6, T9);
Chris@42 360 Th = VADD(Td, Tg);
Chris@42 361 Ti = VADD(Ta, Th);
Chris@42 362 TD = VMUL(LDK(KP559016994), VSUB(Ta, Th));
Chris@42 363 Tt = VADD(Tp, Ts);
Chris@42 364 TA = VADD(Tw, Tz);
Chris@42 365 TB = VADD(Tt, TA);
Chris@42 366 TL = VMUL(LDK(KP559016994), VSUB(TA, Tt));
Chris@42 367 }
Chris@42 368 }
Chris@42 369 { /* output block: all 20 results are formed and stored below */
Chris@42 370 V T1I, T1J, T1K, T1L, T1N, T1H, Tj, TC;
Chris@42 371 Tj = VADD(T3, Ti);
Chris@42 372 TC = VBYI(VADD(Tm, TB)); /* VBYI presumably multiplies by the imaginary unit -- confirm in the SIMD headers */
Chris@42 373 T1H = VSUB(Tj, TC);
Chris@42 374 STM2(&(xo[10]), T1H, ovs, &(xo[2])); /* store phase begins. Every result is handed to STM2 at an even offset of xo; each later STN2 names two results whose STM2 offsets are adjacent (k, k+2). Exact store semantics live in the SIMD headers, not visible here. */
Chris@42 375 T1I = VADD(Tj, TC);
Chris@42 376 STM2(&(xo[30]), T1I, ovs, &(xo[2]));
Chris@42 377 {
Chris@42 378 V T1A, T1C, T1D, T1x, T1G, T1t, T1w, T1F, T1E, T1M;
Chris@42 379 T1A = VMUL(LDK(KP559016994), VSUB(T1y, T1z));
Chris@42 380 T1C = VADD(T1y, T1z);
Chris@42 381 T1D = VFNMS(LDK(KP250000000), T1C, T1B);
Chris@42 382 T1t = VSUB(T1r, T1s);
Chris@42 383 T1w = VSUB(T1u, T1v);
Chris@42 384 T1x = VBYI(VFMA(LDK(KP951056516), T1t, VMUL(LDK(KP587785252), T1w)));
Chris@42 385 T1G = VBYI(VFNMS(LDK(KP587785252), T1t, VMUL(LDK(KP951056516), T1w)));
Chris@42 386 T1J = VADD(T1B, T1C);
Chris@42 387 STM2(&(xo[0]), T1J, ovs, &(xo[0]));
Chris@42 388 T1F = VSUB(T1D, T1A);
Chris@42 389 T1K = VSUB(T1F, T1G);
Chris@42 390 STM2(&(xo[16]), T1K, ovs, &(xo[0]));
Chris@42 391 T1L = VADD(T1G, T1F);
Chris@42 392 STM2(&(xo[24]), T1L, ovs, &(xo[0]));
Chris@42 393 T1E = VADD(T1A, T1D);
Chris@42 394 T1M = VADD(T1x, T1E);
Chris@42 395 STM2(&(xo[8]), T1M, ovs, &(xo[0]));
Chris@42 396 STN2(&(xo[8]), T1M, T1H, ovs); /* pairs offsets 8 and 10 */
Chris@42 397 T1N = VSUB(T1E, T1x);
Chris@42 398 STM2(&(xo[32]), T1N, ovs, &(xo[0]));
Chris@42 399 }
Chris@42 400 {
Chris@42 401 V T1O, T1P, T1R, T1S;
Chris@42 402 {
Chris@42 403 V T1n, T1l, T1m, T1f, T1q, T17, T1e, T1p, T1Q, T1o;
Chris@42 404 T1n = VMUL(LDK(KP559016994), VSUB(T1j, T1k));
Chris@42 405 T1l = VADD(T1j, T1k);
Chris@42 406 T1m = VFNMS(LDK(KP250000000), T1l, T1i);
Chris@42 407 T17 = VSUB(T13, T16);
Chris@42 408 T1e = VSUB(T1a, T1d);
Chris@42 409 T1f = VBYI(VFNMS(LDK(KP587785252), T1e, VMUL(LDK(KP951056516), T17)));
Chris@42 410 T1q = VBYI(VFMA(LDK(KP951056516), T1e, VMUL(LDK(KP587785252), T17)));
Chris@42 411 T1O = VADD(T1i, T1l);
Chris@42 412 STM2(&(xo[20]), T1O, ovs, &(xo[0]));
Chris@42 413 T1p = VADD(T1n, T1m);
Chris@42 414 T1P = VSUB(T1p, T1q);
Chris@42 415 STM2(&(xo[12]), T1P, ovs, &(xo[0]));
Chris@42 416 T1Q = VADD(T1q, T1p);
Chris@42 417 STM2(&(xo[28]), T1Q, ovs, &(xo[0]));
Chris@42 418 STN2(&(xo[28]), T1Q, T1I, ovs); /* pairs offsets 28 and 30 */
Chris@42 419 T1o = VSUB(T1m, T1n);
Chris@42 420 T1R = VADD(T1f, T1o);
Chris@42 421 STM2(&(xo[4]), T1R, ovs, &(xo[0]));
Chris@42 422 T1S = VSUB(T1o, T1f);
Chris@42 423 STM2(&(xo[36]), T1S, ovs, &(xo[0]));
Chris@42 424 }
Chris@42 425 { /* forms the remaining eight results (offsets 2, 6, 14, 18, 22, 26, 34, 38) and issues the STN2 that pairs each with its already-stored neighbour */
Chris@42 426 V TI, TP, TX, TU, TM, TW, TF, TT, TK, TE;
Chris@42 427 TI = VFMA(LDK(KP951056516), TG, VMUL(LDK(KP587785252), TH));
Chris@42 428 TP = VFMA(LDK(KP951056516), TN, VMUL(LDK(KP587785252), TO));
Chris@42 429 TX = VFNMS(LDK(KP587785252), TN, VMUL(LDK(KP951056516), TO));
Chris@42 430 TU = VFNMS(LDK(KP587785252), TG, VMUL(LDK(KP951056516), TH));
Chris@42 431 TK = VFMS(LDK(KP250000000), TB, Tm); /* VFMS here, VFNMS three lines below: the two quarter-scaled terms use different macros, as emitted by the generator */
Chris@42 432 TM = VADD(TK, TL);
Chris@42 433 TW = VSUB(TL, TK);
Chris@42 434 TE = VFNMS(LDK(KP250000000), Ti, T3);
Chris@42 435 TF = VADD(TD, TE);
Chris@42 436 TT = VSUB(TE, TD);
Chris@42 437 {
Chris@42 438 V TJ, TQ, T1T, T1U;
Chris@42 439 TJ = VADD(TF, TI);
Chris@42 440 TQ = VBYI(VSUB(TM, TP));
Chris@42 441 T1T = VSUB(TJ, TQ);
Chris@42 442 STM2(&(xo[38]), T1T, ovs, &(xo[2]));
Chris@42 443 STN2(&(xo[36]), T1S, T1T, ovs);
Chris@42 444 T1U = VADD(TJ, TQ);
Chris@42 445 STM2(&(xo[2]), T1U, ovs, &(xo[2]));
Chris@42 446 STN2(&(xo[0]), T1J, T1U, ovs);
Chris@42 447 }
Chris@42 448 {
Chris@42 449 V TZ, T10, T1V, T1W;
Chris@42 450 TZ = VADD(TT, TU);
Chris@42 451 T10 = VBYI(VADD(TX, TW));
Chris@42 452 T1V = VSUB(TZ, T10);
Chris@42 453 STM2(&(xo[26]), T1V, ovs, &(xo[2]));
Chris@42 454 STN2(&(xo[24]), T1L, T1V, ovs);
Chris@42 455 T1W = VADD(TZ, T10);
Chris@42 456 STM2(&(xo[14]), T1W, ovs, &(xo[2]));
Chris@42 457 STN2(&(xo[12]), T1P, T1W, ovs);
Chris@42 458 }
Chris@42 459 {
Chris@42 460 V TR, TS, T1X, T1Y;
Chris@42 461 TR = VSUB(TF, TI);
Chris@42 462 TS = VBYI(VADD(TP, TM));
Chris@42 463 T1X = VSUB(TR, TS);
Chris@42 464 STM2(&(xo[22]), T1X, ovs, &(xo[2]));
Chris@42 465 STN2(&(xo[20]), T1O, T1X, ovs);
Chris@42 466 T1Y = VADD(TR, TS);
Chris@42 467 STM2(&(xo[18]), T1Y, ovs, &(xo[2]));
Chris@42 468 STN2(&(xo[16]), T1K, T1Y, ovs);
Chris@42 469 }
Chris@42 470 {
Chris@42 471 V TV, TY, T1Z, T20;
Chris@42 472 TV = VSUB(TT, TU);
Chris@42 473 TY = VBYI(VSUB(TW, TX));
Chris@42 474 T1Z = VSUB(TV, TY);
Chris@42 475 STM2(&(xo[34]), T1Z, ovs, &(xo[2]));
Chris@42 476 STN2(&(xo[32]), T1N, T1Z, ovs);
Chris@42 477 T20 = VADD(TV, TY);
Chris@42 478 STM2(&(xo[6]), T20, ovs, &(xo[2]));
Chris@42 479 STN2(&(xo[4]), T1R, T20, ovs);
Chris@42 480 }
Chris@42 481 }
Chris@42 482 }
Chris@42 483 }
Chris@42 484 }
Chris@42 485 }
Chris@42 486 VLEAVE(); /* macro supplied by the SIMD support headers; its expansion is not visible in this file */
Chris@42 487 }
Chris@42 488
Chris@42 489 static const kdft_desc desc = { 20, XSIMD_STRING("n2fv_20"), {92, 12, 12, 0}, &GENUS, 0, 2, 0, 0 }; /* Descriptor passed to kdft_register below: 20 is the transform size and {92, 12, 12, 0} repeats the add / mul / fused-multiply-add counts from the header comment of this variant. NOTE(review): the trailing "2" looks like the -with-ostride 2 generator option; the field layout of kdft_desc is not visible here -- confirm. */
Chris@42 490
Chris@42 491 void XSIMD(codelet_n2fv_20) (planner *p) { /* Registration entry point for the build without HAVE_FMA. */
Chris@42 492 X(kdft_register) (p, n2fv_20, &desc); /* hands the kernel function and its descriptor to planner p */
Chris@42 493 }
Chris@42 494
Chris@42 495 #endif /* HAVE_FMA */