annotate src/fftw-3.3.3/dft/simd/common/n1fv_20.c @ 83:ae30d91d2ffe

Replace these with versions built using an older toolset (so as to avoid ABI incompatibilities when linking on Ubuntu 14.04 for packaging purposes)
author Chris Cannam
date Fri, 07 Feb 2020 11:51:13 +0000
parents 37bf6b4a2645
children
/*
 * Copyright (c) 2003, 2007-11 Matteo Frigo
 * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 *
 */

/* This file was automatically generated --- DO NOT EDIT */
/* Generated on Sun Nov 25 07:36:54 EST 2012 */

#include "codelet-dft.h"

#ifdef HAVE_FMA

/* Generated by: ../../../genfft/gen_notw_c.native -fma -reorder-insns -schedule-for-pipeline -simd -compact -variables 4 -pipeline-latency 8 -n 20 -name n1fv_20 -include n1f.h */

/*
 * This function contains 104 FP additions, 50 FP multiplications,
 * (or, 58 additions, 4 multiplications, 46 fused multiply/add),
 * 71 stack variables, 4 constants, and 40 memory accesses
 */
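/*
 * A rough map of the SIMD macro layer used below (the exact definitions live
 * in n1f.h and the simd-support headers and vary by backend): DVK declares a
 * named floating-point constant and LDK loads/broadcasts it into a vector;
 * LD/ST load and store one vector of complex data at the given stride;
 * VADD/VSUB/VMUL are elementwise; and VFMA(a, b, c), VFMS(a, b, c) and
 * VFNMS(a, b, c) are the fused operations a*b+c, a*b-c and c-a*b.
 */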
#include "n1f.h"

static void n1fv_20(const R *ri, const R *ii, R *ro, R *io, stride is, stride os, INT v, INT ivs, INT ovs)
{
     DVK(KP559016994, +0.559016994374947424102293417182819058860154590);
     DVK(KP618033988, +0.618033988749894848204586834365638117720309180);
     DVK(KP951056516, +0.951056516295153572116439333379382143405698634);
     DVK(KP250000000, +0.250000000000000000000000000000000000000000000);
     {
          INT i;
          const R *xi;
          R *xo;
          xi = ri;
          xo = ro;
          for (i = v; i > 0; i = i - VL, xi = xi + (VL * ivs), xo = xo + (VL * ovs), MAKE_VOLATILE_STRIDE(40, is), MAKE_VOLATILE_STRIDE(40, os)) {
               V TU, TI, TP, TX, TM, TW, TT, TF;
               {
                    V T3, Tm, T1r, T13, Ta, TN, TH, TA, TG, Tt, Th, TO, T1u, T1C, T1n;
                    V T1a, T1m, T1h, T1x, T1D, TE, Ti;
                    {
                         V T1, T2, Tk, Tl;
                         T1 = LD(&(xi[0]), ivs, &(xi[0]));
                         T2 = LD(&(xi[WS(is, 10)]), ivs, &(xi[0]));
                         Tk = LD(&(xi[WS(is, 5)]), ivs, &(xi[WS(is, 1)]));
                         Tl = LD(&(xi[WS(is, 15)]), ivs, &(xi[WS(is, 1)]));
                         {
                              V T14, T6, T1c, Tw, Tn, T1f, Tz, T17, T9, To, Tq, T1b, Td, Tr, Te;
                              V Tf, T15, Tp;
                              {
                                   V Tx, Ty, T7, T8, Tb, Tc;
                                   {
                                        V T4, T5, Tu, Tv, T11, T12;
                                        T4 = LD(&(xi[WS(is, 4)]), ivs, &(xi[0]));
                                        T5 = LD(&(xi[WS(is, 14)]), ivs, &(xi[0]));
                                        Tu = LD(&(xi[WS(is, 13)]), ivs, &(xi[WS(is, 1)]));
                                        Tv = LD(&(xi[WS(is, 3)]), ivs, &(xi[WS(is, 1)]));
                                        Tx = LD(&(xi[WS(is, 17)]), ivs, &(xi[WS(is, 1)]));
                                        T3 = VSUB(T1, T2);
                                        T11 = VADD(T1, T2);
                                        Tm = VSUB(Tk, Tl);
                                        T12 = VADD(Tk, Tl);
                                        T14 = VADD(T4, T5);
                                        T6 = VSUB(T4, T5);
                                        T1c = VADD(Tu, Tv);
                                        Tw = VSUB(Tu, Tv);
                                        Ty = LD(&(xi[WS(is, 7)]), ivs, &(xi[WS(is, 1)]));
                                        T7 = LD(&(xi[WS(is, 16)]), ivs, &(xi[0]));
                                        T8 = LD(&(xi[WS(is, 6)]), ivs, &(xi[0]));
                                        T1r = VADD(T11, T12);
                                        T13 = VSUB(T11, T12);
                                   }
                                   Tb = LD(&(xi[WS(is, 8)]), ivs, &(xi[0]));
                                   Tc = LD(&(xi[WS(is, 18)]), ivs, &(xi[0]));
                                   Tn = LD(&(xi[WS(is, 9)]), ivs, &(xi[WS(is, 1)]));
                                   T1f = VADD(Tx, Ty);
                                   Tz = VSUB(Tx, Ty);
                                   T17 = VADD(T7, T8);
                                   T9 = VSUB(T7, T8);
                                   To = LD(&(xi[WS(is, 19)]), ivs, &(xi[WS(is, 1)]));
                                   Tq = LD(&(xi[WS(is, 1)]), ivs, &(xi[WS(is, 1)]));
                                   T1b = VADD(Tb, Tc);
                                   Td = VSUB(Tb, Tc);
                                   Tr = LD(&(xi[WS(is, 11)]), ivs, &(xi[WS(is, 1)]));
                                   Te = LD(&(xi[WS(is, 12)]), ivs, &(xi[0]));
                                   Tf = LD(&(xi[WS(is, 2)]), ivs, &(xi[0]));
                              }
                              Ta = VADD(T6, T9);
                              TN = VSUB(T6, T9);
                              T15 = VADD(Tn, To);
                              Tp = VSUB(Tn, To);
                              TH = VSUB(Tz, Tw);
                              TA = VADD(Tw, Tz);
                              {
                                   V T1d, T1v, T18, Ts, T1e, Tg, T16, T1s;
                                   T1d = VSUB(T1b, T1c);
                                   T1v = VADD(T1b, T1c);
                                   T18 = VADD(Tq, Tr);
                                   Ts = VSUB(Tq, Tr);
                                   T1e = VADD(Te, Tf);
                                   Tg = VSUB(Te, Tf);
                                   T16 = VSUB(T14, T15);
                                   T1s = VADD(T14, T15);
                                   {
                                        V T1t, T19, T1w, T1g;
                                        T1t = VADD(T17, T18);
                                        T19 = VSUB(T17, T18);
                                        TG = VSUB(Ts, Tp);
                                        Tt = VADD(Tp, Ts);
                                        T1w = VADD(T1e, T1f);
                                        T1g = VSUB(T1e, T1f);
                                        Th = VADD(Td, Tg);
                                        TO = VSUB(Td, Tg);
                                        T1u = VADD(T1s, T1t);
                                        T1C = VSUB(T1s, T1t);
                                        T1n = VSUB(T16, T19);
                                        T1a = VADD(T16, T19);
                                        T1m = VSUB(T1d, T1g);
                                        T1h = VADD(T1d, T1g);
                                        T1x = VADD(T1v, T1w);
                                        T1D = VSUB(T1v, T1w);
                                   }
                              }
                         }
                    }
                    TE = VSUB(Ta, Th);
                    Ti = VADD(Ta, Th);
                    {
                         V TL, T1k, T1A, Tj, TD, T1E, T1G, TK, TC, T1j, T1z, T1i, T1y, TB;
                         TL = VSUB(TA, Tt);
                         TB = VADD(Tt, TA);
                         T1i = VADD(T1a, T1h);
                         T1k = VSUB(T1a, T1h);
                         T1y = VADD(T1u, T1x);
                         T1A = VSUB(T1u, T1x);
                         Tj = VADD(T3, Ti);
                         TD = VFNMS(LDK(KP250000000), Ti, T3);
                         T1E = VMUL(LDK(KP951056516), VFMA(LDK(KP618033988), T1D, T1C));
                         T1G = VMUL(LDK(KP951056516), VFNMS(LDK(KP618033988), T1C, T1D));
                         TK = VFNMS(LDK(KP250000000), TB, Tm);
                         TC = VADD(Tm, TB);
                         T1j = VFNMS(LDK(KP250000000), T1i, T13);
                         ST(&(xo[0]), VADD(T1r, T1y), ovs, &(xo[0]));
                         T1z = VFNMS(LDK(KP250000000), T1y, T1r);
                         ST(&(xo[WS(os, 10)]), VADD(T13, T1i), ovs, &(xo[0]));
                         {
                              V T1p, T1l, T1o, T1q, T1F, T1B;
                              TU = VFNMS(LDK(KP618033988), TG, TH);
                              TI = VFMA(LDK(KP618033988), TH, TG);
                              TP = VFMA(LDK(KP618033988), TO, TN);
                              TX = VFNMS(LDK(KP618033988), TN, TO);
                              ST(&(xo[WS(os, 15)]), VFMAI(TC, Tj), ovs, &(xo[WS(os, 1)]));
                              ST(&(xo[WS(os, 5)]), VFNMSI(TC, Tj), ovs, &(xo[WS(os, 1)]));
                              T1p = VFMA(LDK(KP559016994), T1k, T1j);
                              T1l = VFNMS(LDK(KP559016994), T1k, T1j);
                              T1o = VMUL(LDK(KP951056516), VFNMS(LDK(KP618033988), T1n, T1m));
                              T1q = VMUL(LDK(KP951056516), VFMA(LDK(KP618033988), T1m, T1n));
                              T1F = VFNMS(LDK(KP559016994), T1A, T1z);
                              T1B = VFMA(LDK(KP559016994), T1A, T1z);
                              ST(&(xo[WS(os, 14)]), VFMAI(T1q, T1p), ovs, &(xo[0]));
                              ST(&(xo[WS(os, 6)]), VFNMSI(T1q, T1p), ovs, &(xo[0]));
                              ST(&(xo[WS(os, 18)]), VFNMSI(T1o, T1l), ovs, &(xo[0]));
                              ST(&(xo[WS(os, 2)]), VFMAI(T1o, T1l), ovs, &(xo[0]));
                              ST(&(xo[WS(os, 16)]), VFNMSI(T1E, T1B), ovs, &(xo[0]));
                              ST(&(xo[WS(os, 4)]), VFMAI(T1E, T1B), ovs, &(xo[0]));
                              ST(&(xo[WS(os, 12)]), VFMAI(T1G, T1F), ovs, &(xo[0]));
                              ST(&(xo[WS(os, 8)]), VFNMSI(T1G, T1F), ovs, &(xo[0]));
                              TM = VFNMS(LDK(KP559016994), TL, TK);
                              TW = VFMA(LDK(KP559016994), TL, TK);
                              TT = VFNMS(LDK(KP559016994), TE, TD);
                              TF = VFMA(LDK(KP559016994), TE, TD);
                         }
                    }
               }
               {
                    V T10, TY, TQ, TS, TJ, TR, TZ, TV;
                    T10 = VFMA(LDK(KP951056516), TX, TW);
                    TY = VFNMS(LDK(KP951056516), TX, TW);
                    TQ = VFMA(LDK(KP951056516), TP, TM);
                    TS = VFNMS(LDK(KP951056516), TP, TM);
                    TJ = VFMA(LDK(KP951056516), TI, TF);
                    TR = VFNMS(LDK(KP951056516), TI, TF);
                    TZ = VFMA(LDK(KP951056516), TU, TT);
                    TV = VFNMS(LDK(KP951056516), TU, TT);
                    ST(&(xo[WS(os, 11)]), VFMAI(TS, TR), ovs, &(xo[WS(os, 1)]));
                    ST(&(xo[WS(os, 9)]), VFNMSI(TS, TR), ovs, &(xo[WS(os, 1)]));
                    ST(&(xo[WS(os, 19)]), VFMAI(TQ, TJ), ovs, &(xo[WS(os, 1)]));
                    ST(&(xo[WS(os, 1)]), VFNMSI(TQ, TJ), ovs, &(xo[WS(os, 1)]));
                    ST(&(xo[WS(os, 3)]), VFMAI(TY, TV), ovs, &(xo[WS(os, 1)]));
                    ST(&(xo[WS(os, 17)]), VFNMSI(TY, TV), ovs, &(xo[WS(os, 1)]));
                    ST(&(xo[WS(os, 7)]), VFMAI(T10, TZ), ovs, &(xo[WS(os, 1)]));
                    ST(&(xo[WS(os, 13)]), VFNMSI(T10, TZ), ovs, &(xo[WS(os, 1)]));
               }
          }
     }
     VLEAVE();
}

static const kdft_desc desc = { 20, XSIMD_STRING("n1fv_20"), {58, 4, 46, 0}, &GENUS, 0, 0, 0, 0 };

void XSIMD(codelet_n1fv_20) (planner *p) {
     X(kdft_register) (p, n1fv_20, &desc);
}
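/*
 * The descriptor above ties the codelet to the planner: 20 is the transform
 * size, and the {58, 4, 46, 0} operation counts echo the header comment
 * (additions, multiplications, fused multiply/adds, other).  The entry point
 * XSIMD(codelet_n1fv_20) registers n1fv_20 with the planner so that it can be
 * chosen for size-20 forward complex DFTs.
 */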

#else /* HAVE_FMA */

/* Generated by: ../../../genfft/gen_notw_c.native -simd -compact -variables 4 -pipeline-latency 8 -n 20 -name n1fv_20 -include n1f.h */

/*
 * This function contains 104 FP additions, 24 FP multiplications,
 * (or, 92 additions, 12 multiplications, 12 fused multiply/add),
 * 53 stack variables, 4 constants, and 40 memory accesses
 */
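/*
 * This is the non-FMA variant of the same size-20 transform: the fused
 * multiply/add chains of the branch above are replaced by explicit VMUL and
 * VBYI (multiply-by-i) operations, which is what the different operation
 * counts reflect.
 */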
#include "n1f.h"

static void n1fv_20(const R *ri, const R *ii, R *ro, R *io, stride is, stride os, INT v, INT ivs, INT ovs)
{
     DVK(KP587785252, +0.587785252292473129168705954639072768597652438);
     DVK(KP951056516, +0.951056516295153572116439333379382143405698634);
     DVK(KP250000000, +0.250000000000000000000000000000000000000000000);
     DVK(KP559016994, +0.559016994374947424102293417182819058860154590);
     {
          INT i;
          const R *xi;
          R *xo;
          xi = ri;
          xo = ro;
          for (i = v; i > 0; i = i - VL, xi = xi + (VL * ivs), xo = xo + (VL * ovs), MAKE_VOLATILE_STRIDE(40, is), MAKE_VOLATILE_STRIDE(40, os)) {
               V T3, T1B, Tm, T1i, TG, TN, TO, TH, T13, T16, T1k, T1u, T1v, T1z, T1r;
               V T1s, T1y, T1a, T1d, T1j, Ti, TD, TB, TL, Tj, TC;
               {
                    V T1, T2, T1g, Tk, Tl, T1h;
                    T1 = LD(&(xi[0]), ivs, &(xi[0]));
                    T2 = LD(&(xi[WS(is, 10)]), ivs, &(xi[0]));
                    T1g = VADD(T1, T2);
                    Tk = LD(&(xi[WS(is, 5)]), ivs, &(xi[WS(is, 1)]));
                    Tl = LD(&(xi[WS(is, 15)]), ivs, &(xi[WS(is, 1)]));
                    T1h = VADD(Tk, Tl);
                    T3 = VSUB(T1, T2);
                    T1B = VADD(T1g, T1h);
                    Tm = VSUB(Tk, Tl);
                    T1i = VSUB(T1g, T1h);
               }
               {
                    V T6, T18, Tw, T12, Tz, T15, T9, T1b, Td, T11, Tp, T19, Ts, T1c, Tg;
                    V T14;
                    {
                         V T4, T5, Tu, Tv;
                         T4 = LD(&(xi[WS(is, 4)]), ivs, &(xi[0]));
                         T5 = LD(&(xi[WS(is, 14)]), ivs, &(xi[0]));
                         T6 = VSUB(T4, T5);
                         T18 = VADD(T4, T5);
                         Tu = LD(&(xi[WS(is, 13)]), ivs, &(xi[WS(is, 1)]));
                         Tv = LD(&(xi[WS(is, 3)]), ivs, &(xi[WS(is, 1)]));
                         Tw = VSUB(Tu, Tv);
                         T12 = VADD(Tu, Tv);
                    }
                    {
                         V Tx, Ty, T7, T8;
                         Tx = LD(&(xi[WS(is, 17)]), ivs, &(xi[WS(is, 1)]));
                         Ty = LD(&(xi[WS(is, 7)]), ivs, &(xi[WS(is, 1)]));
                         Tz = VSUB(Tx, Ty);
                         T15 = VADD(Tx, Ty);
                         T7 = LD(&(xi[WS(is, 16)]), ivs, &(xi[0]));
                         T8 = LD(&(xi[WS(is, 6)]), ivs, &(xi[0]));
                         T9 = VSUB(T7, T8);
                         T1b = VADD(T7, T8);
                    }
                    {
                         V Tb, Tc, Tn, To;
                         Tb = LD(&(xi[WS(is, 8)]), ivs, &(xi[0]));
                         Tc = LD(&(xi[WS(is, 18)]), ivs, &(xi[0]));
                         Td = VSUB(Tb, Tc);
                         T11 = VADD(Tb, Tc);
                         Tn = LD(&(xi[WS(is, 9)]), ivs, &(xi[WS(is, 1)]));
                         To = LD(&(xi[WS(is, 19)]), ivs, &(xi[WS(is, 1)]));
                         Tp = VSUB(Tn, To);
                         T19 = VADD(Tn, To);
                    }
                    {
                         V Tq, Tr, Te, Tf;
                         Tq = LD(&(xi[WS(is, 1)]), ivs, &(xi[WS(is, 1)]));
                         Tr = LD(&(xi[WS(is, 11)]), ivs, &(xi[WS(is, 1)]));
                         Ts = VSUB(Tq, Tr);
                         T1c = VADD(Tq, Tr);
                         Te = LD(&(xi[WS(is, 12)]), ivs, &(xi[0]));
                         Tf = LD(&(xi[WS(is, 2)]), ivs, &(xi[0]));
                         Tg = VSUB(Te, Tf);
                         T14 = VADD(Te, Tf);
                    }
                    TG = VSUB(Ts, Tp);
                    TN = VSUB(T6, T9);
                    TO = VSUB(Td, Tg);
                    TH = VSUB(Tz, Tw);
                    T13 = VSUB(T11, T12);
                    T16 = VSUB(T14, T15);
                    T1k = VADD(T13, T16);
                    T1u = VADD(T11, T12);
                    T1v = VADD(T14, T15);
                    T1z = VADD(T1u, T1v);
                    T1r = VADD(T18, T19);
                    T1s = VADD(T1b, T1c);
                    T1y = VADD(T1r, T1s);
                    T1a = VSUB(T18, T19);
                    T1d = VSUB(T1b, T1c);
                    T1j = VADD(T1a, T1d);
                    {
                         V Ta, Th, Tt, TA;
                         Ta = VADD(T6, T9);
                         Th = VADD(Td, Tg);
                         Ti = VADD(Ta, Th);
                         TD = VMUL(LDK(KP559016994), VSUB(Ta, Th));
                         Tt = VADD(Tp, Ts);
                         TA = VADD(Tw, Tz);
                         TB = VADD(Tt, TA);
                         TL = VMUL(LDK(KP559016994), VSUB(TA, Tt));
                    }
               }
               Tj = VADD(T3, Ti);
               TC = VBYI(VADD(Tm, TB));
               ST(&(xo[WS(os, 5)]), VSUB(Tj, TC), ovs, &(xo[WS(os, 1)]));
               ST(&(xo[WS(os, 15)]), VADD(Tj, TC), ovs, &(xo[WS(os, 1)]));
               {
                    V T1A, T1C, T1D, T1x, T1G, T1t, T1w, T1F, T1E;
                    T1A = VMUL(LDK(KP559016994), VSUB(T1y, T1z));
                    T1C = VADD(T1y, T1z);
                    T1D = VFNMS(LDK(KP250000000), T1C, T1B);
                    T1t = VSUB(T1r, T1s);
                    T1w = VSUB(T1u, T1v);
                    T1x = VBYI(VFMA(LDK(KP951056516), T1t, VMUL(LDK(KP587785252), T1w)));
                    T1G = VBYI(VFNMS(LDK(KP587785252), T1t, VMUL(LDK(KP951056516), T1w)));
                    ST(&(xo[0]), VADD(T1B, T1C), ovs, &(xo[0]));
                    T1F = VSUB(T1D, T1A);
                    ST(&(xo[WS(os, 8)]), VSUB(T1F, T1G), ovs, &(xo[0]));
                    ST(&(xo[WS(os, 12)]), VADD(T1G, T1F), ovs, &(xo[0]));
                    T1E = VADD(T1A, T1D);
                    ST(&(xo[WS(os, 4)]), VADD(T1x, T1E), ovs, &(xo[0]));
                    ST(&(xo[WS(os, 16)]), VSUB(T1E, T1x), ovs, &(xo[0]));
               }
               {
                    V T1n, T1l, T1m, T1f, T1q, T17, T1e, T1p, T1o;
                    T1n = VMUL(LDK(KP559016994), VSUB(T1j, T1k));
                    T1l = VADD(T1j, T1k);
                    T1m = VFNMS(LDK(KP250000000), T1l, T1i);
                    T17 = VSUB(T13, T16);
                    T1e = VSUB(T1a, T1d);
                    T1f = VBYI(VFNMS(LDK(KP587785252), T1e, VMUL(LDK(KP951056516), T17)));
                    T1q = VBYI(VFMA(LDK(KP951056516), T1e, VMUL(LDK(KP587785252), T17)));
                    ST(&(xo[WS(os, 10)]), VADD(T1i, T1l), ovs, &(xo[0]));
                    T1p = VADD(T1n, T1m);
                    ST(&(xo[WS(os, 6)]), VSUB(T1p, T1q), ovs, &(xo[0]));
                    ST(&(xo[WS(os, 14)]), VADD(T1q, T1p), ovs, &(xo[0]));
                    T1o = VSUB(T1m, T1n);
                    ST(&(xo[WS(os, 2)]), VADD(T1f, T1o), ovs, &(xo[0]));
                    ST(&(xo[WS(os, 18)]), VSUB(T1o, T1f), ovs, &(xo[0]));
               }
               {
                    V TI, TP, TX, TU, TM, TW, TF, TT, TK, TE;
                    TI = VFMA(LDK(KP951056516), TG, VMUL(LDK(KP587785252), TH));
                    TP = VFMA(LDK(KP951056516), TN, VMUL(LDK(KP587785252), TO));
                    TX = VFNMS(LDK(KP587785252), TN, VMUL(LDK(KP951056516), TO));
                    TU = VFNMS(LDK(KP587785252), TG, VMUL(LDK(KP951056516), TH));
                    TK = VFMS(LDK(KP250000000), TB, Tm);
                    TM = VADD(TK, TL);
                    TW = VSUB(TL, TK);
                    TE = VFNMS(LDK(KP250000000), Ti, T3);
                    TF = VADD(TD, TE);
                    TT = VSUB(TE, TD);
                    {
                         V TJ, TQ, TZ, T10;
                         TJ = VADD(TF, TI);
                         TQ = VBYI(VSUB(TM, TP));
                         ST(&(xo[WS(os, 19)]), VSUB(TJ, TQ), ovs, &(xo[WS(os, 1)]));
                         ST(&(xo[WS(os, 1)]), VADD(TJ, TQ), ovs, &(xo[WS(os, 1)]));
                         TZ = VADD(TT, TU);
                         T10 = VBYI(VADD(TX, TW));
                         ST(&(xo[WS(os, 13)]), VSUB(TZ, T10), ovs, &(xo[WS(os, 1)]));
                         ST(&(xo[WS(os, 7)]), VADD(TZ, T10), ovs, &(xo[WS(os, 1)]));
                    }
                    {
                         V TR, TS, TV, TY;
                         TR = VSUB(TF, TI);
                         TS = VBYI(VADD(TP, TM));
                         ST(&(xo[WS(os, 11)]), VSUB(TR, TS), ovs, &(xo[WS(os, 1)]));
                         ST(&(xo[WS(os, 9)]), VADD(TR, TS), ovs, &(xo[WS(os, 1)]));
                         TV = VSUB(TT, TU);
                         TY = VBYI(VSUB(TW, TX));
                         ST(&(xo[WS(os, 17)]), VSUB(TV, TY), ovs, &(xo[WS(os, 1)]));
                         ST(&(xo[WS(os, 3)]), VADD(TV, TY), ovs, &(xo[WS(os, 1)]));
                    }
               }
          }
     }
     VLEAVE();
}

static const kdft_desc desc = { 20, XSIMD_STRING("n1fv_20"), {92, 12, 12, 0}, &GENUS, 0, 0, 0, 0 };

void XSIMD(codelet_n1fv_20) (planner *p) {
     X(kdft_register) (p, n1fv_20, &desc);
}

#endif /* HAVE_FMA */
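
For context, a minimal sketch of how a size-20 transform is requested through FFTW's public single-precision API; fftw3.h and the fftwf_* calls are the standard interface, but whether the planner actually selects the n1fv_20 codelet depends on how the library was built (SIMD support, HAVE_FMA) and on planning-time heuristics, so this is illustrative only:

#include <fftw3.h>
#include <stdio.h>

int main(void)
{
    /* 20-point forward complex DFT; the planner may pick a SIMD codelet such as n1fv_20 */
    fftwf_complex *in = fftwf_malloc(sizeof(fftwf_complex) * 20);
    fftwf_complex *out = fftwf_malloc(sizeof(fftwf_complex) * 20);
    fftwf_plan p = fftwf_plan_dft_1d(20, in, out, FFTW_FORWARD, FFTW_ESTIMATE);

    for (int k = 0; k < 20; ++k) {   /* simple test input */
        in[k][0] = (float) k;        /* real part */
        in[k][1] = 0.0f;             /* imaginary part */
    }

    fftwf_execute(p);
    printf("out[0] = %g%+gi\n", out[0][0], out[0][1]);

    fftwf_destroy_plan(p);
    fftwf_free(in);
    fftwf_free(out);
    return 0;
}

Link against the single-precision library, e.g. cc example.c -lfftw3f -lm.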