annotate src/fftw-3.3.5/dft/simd/common/n1bv_20.c @ 83:ae30d91d2ffe

Replace these with versions built using an older toolset (so as to avoid ABI incompatibilities when linking on Ubuntu 14.04 for packaging purposes)
author Chris Cannam
date Fri, 07 Feb 2020 11:51:13 +0000
parents 2cd0e3b3e1fd
children
rev   line source
Chris@42 1 /*
Chris@42 2 * Copyright (c) 2003, 2007-14 Matteo Frigo
Chris@42 3 * Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
Chris@42 4 *
Chris@42 5 * This program is free software; you can redistribute it and/or modify
Chris@42 6 * it under the terms of the GNU General Public License as published by
Chris@42 7 * the Free Software Foundation; either version 2 of the License, or
Chris@42 8 * (at your option) any later version.
Chris@42 9 *
Chris@42 10 * This program is distributed in the hope that it will be useful,
Chris@42 11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
Chris@42 12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
Chris@42 13 * GNU General Public License for more details.
Chris@42 14 *
Chris@42 15 * You should have received a copy of the GNU General Public License
Chris@42 16 * along with this program; if not, write to the Free Software
Chris@42 17 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
Chris@42 18 *
Chris@42 19 */
Chris@42 20
Chris@42 21 /* This file was automatically generated --- DO NOT EDIT */
Chris@42 22 /* Generated on Sat Jul 30 16:39:46 EDT 2016 */
Chris@42 23
Chris@42 24 #include "codelet-dft.h"
Chris@42 25
Chris@42 26 #ifdef HAVE_FMA
Chris@42 27
Chris@42 28 /* Generated by: ../../../genfft/gen_notw_c.native -fma -reorder-insns -schedule-for-pipeline -simd -compact -variables 4 -pipeline-latency 8 -sign 1 -n 20 -name n1bv_20 -include n1b.h */
Chris@42 29
Chris@42 30 /*
Chris@42 31 * This function contains 104 FP additions, 50 FP multiplications,
Chris@42 32 * (or, 58 additions, 4 multiplications, 46 fused multiply/add),
Chris@42 33 * 71 stack variables, 4 constants, and 40 memory accesses
Chris@42 34 */
Chris@42 35 #include "n1b.h"
Chris@42 36
/*
 * n1bv_20 (HAVE_FMA variant): genfft-generated SIMD codelet.  Per the
 * generator command line recorded in this file it was produced with
 * "-n 20 -sign 1".
 *
 * Only `ii` (input) and `io` (output) are used in the body; `ri` and `ro`
 * are accepted but never referenced.  Each loop iteration loads the 20
 * inputs xi[WS(is, 0..19)], combines them, and stores the 20 outputs
 * xo[WS(os, 0..19)].
 *
 *   is, os   - strides handed to WS() to locate element k of input/output
 *   v        - loop count; reduced by VL on every iteration
 *   ivs, ovs - xi / xo advance by VL * ivs / VL * ovs per iteration
 *
 * NOTE(review): LD, ST, LDK, DVK, VADD, VSUB, VMUL, VFMA, VFNMS, VFMAI,
 * VFNMSI, MAKE_VOLATILE_STRIDE and VLEAVE are macros defined outside this
 * file.  Presumably VFMA(a, b, c) = a*b + c, VFNMS(a, b, c) = c - a*b,
 * VFMAI(b, c) = c + i*b and VFNMSI(b, c) = c - i*b -- confirm against the
 * SIMD headers.
 */
Chris@42 37 static void n1bv_20(const R *ri, const R *ii, R *ro, R *io, stride is, stride os, INT v, INT ivs, INT ovs)
Chris@42 38 {
/* Numerically: sqrt(5)/4, (sqrt(5)-1)/2, sin(2*pi/5) and 1/4. */
Chris@42 39 DVK(KP559016994, +0.559016994374947424102293417182819058860154590);
Chris@42 40 DVK(KP618033988, +0.618033988749894848204586834365638117720309180);
Chris@42 41 DVK(KP951056516, +0.951056516295153572116439333379382143405698634);
Chris@42 42 DVK(KP250000000, +0.250000000000000000000000000000000000000000000);
Chris@42 43 {
Chris@42 44 INT i;
Chris@42 45 const R *xi;
Chris@42 46 R *xo;
/* Work exclusively through the ii / io pointers. */
Chris@42 47 xi = ii;
Chris@42 48 xo = io;
/* One pass per VL units of v; xi and xo are stepped by VL * ivs and VL * ovs. */
Chris@42 49 for (i = v; i > 0; i = i - VL, xi = xi + (VL * ivs), xo = xo + (VL * ovs), MAKE_VOLATILE_STRIDE(40, is), MAKE_VOLATILE_STRIDE(40, os)) {
/* Values that must survive until the final odd-output block below. */
Chris@42 50 V TS, TA, TN, TV, TK, TU, TR, Tl;
Chris@42 51 {
Chris@42 52 V T3, TE, T1r, T13, Ta, TL, Tz, TG, Ts, TF, Th, TM, T1u, T1C, T1n;
Chris@42 53 V T1a, T1m, T1h, T1x, T1D, Tk, Ti;
Chris@42 54 {
Chris@42 55 V T1, T2, TC, TD;
/*
 * Inputs are loaded in pairs that sit 10 elements apart (k and k + 10
 * mod 20); each pair is immediately turned into a sum and a difference.
 */
Chris@42 56 T1 = LD(&(xi[0]), ivs, &(xi[0]));
Chris@42 57 T2 = LD(&(xi[WS(is, 10)]), ivs, &(xi[0]));
Chris@42 58 TC = LD(&(xi[WS(is, 5)]), ivs, &(xi[WS(is, 1)]));
Chris@42 59 TD = LD(&(xi[WS(is, 15)]), ivs, &(xi[WS(is, 1)]));
Chris@42 60 {
Chris@42 61 V T14, T6, T1c, Tv, Tm, T1f, Ty, T17, T9, Tn, Tp, T1b, Td, Tq, Te;
Chris@42 62 V Tf, T15, To;
Chris@42 63 {
Chris@42 64 V Tw, Tx, T7, T8, Tb, Tc;
Chris@42 65 {
Chris@42 66 V T4, T5, Tt, Tu, T11, T12;
Chris@42 67 T4 = LD(&(xi[WS(is, 4)]), ivs, &(xi[0]));
Chris@42 68 T5 = LD(&(xi[WS(is, 14)]), ivs, &(xi[0]));
Chris@42 69 Tt = LD(&(xi[WS(is, 13)]), ivs, &(xi[WS(is, 1)]));
Chris@42 70 Tu = LD(&(xi[WS(is, 3)]), ivs, &(xi[WS(is, 1)]));
Chris@42 71 Tw = LD(&(xi[WS(is, 17)]), ivs, &(xi[WS(is, 1)]));
Chris@42 72 T3 = VSUB(T1, T2);
Chris@42 73 T11 = VADD(T1, T2);
Chris@42 74 TE = VSUB(TC, TD);
Chris@42 75 T12 = VADD(TC, TD);
Chris@42 76 T14 = VADD(T4, T5);
Chris@42 77 T6 = VSUB(T4, T5);
Chris@42 78 T1c = VADD(Tt, Tu);
Chris@42 79 Tv = VSUB(Tt, Tu);
Chris@42 80 Tx = LD(&(xi[WS(is, 7)]), ivs, &(xi[WS(is, 1)]));
Chris@42 81 T7 = LD(&(xi[WS(is, 16)]), ivs, &(xi[0]));
Chris@42 82 T8 = LD(&(xi[WS(is, 6)]), ivs, &(xi[0]));
Chris@42 83 T1r = VADD(T11, T12);
Chris@42 84 T13 = VSUB(T11, T12);
Chris@42 85 }
Chris@42 86 Tb = LD(&(xi[WS(is, 8)]), ivs, &(xi[0]));
Chris@42 87 Tc = LD(&(xi[WS(is, 18)]), ivs, &(xi[0]));
Chris@42 88 Tm = LD(&(xi[WS(is, 9)]), ivs, &(xi[WS(is, 1)]));
Chris@42 89 T1f = VADD(Tw, Tx);
Chris@42 90 Ty = VSUB(Tw, Tx);
Chris@42 91 T17 = VADD(T7, T8);
Chris@42 92 T9 = VSUB(T7, T8);
Chris@42 93 Tn = LD(&(xi[WS(is, 19)]), ivs, &(xi[WS(is, 1)]));
Chris@42 94 Tp = LD(&(xi[WS(is, 1)]), ivs, &(xi[WS(is, 1)]));
Chris@42 95 T1b = VADD(Tb, Tc);
Chris@42 96 Td = VSUB(Tb, Tc);
Chris@42 97 Tq = LD(&(xi[WS(is, 11)]), ivs, &(xi[WS(is, 1)]));
Chris@42 98 Te = LD(&(xi[WS(is, 12)]), ivs, &(xi[0]));
Chris@42 99 Tf = LD(&(xi[WS(is, 2)]), ivs, &(xi[0]));
Chris@42 100 }
Chris@42 101 Ta = VADD(T6, T9);
Chris@42 102 TL = VSUB(T6, T9);
Chris@42 103 T15 = VADD(Tm, Tn);
Chris@42 104 To = VSUB(Tm, Tn);
Chris@42 105 Tz = VSUB(Tv, Ty);
Chris@42 106 TG = VADD(Tv, Ty);
Chris@42 107 {
Chris@42 108 V T1d, T1v, T18, Tr, T1e, Tg, T16, T1s;
Chris@42 109 T1d = VSUB(T1b, T1c);
Chris@42 110 T1v = VADD(T1b, T1c);
Chris@42 111 T18 = VADD(Tp, Tq);
Chris@42 112 Tr = VSUB(Tp, Tq);
Chris@42 113 T1e = VADD(Te, Tf);
Chris@42 114 Tg = VSUB(Te, Tf);
Chris@42 115 T16 = VSUB(T14, T15);
Chris@42 116 T1s = VADD(T14, T15);
Chris@42 117 {
Chris@42 118 V T1t, T19, T1w, T1g;
Chris@42 119 T1t = VADD(T17, T18);
Chris@42 120 T19 = VSUB(T17, T18);
Chris@42 121 Ts = VSUB(To, Tr);
Chris@42 122 TF = VADD(To, Tr);
Chris@42 123 T1w = VADD(T1e, T1f);
Chris@42 124 T1g = VSUB(T1e, T1f);
Chris@42 125 Th = VADD(Td, Tg);
Chris@42 126 TM = VSUB(Td, Tg);
Chris@42 127 T1u = VADD(T1s, T1t);
Chris@42 128 T1C = VSUB(T1s, T1t);
Chris@42 129 T1n = VSUB(T16, T19);
Chris@42 130 T1a = VADD(T16, T19);
Chris@42 131 T1m = VSUB(T1d, T1g);
Chris@42 132 T1h = VADD(T1d, T1g);
Chris@42 133 T1x = VADD(T1v, T1w);
Chris@42 134 T1D = VSUB(T1v, T1w);
Chris@42 135 }
Chris@42 136 }
Chris@42 137 }
Chris@42 138 }
Chris@42 139 Tk = VSUB(Ta, Th);
Chris@42 140 Ti = VADD(Ta, Th);
Chris@42 141 {
Chris@42 142 V TJ, T1k, T1A, TZ, Tj, T1E, T1G, TI, T10, T1j, T1z, T1i, T1y, TH;
Chris@42 143 TJ = VSUB(TF, TG);
Chris@42 144 TH = VADD(TF, TG);
Chris@42 145 T1i = VADD(T1a, T1h);
Chris@42 146 T1k = VSUB(T1a, T1h);
Chris@42 147 T1y = VADD(T1u, T1x);
Chris@42 148 T1A = VSUB(T1u, T1x);
Chris@42 149 TZ = VADD(T3, Ti);
Chris@42 150 Tj = VFNMS(LDK(KP250000000), Ti, T3);
Chris@42 151 T1E = VMUL(LDK(KP951056516), VFMA(LDK(KP618033988), T1D, T1C));
Chris@42 152 T1G = VMUL(LDK(KP951056516), VFNMS(LDK(KP618033988), T1C, T1D));
Chris@42 153 TI = VFNMS(LDK(KP250000000), TH, TE);
Chris@42 154 T10 = VADD(TE, TH);
Chris@42 155 T1j = VFNMS(LDK(KP250000000), T1i, T13);
/* Even-indexed outputs are assembled solely from the pairwise sums. */
Chris@42 156 ST(&(xo[0]), VADD(T1r, T1y), ovs, &(xo[0]));
Chris@42 157 T1z = VFNMS(LDK(KP250000000), T1y, T1r);
Chris@42 158 ST(&(xo[WS(os, 10)]), VADD(T13, T1i), ovs, &(xo[0]));
Chris@42 159 {
Chris@42 160 V T1p, T1l, T1o, T1q, T1F, T1B;
/* TS, TA, TN, TV are kept for the odd-output block after this scope. */
Chris@42 161 TS = VFNMS(LDK(KP618033988), Ts, Tz);
Chris@42 162 TA = VFMA(LDK(KP618033988), Tz, Ts);
Chris@42 163 TN = VFMA(LDK(KP618033988), TM, TL);
Chris@42 164 TV = VFNMS(LDK(KP618033988), TL, TM);
/* Outputs 5 and 15 combine TZ and T10, both built from the pairwise differences. */
Chris@42 165 ST(&(xo[WS(os, 5)]), VFMAI(T10, TZ), ovs, &(xo[WS(os, 1)]));
Chris@42 166 ST(&(xo[WS(os, 15)]), VFNMSI(T10, TZ), ovs, &(xo[WS(os, 1)]));
Chris@42 167 T1p = VFMA(LDK(KP559016994), T1k, T1j);
Chris@42 168 T1l = VFNMS(LDK(KP559016994), T1k, T1j);
Chris@42 169 T1o = VMUL(LDK(KP951056516), VFNMS(LDK(KP618033988), T1n, T1m));
Chris@42 170 T1q = VMUL(LDK(KP951056516), VFMA(LDK(KP618033988), T1m, T1n));
Chris@42 171 T1F = VFNMS(LDK(KP559016994), T1A, T1z);
Chris@42 172 T1B = VFMA(LDK(KP559016994), T1A, T1z);
/* Remaining even outputs, written as VFMAI / VFNMSI pairs (k and 20 - k). */
Chris@42 173 ST(&(xo[WS(os, 14)]), VFNMSI(T1q, T1p), ovs, &(xo[0]));
Chris@42 174 ST(&(xo[WS(os, 6)]), VFMAI(T1q, T1p), ovs, &(xo[0]));
Chris@42 175 ST(&(xo[WS(os, 18)]), VFMAI(T1o, T1l), ovs, &(xo[0]));
Chris@42 176 ST(&(xo[WS(os, 2)]), VFNMSI(T1o, T1l), ovs, &(xo[0]));
Chris@42 177 ST(&(xo[WS(os, 16)]), VFMAI(T1E, T1B), ovs, &(xo[0]));
Chris@42 178 ST(&(xo[WS(os, 4)]), VFNMSI(T1E, T1B), ovs, &(xo[0]));
Chris@42 179 ST(&(xo[WS(os, 12)]), VFNMSI(T1G, T1F), ovs, &(xo[0]));
Chris@42 180 ST(&(xo[WS(os, 8)]), VFMAI(T1G, T1F), ovs, &(xo[0]));
Chris@42 181 TK = VFMA(LDK(KP559016994), TJ, TI);
Chris@42 182 TU = VFNMS(LDK(KP559016994), TJ, TI);
Chris@42 183 TR = VFNMS(LDK(KP559016994), Tk, Tj);
Chris@42 184 Tl = VFMA(LDK(KP559016994), Tk, Tj);
Chris@42 185 }
Chris@42 186 }
Chris@42 187 }
Chris@42 188 {
/* Remaining odd outputs (1, 3, 7, 9, 11, 13, 17, 19), from the pairwise differences. */
Chris@42 189 V TY, TW, TO, TQ, TB, TP, TX, TT;
Chris@42 190 TY = VFMA(LDK(KP951056516), TV, TU);
Chris@42 191 TW = VFNMS(LDK(KP951056516), TV, TU);
Chris@42 192 TO = VFMA(LDK(KP951056516), TN, TK);
Chris@42 193 TQ = VFNMS(LDK(KP951056516), TN, TK);
Chris@42 194 TB = VFNMS(LDK(KP951056516), TA, Tl);
Chris@42 195 TP = VFMA(LDK(KP951056516), TA, Tl);
Chris@42 196 TX = VFNMS(LDK(KP951056516), TS, TR);
Chris@42 197 TT = VFMA(LDK(KP951056516), TS, TR);
Chris@42 198 ST(&(xo[WS(os, 9)]), VFMAI(TQ, TP), ovs, &(xo[WS(os, 1)]));
Chris@42 199 ST(&(xo[WS(os, 11)]), VFNMSI(TQ, TP), ovs, &(xo[WS(os, 1)]));
Chris@42 200 ST(&(xo[WS(os, 1)]), VFMAI(TO, TB), ovs, &(xo[WS(os, 1)]));
Chris@42 201 ST(&(xo[WS(os, 19)]), VFNMSI(TO, TB), ovs, &(xo[WS(os, 1)]));
Chris@42 202 ST(&(xo[WS(os, 17)]), VFMAI(TW, TT), ovs, &(xo[WS(os, 1)]));
Chris@42 203 ST(&(xo[WS(os, 3)]), VFNMSI(TW, TT), ovs, &(xo[WS(os, 1)]));
Chris@42 204 ST(&(xo[WS(os, 13)]), VFMAI(TY, TX), ovs, &(xo[WS(os, 1)]));
Chris@42 205 ST(&(xo[WS(os, 7)]), VFNMSI(TY, TX), ovs, &(xo[WS(os, 1)]));
Chris@42 206 }
Chris@42 207 }
Chris@42 208 }
Chris@42 209 VLEAVE();
Chris@42 210 }
Chris@42 211
/*
 * Descriptor passed to X(kdft_register) together with n1bv_20.  It carries
 * the value 20, the codelet's name string, and the tuple {58, 4, 46, 0},
 * whose first three entries equal the operation counts quoted in the
 * generator comment for this variant (58 additions, 4 multiplications,
 * 46 fused multiply/add).
 */
Chris@42 212 static const kdft_desc desc = { 20, XSIMD_STRING("n1bv_20"), {58, 4, 46, 0}, &GENUS, 0, 0, 0, 0 };
Chris@42 213
/*
 * Registration entry point for the HAVE_FMA build: hands the n1bv_20 kernel
 * and its descriptor to X(kdft_register) for planner `p`.
 */
Chris@42 214 void XSIMD(codelet_n1bv_20) (planner *p) {
Chris@42 215 X(kdft_register) (p, n1bv_20, &desc);
Chris@42 216 }
Chris@42 217
Chris@42 218 #else /* HAVE_FMA */
Chris@42 219
Chris@42 220 /* Generated by: ../../../genfft/gen_notw_c.native -simd -compact -variables 4 -pipeline-latency 8 -sign 1 -n 20 -name n1bv_20 -include n1b.h */
Chris@42 221
Chris@42 222 /*
Chris@42 223 * This function contains 104 FP additions, 24 FP multiplications,
Chris@42 224 * (or, 92 additions, 12 multiplications, 12 fused multiply/add),
Chris@42 225 * 53 stack variables, 4 constants, and 40 memory accesses
Chris@42 226 */
Chris@42 227 #include "n1b.h"
Chris@42 228
/*
 * n1bv_20 (variant compiled when HAVE_FMA is not defined): genfft-generated
 * SIMD codelet.  Per the generator command line recorded in this file it was
 * produced with "-n 20 -sign 1".
 *
 * Only `ii` (input) and `io` (output) are used in the body; `ri` and `ro`
 * are accepted but never referenced.  Each loop iteration loads the 20
 * inputs xi[WS(is, 0..19)], combines them, and stores the 20 outputs
 * xo[WS(os, 0..19)].
 *
 *   is, os   - strides handed to WS() to locate element k of input/output
 *   v        - loop count; reduced by VL on every iteration
 *   ivs, ovs - xi / xo advance by VL * ivs / VL * ovs per iteration
 *
 * NOTE(review): LD, ST, LDK, DVK, VADD, VSUB, VMUL, VFMA, VFNMS, VBYI,
 * MAKE_VOLATILE_STRIDE and VLEAVE are macros defined outside this file.
 * Presumably VFMA(a, b, c) = a*b + c, VFNMS(a, b, c) = c - a*b and VBYI(x)
 * multiplies x by i -- confirm against the SIMD headers.
 */
Chris@42 229 static void n1bv_20(const R *ri, const R *ii, R *ro, R *io, stride is, stride os, INT v, INT ivs, INT ovs)
Chris@42 230 {
/* Numerically: sin(pi/5), sin(2*pi/5), 1/4 and sqrt(5)/4. */
Chris@42 231 DVK(KP587785252, +0.587785252292473129168705954639072768597652438);
Chris@42 232 DVK(KP951056516, +0.951056516295153572116439333379382143405698634);
Chris@42 233 DVK(KP250000000, +0.250000000000000000000000000000000000000000000);
Chris@42 234 DVK(KP559016994, +0.559016994374947424102293417182819058860154590);
Chris@42 235 {
Chris@42 236 INT i;
Chris@42 237 const R *xi;
Chris@42 238 R *xo;
/* Work exclusively through the ii / io pointers. */
Chris@42 239 xi = ii;
Chris@42 240 xo = io;
/* One pass per VL units of v; xi and xo are stepped by VL * ivs and VL * ovs. */
Chris@42 241 for (i = v; i > 0; i = i - VL, xi = xi + (VL * ivs), xo = xo + (VL * ovs), MAKE_VOLATILE_STRIDE(40, is), MAKE_VOLATILE_STRIDE(40, os)) {
Chris@42 242 V T3, T1y, TH, T1i, Ts, TL, TM, Tz, T13, T16, T1j, T1u, T1v, T1w, T1r;
Chris@42 243 V T1s, T1t, T1a, T1d, T1k, Ti, Tk, TE, TI, TZ, T10;
Chris@42 244 {
/* Inputs 0, 10, 5, 15: sums feed T1y / T1i, differences give T3 and TH. */
Chris@42 245 V T1, T2, T1g, TF, TG, T1h;
Chris@42 246 T1 = LD(&(xi[0]), ivs, &(xi[0]));
Chris@42 247 T2 = LD(&(xi[WS(is, 10)]), ivs, &(xi[0]));
Chris@42 248 T1g = VADD(T1, T2);
Chris@42 249 TF = LD(&(xi[WS(is, 5)]), ivs, &(xi[WS(is, 1)]));
Chris@42 250 TG = LD(&(xi[WS(is, 15)]), ivs, &(xi[WS(is, 1)]));
Chris@42 251 T1h = VADD(TF, TG);
Chris@42 252 T3 = VSUB(T1, T2);
Chris@42 253 T1y = VADD(T1g, T1h);
Chris@42 254 TH = VSUB(TF, TG);
Chris@42 255 T1i = VSUB(T1g, T1h);
Chris@42 256 }
Chris@42 257 {
/*
 * The other 16 inputs are loaded in pairs 10 elements apart (k and
 * k + 10 mod 20); every pair yields one difference and one sum.
 */
Chris@42 258 V T6, T11, Tv, T19, Ty, T1c, T9, T14, Td, T18, To, T12, Tr, T15, Tg;
Chris@42 259 V T1b;
Chris@42 260 {
Chris@42 261 V T4, T5, Tt, Tu;
Chris@42 262 T4 = LD(&(xi[WS(is, 4)]), ivs, &(xi[0]));
Chris@42 263 T5 = LD(&(xi[WS(is, 14)]), ivs, &(xi[0]));
Chris@42 264 T6 = VSUB(T4, T5);
Chris@42 265 T11 = VADD(T4, T5);
Chris@42 266 Tt = LD(&(xi[WS(is, 13)]), ivs, &(xi[WS(is, 1)]));
Chris@42 267 Tu = LD(&(xi[WS(is, 3)]), ivs, &(xi[WS(is, 1)]));
Chris@42 268 Tv = VSUB(Tt, Tu);
Chris@42 269 T19 = VADD(Tt, Tu);
Chris@42 270 }
Chris@42 271 {
Chris@42 272 V Tw, Tx, T7, T8;
Chris@42 273 Tw = LD(&(xi[WS(is, 17)]), ivs, &(xi[WS(is, 1)]));
Chris@42 274 Tx = LD(&(xi[WS(is, 7)]), ivs, &(xi[WS(is, 1)]));
Chris@42 275 Ty = VSUB(Tw, Tx);
Chris@42 276 T1c = VADD(Tw, Tx);
Chris@42 277 T7 = LD(&(xi[WS(is, 16)]), ivs, &(xi[0]));
Chris@42 278 T8 = LD(&(xi[WS(is, 6)]), ivs, &(xi[0]));
Chris@42 279 T9 = VSUB(T7, T8);
Chris@42 280 T14 = VADD(T7, T8);
Chris@42 281 }
Chris@42 282 {
Chris@42 283 V Tb, Tc, Tm, Tn;
Chris@42 284 Tb = LD(&(xi[WS(is, 8)]), ivs, &(xi[0]));
Chris@42 285 Tc = LD(&(xi[WS(is, 18)]), ivs, &(xi[0]));
Chris@42 286 Td = VSUB(Tb, Tc);
Chris@42 287 T18 = VADD(Tb, Tc);
Chris@42 288 Tm = LD(&(xi[WS(is, 9)]), ivs, &(xi[WS(is, 1)]));
Chris@42 289 Tn = LD(&(xi[WS(is, 19)]), ivs, &(xi[WS(is, 1)]));
Chris@42 290 To = VSUB(Tm, Tn);
Chris@42 291 T12 = VADD(Tm, Tn);
Chris@42 292 }
Chris@42 293 {
Chris@42 294 V Tp, Tq, Te, Tf;
Chris@42 295 Tp = LD(&(xi[WS(is, 1)]), ivs, &(xi[WS(is, 1)]));
Chris@42 296 Tq = LD(&(xi[WS(is, 11)]), ivs, &(xi[WS(is, 1)]));
Chris@42 297 Tr = VSUB(Tp, Tq);
Chris@42 298 T15 = VADD(Tp, Tq);
Chris@42 299 Te = LD(&(xi[WS(is, 12)]), ivs, &(xi[0]));
Chris@42 300 Tf = LD(&(xi[WS(is, 2)]), ivs, &(xi[0]));
Chris@42 301 Tg = VSUB(Te, Tf);
Chris@42 302 T1b = VADD(Te, Tf);
Chris@42 303 }
/* Second-level combinations of the pair differences and pair sums. */
Chris@42 304 Ts = VSUB(To, Tr);
Chris@42 305 TL = VSUB(T6, T9);
Chris@42 306 TM = VSUB(Td, Tg);
Chris@42 307 Tz = VSUB(Tv, Ty);
Chris@42 308 T13 = VSUB(T11, T12);
Chris@42 309 T16 = VSUB(T14, T15);
Chris@42 310 T1j = VADD(T13, T16);
Chris@42 311 T1u = VADD(T18, T19);
Chris@42 312 T1v = VADD(T1b, T1c);
Chris@42 313 T1w = VADD(T1u, T1v);
Chris@42 314 T1r = VADD(T11, T12);
Chris@42 315 T1s = VADD(T14, T15);
Chris@42 316 T1t = VADD(T1r, T1s);
Chris@42 317 T1a = VSUB(T18, T19);
Chris@42 318 T1d = VSUB(T1b, T1c);
Chris@42 319 T1k = VADD(T1a, T1d);
Chris@42 320 {
/* Tk and TE are stored already scaled by KP559016994. */
Chris@42 321 V Ta, Th, TC, TD;
Chris@42 322 Ta = VADD(T6, T9);
Chris@42 323 Th = VADD(Td, Tg);
Chris@42 324 Ti = VADD(Ta, Th);
Chris@42 325 Tk = VMUL(LDK(KP559016994), VSUB(Ta, Th));
Chris@42 326 TC = VADD(To, Tr);
Chris@42 327 TD = VADD(Tv, Ty);
Chris@42 328 TE = VMUL(LDK(KP559016994), VSUB(TC, TD));
Chris@42 329 TI = VADD(TC, TD);
Chris@42 330 }
Chris@42 331 }
/* Outputs 5 and 15: TZ plus / minus VBYI of (TH + TI). */
Chris@42 332 TZ = VADD(T3, Ti);
Chris@42 333 T10 = VBYI(VADD(TH, TI));
Chris@42 334 ST(&(xo[WS(os, 15)]), VSUB(TZ, T10), ovs, &(xo[WS(os, 1)]));
Chris@42 335 ST(&(xo[WS(os, 5)]), VADD(TZ, T10), ovs, &(xo[WS(os, 1)]));
Chris@42 336 {
/* Outputs 0, 4, 8, 12, 16, built from the T1r..T1w sums and T1y. */
Chris@42 337 V T1x, T1z, T1A, T1E, T1G, T1C, T1D, T1F, T1B;
Chris@42 338 T1x = VMUL(LDK(KP559016994), VSUB(T1t, T1w));
Chris@42 339 T1z = VADD(T1t, T1w);
Chris@42 340 T1A = VFNMS(LDK(KP250000000), T1z, T1y);
Chris@42 341 T1C = VSUB(T1r, T1s);
Chris@42 342 T1D = VSUB(T1u, T1v);
Chris@42 343 T1E = VBYI(VFMA(LDK(KP951056516), T1C, VMUL(LDK(KP587785252), T1D)));
Chris@42 344 T1G = VBYI(VFNMS(LDK(KP951056516), T1D, VMUL(LDK(KP587785252), T1C)));
Chris@42 345 ST(&(xo[0]), VADD(T1y, T1z), ovs, &(xo[0]));
Chris@42 346 T1F = VSUB(T1A, T1x);
Chris@42 347 ST(&(xo[WS(os, 8)]), VSUB(T1F, T1G), ovs, &(xo[0]));
Chris@42 348 ST(&(xo[WS(os, 12)]), VADD(T1G, T1F), ovs, &(xo[0]));
Chris@42 349 T1B = VADD(T1x, T1A);
Chris@42 350 ST(&(xo[WS(os, 4)]), VSUB(T1B, T1E), ovs, &(xo[0]));
Chris@42 351 ST(&(xo[WS(os, 16)]), VADD(T1E, T1B), ovs, &(xo[0]));
Chris@42 352 }
Chris@42 353 {
/* Outputs 2, 6, 10, 14, 18, built from T13, T16, T1a, T1d and T1i. */
Chris@42 354 V T1n, T1l, T1m, T1f, T1p, T17, T1e, T1q, T1o;
Chris@42 355 T1n = VMUL(LDK(KP559016994), VSUB(T1j, T1k));
Chris@42 356 T1l = VADD(T1j, T1k);
Chris@42 357 T1m = VFNMS(LDK(KP250000000), T1l, T1i);
Chris@42 358 T17 = VSUB(T13, T16);
Chris@42 359 T1e = VSUB(T1a, T1d);
Chris@42 360 T1f = VBYI(VFNMS(LDK(KP951056516), T1e, VMUL(LDK(KP587785252), T17)));
Chris@42 361 T1p = VBYI(VFMA(LDK(KP951056516), T17, VMUL(LDK(KP587785252), T1e)));
Chris@42 362 ST(&(xo[WS(os, 10)]), VADD(T1i, T1l), ovs, &(xo[0]));
Chris@42 363 T1q = VADD(T1n, T1m);
Chris@42 364 ST(&(xo[WS(os, 6)]), VADD(T1p, T1q), ovs, &(xo[0]));
Chris@42 365 ST(&(xo[WS(os, 14)]), VSUB(T1q, T1p), ovs, &(xo[0]));
Chris@42 366 T1o = VSUB(T1m, T1n);
Chris@42 367 ST(&(xo[WS(os, 2)]), VADD(T1f, T1o), ovs, &(xo[0]));
Chris@42 368 ST(&(xo[WS(os, 18)]), VSUB(T1o, T1f), ovs, &(xo[0]));
Chris@42 369 }
Chris@42 370 {
/* Remaining odd outputs (1, 3, 7, 9, 11, 13, 17, 19), from the pairwise differences. */
Chris@42 371 V TA, TN, TU, TS, TK, TV, Tl, TR, TJ, Tj;
Chris@42 372 TA = VFNMS(LDK(KP951056516), Tz, VMUL(LDK(KP587785252), Ts));
Chris@42 373 TN = VFNMS(LDK(KP951056516), TM, VMUL(LDK(KP587785252), TL));
Chris@42 374 TU = VFMA(LDK(KP951056516), TL, VMUL(LDK(KP587785252), TM));
Chris@42 375 TS = VFMA(LDK(KP951056516), Ts, VMUL(LDK(KP587785252), Tz));
Chris@42 376 TJ = VFNMS(LDK(KP250000000), TI, TH);
Chris@42 377 TK = VSUB(TE, TJ);
Chris@42 378 TV = VADD(TE, TJ);
Chris@42 379 Tj = VFNMS(LDK(KP250000000), Ti, T3);
Chris@42 380 Tl = VSUB(Tj, Tk);
Chris@42 381 TR = VADD(Tk, Tj);
Chris@42 382 {
Chris@42 383 V TB, TO, TX, TY;
Chris@42 384 TB = VSUB(Tl, TA);
Chris@42 385 TO = VBYI(VSUB(TK, TN));
Chris@42 386 ST(&(xo[WS(os, 17)]), VSUB(TB, TO), ovs, &(xo[WS(os, 1)]));
Chris@42 387 ST(&(xo[WS(os, 3)]), VADD(TB, TO), ovs, &(xo[WS(os, 1)]));
Chris@42 388 TX = VADD(TR, TS);
Chris@42 389 TY = VBYI(VSUB(TV, TU));
Chris@42 390 ST(&(xo[WS(os, 11)]), VSUB(TX, TY), ovs, &(xo[WS(os, 1)]));
Chris@42 391 ST(&(xo[WS(os, 9)]), VADD(TX, TY), ovs, &(xo[WS(os, 1)]));
Chris@42 392 }
Chris@42 393 {
Chris@42 394 V TP, TQ, TT, TW;
Chris@42 395 TP = VADD(Tl, TA);
Chris@42 396 TQ = VBYI(VADD(TN, TK));
Chris@42 397 ST(&(xo[WS(os, 13)]), VSUB(TP, TQ), ovs, &(xo[WS(os, 1)]));
Chris@42 398 ST(&(xo[WS(os, 7)]), VADD(TP, TQ), ovs, &(xo[WS(os, 1)]));
Chris@42 399 TT = VSUB(TR, TS);
Chris@42 400 TW = VBYI(VADD(TU, TV));
Chris@42 401 ST(&(xo[WS(os, 19)]), VSUB(TT, TW), ovs, &(xo[WS(os, 1)]));
Chris@42 402 ST(&(xo[WS(os, 1)]), VADD(TT, TW), ovs, &(xo[WS(os, 1)]));
Chris@42 403 }
Chris@42 404 }
Chris@42 405 }
Chris@42 406 }
Chris@42 407 VLEAVE();
Chris@42 408 }
Chris@42 409
/*
 * Descriptor passed to X(kdft_register) together with n1bv_20.  It carries
 * the value 20, the codelet's name string, and the tuple {92, 12, 12, 0},
 * whose first three entries equal the operation counts quoted in the
 * generator comment for this variant (92 additions, 12 multiplications,
 * 12 fused multiply/add).
 */
Chris@42 410 static const kdft_desc desc = { 20, XSIMD_STRING("n1bv_20"), {92, 12, 12, 0}, &GENUS, 0, 0, 0, 0 };
Chris@42 411
/*
 * Registration entry point for the build without HAVE_FMA: hands the
 * n1bv_20 kernel and its descriptor to X(kdft_register) for planner `p`.
 */
Chris@42 412 void XSIMD(codelet_n1bv_20) (planner *p) {
Chris@42 413 X(kdft_register) (p, n1bv_20, &desc);
Chris@42 414 }
Chris@42 415
Chris@42 416 #endif /* HAVE_FMA */