annotate src/fftw-3.3.8/dft/simd/common/n1bv_20.c @ 83:ae30d91d2ffe

Replace these with versions built using an older toolset (so as to avoid ABI incompatibilities when linking on Ubuntu 14.04 for packaging purposes)
author Chris Cannam
date Fri, 07 Feb 2020 11:51:13 +0000
parents d0c2a83c1364
children
rev   line source
Chris@82 1 /*
Chris@82 2 * Copyright (c) 2003, 2007-14 Matteo Frigo
Chris@82 3 * Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
Chris@82 4 *
Chris@82 5 * This program is free software; you can redistribute it and/or modify
Chris@82 6 * it under the terms of the GNU General Public License as published by
Chris@82 7 * the Free Software Foundation; either version 2 of the License, or
Chris@82 8 * (at your option) any later version.
Chris@82 9 *
Chris@82 10 * This program is distributed in the hope that it will be useful,
Chris@82 11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
Chris@82 12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
Chris@82 13 * GNU General Public License for more details.
Chris@82 14 *
Chris@82 15 * You should have received a copy of the GNU General Public License
Chris@82 16 * along with this program; if not, write to the Free Software
Chris@82 17 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
Chris@82 18 *
Chris@82 19 */
Chris@82 20
Chris@82 21 /* This file was automatically generated --- DO NOT EDIT */
Chris@82 22 /* Generated on Thu May 24 08:05:02 EDT 2018 */
Chris@82 23
Chris@82 24 #include "dft/codelet-dft.h"
Chris@82 25
Chris@82 26 #if defined(ARCH_PREFERS_FMA) || defined(ISA_EXTENSION_PREFERS_FMA)
Chris@82 27
Chris@82 28 /* Generated by: ../../../genfft/gen_notw_c.native -fma -simd -compact -variables 4 -pipeline-latency 8 -sign 1 -n 20 -name n1bv_20 -include dft/simd/n1b.h */
Chris@82 29
Chris@82 30 /*
Chris@82 31 * This function contains 104 FP additions, 50 FP multiplications,
Chris@82 32 * (or, 58 additions, 4 multiplications, 46 fused multiply/add),
Chris@82 33 * 53 stack variables, 4 constants, and 40 memory accesses
Chris@82 34 */
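/*
 * Informal reference for the SIMD helper macros used in this codelet.  The
 * exact definitions live in FFTW's simd-support headers and vary by ISA, so
 * treat the following as approximate semantics rather than signatures:
 *
 *   VFMA(a, b, c)  ~  a * b + c      (fused multiply-add)
 *   VFNMS(a, b, c) ~  c - a * b      (fused negate-multiply-subtract)
 *   VFMAI(b, c)    ~  c + i * b      (complex: add i times b)
 *   VFNMSI(b, c)   ~  c - i * b      (complex: subtract i times b)
 *   VADD/VSUB/VMUL ~  elementwise vector add/subtract/multiply
 *   LD/ST          ~  strided vector load/store of complex elements
 *   DVK/LDK        ~  declare / load a splatted real vector constant
 */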
Chris@82 35 #include "dft/simd/n1b.h"
Chris@82 36
Chris@82 37 static void n1bv_20(const R *ri, const R *ii, R *ro, R *io, stride is, stride os, INT v, INT ivs, INT ovs)
Chris@82 38 {
Chris@82 39 DVK(KP559016994, +0.559016994374947424102293417182819058860154590);
Chris@82 40 DVK(KP618033988, +0.618033988749894848204586834365638117720309180);
Chris@82 41 DVK(KP951056516, +0.951056516295153572116439333379382143405698634);
Chris@82 42 DVK(KP250000000, +0.250000000000000000000000000000000000000000000);
Chris@82 43 {
Chris@82 44 INT i;
Chris@82 45 const R *xi;
Chris@82 46 R *xo;
Chris@82 47 xi = ii;
Chris@82 48 xo = io;
Chris@82 49 for (i = v; i > 0; i = i - VL, xi = xi + (VL * ivs), xo = xo + (VL * ovs), MAKE_VOLATILE_STRIDE(40, is), MAKE_VOLATILE_STRIDE(40, os)) {
Chris@82 50 V T3, T1r, TE, T13, Ts, TL, TM, Tz, T16, T19, T1a, T1v, T1w, T1x, T1s;
Chris@82 51 V T1t, T1u, T1d, T1g, T1h, Ti, Tk, TH, TJ, TZ, T10;
Chris@82 52 {
Chris@82 53 V T1, T2, T11, TC, TD, T12;
Chris@82 54 T1 = LD(&(xi[0]), ivs, &(xi[0]));
Chris@82 55 T2 = LD(&(xi[WS(is, 10)]), ivs, &(xi[0]));
Chris@82 56 T11 = VADD(T1, T2);
Chris@82 57 TC = LD(&(xi[WS(is, 5)]), ivs, &(xi[WS(is, 1)]));
Chris@82 58 TD = LD(&(xi[WS(is, 15)]), ivs, &(xi[WS(is, 1)]));
Chris@82 59 T12 = VADD(TC, TD);
Chris@82 60 T3 = VSUB(T1, T2);
Chris@82 61 T1r = VADD(T11, T12);
Chris@82 62 TE = VSUB(TC, TD);
Chris@82 63 T13 = VSUB(T11, T12);
Chris@82 64 }
Chris@82 65 {
Chris@82 66 V T6, T14, Tv, T1c, Ty, T1f, T9, T17, Td, T1b, To, T15, Tr, T18, Tg;
Chris@82 67 V T1e;
Chris@82 68 {
Chris@82 69 V T4, T5, Tt, Tu;
Chris@82 70 T4 = LD(&(xi[WS(is, 4)]), ivs, &(xi[0]));
Chris@82 71 T5 = LD(&(xi[WS(is, 14)]), ivs, &(xi[0]));
Chris@82 72 T6 = VSUB(T4, T5);
Chris@82 73 T14 = VADD(T4, T5);
Chris@82 74 Tt = LD(&(xi[WS(is, 13)]), ivs, &(xi[WS(is, 1)]));
Chris@82 75 Tu = LD(&(xi[WS(is, 3)]), ivs, &(xi[WS(is, 1)]));
Chris@82 76 Tv = VSUB(Tt, Tu);
Chris@82 77 T1c = VADD(Tt, Tu);
Chris@82 78 }
Chris@82 79 {
Chris@82 80 V Tw, Tx, T7, T8;
Chris@82 81 Tw = LD(&(xi[WS(is, 17)]), ivs, &(xi[WS(is, 1)]));
Chris@82 82 Tx = LD(&(xi[WS(is, 7)]), ivs, &(xi[WS(is, 1)]));
Chris@82 83 Ty = VSUB(Tw, Tx);
Chris@82 84 T1f = VADD(Tw, Tx);
Chris@82 85 T7 = LD(&(xi[WS(is, 16)]), ivs, &(xi[0]));
Chris@82 86 T8 = LD(&(xi[WS(is, 6)]), ivs, &(xi[0]));
Chris@82 87 T9 = VSUB(T7, T8);
Chris@82 88 T17 = VADD(T7, T8);
Chris@82 89 }
Chris@82 90 {
Chris@82 91 V Tb, Tc, Tm, Tn;
Chris@82 92 Tb = LD(&(xi[WS(is, 8)]), ivs, &(xi[0]));
Chris@82 93 Tc = LD(&(xi[WS(is, 18)]), ivs, &(xi[0]));
Chris@82 94 Td = VSUB(Tb, Tc);
Chris@82 95 T1b = VADD(Tb, Tc);
Chris@82 96 Tm = LD(&(xi[WS(is, 9)]), ivs, &(xi[WS(is, 1)]));
Chris@82 97 Tn = LD(&(xi[WS(is, 19)]), ivs, &(xi[WS(is, 1)]));
Chris@82 98 To = VSUB(Tm, Tn);
Chris@82 99 T15 = VADD(Tm, Tn);
Chris@82 100 }
Chris@82 101 {
Chris@82 102 V Tp, Tq, Te, Tf;
Chris@82 103 Tp = LD(&(xi[WS(is, 1)]), ivs, &(xi[WS(is, 1)]));
Chris@82 104 Tq = LD(&(xi[WS(is, 11)]), ivs, &(xi[WS(is, 1)]));
Chris@82 105 Tr = VSUB(Tp, Tq);
Chris@82 106 T18 = VADD(Tp, Tq);
Chris@82 107 Te = LD(&(xi[WS(is, 12)]), ivs, &(xi[0]));
Chris@82 108 Tf = LD(&(xi[WS(is, 2)]), ivs, &(xi[0]));
Chris@82 109 Tg = VSUB(Te, Tf);
Chris@82 110 T1e = VADD(Te, Tf);
Chris@82 111 }
Chris@82 112 Ts = VSUB(To, Tr);
Chris@82 113 TL = VSUB(T6, T9);
Chris@82 114 TM = VSUB(Td, Tg);
Chris@82 115 Tz = VSUB(Tv, Ty);
Chris@82 116 T16 = VSUB(T14, T15);
Chris@82 117 T19 = VSUB(T17, T18);
Chris@82 118 T1a = VADD(T16, T19);
Chris@82 119 T1v = VADD(T1b, T1c);
Chris@82 120 T1w = VADD(T1e, T1f);
Chris@82 121 T1x = VADD(T1v, T1w);
Chris@82 122 T1s = VADD(T14, T15);
Chris@82 123 T1t = VADD(T17, T18);
Chris@82 124 T1u = VADD(T1s, T1t);
Chris@82 125 T1d = VSUB(T1b, T1c);
Chris@82 126 T1g = VSUB(T1e, T1f);
Chris@82 127 T1h = VADD(T1d, T1g);
Chris@82 128 {
Chris@82 129 V Ta, Th, TF, TG;
Chris@82 130 Ta = VADD(T6, T9);
Chris@82 131 Th = VADD(Td, Tg);
Chris@82 132 Ti = VADD(Ta, Th);
Chris@82 133 Tk = VSUB(Ta, Th);
Chris@82 134 TF = VADD(To, Tr);
Chris@82 135 TG = VADD(Tv, Ty);
Chris@82 136 TH = VADD(TF, TG);
Chris@82 137 TJ = VSUB(TF, TG);
Chris@82 138 }
Chris@82 139 }
Chris@82 140 TZ = VADD(T3, Ti);
Chris@82 141 T10 = VADD(TE, TH);
Chris@82 142 ST(&(xo[WS(os, 15)]), VFNMSI(T10, TZ), ovs, &(xo[WS(os, 1)]));
Chris@82 143 ST(&(xo[WS(os, 5)]), VFMAI(T10, TZ), ovs, &(xo[WS(os, 1)]));
Chris@82 144 {
Chris@82 145 V T1A, T1y, T1z, T1E, T1G, T1C, T1D, T1F, T1B;
Chris@82 146 T1A = VSUB(T1u, T1x);
Chris@82 147 T1y = VADD(T1u, T1x);
Chris@82 148 T1z = VFNMS(LDK(KP250000000), T1y, T1r);
Chris@82 149 T1C = VSUB(T1s, T1t);
Chris@82 150 T1D = VSUB(T1v, T1w);
Chris@82 151 T1E = VMUL(LDK(KP951056516), VFMA(LDK(KP618033988), T1D, T1C));
Chris@82 152 T1G = VMUL(LDK(KP951056516), VFNMS(LDK(KP618033988), T1C, T1D));
Chris@82 153 ST(&(xo[0]), VADD(T1r, T1y), ovs, &(xo[0]));
Chris@82 154 T1F = VFNMS(LDK(KP559016994), T1A, T1z);
Chris@82 155 ST(&(xo[WS(os, 8)]), VFMAI(T1G, T1F), ovs, &(xo[0]));
Chris@82 156 ST(&(xo[WS(os, 12)]), VFNMSI(T1G, T1F), ovs, &(xo[0]));
Chris@82 157 T1B = VFMA(LDK(KP559016994), T1A, T1z);
Chris@82 158 ST(&(xo[WS(os, 4)]), VFNMSI(T1E, T1B), ovs, &(xo[0]));
Chris@82 159 ST(&(xo[WS(os, 16)]), VFMAI(T1E, T1B), ovs, &(xo[0]));
Chris@82 160 }
Chris@82 161 {
Chris@82 162 V T1k, T1i, T1j, T1o, T1q, T1m, T1n, T1p, T1l;
Chris@82 163 T1k = VSUB(T1a, T1h);
Chris@82 164 T1i = VADD(T1a, T1h);
Chris@82 165 T1j = VFNMS(LDK(KP250000000), T1i, T13);
Chris@82 166 T1m = VSUB(T1d, T1g);
Chris@82 167 T1n = VSUB(T16, T19);
Chris@82 168 T1o = VMUL(LDK(KP951056516), VFNMS(LDK(KP618033988), T1n, T1m));
Chris@82 169 T1q = VMUL(LDK(KP951056516), VFMA(LDK(KP618033988), T1m, T1n));
Chris@82 170 ST(&(xo[WS(os, 10)]), VADD(T13, T1i), ovs, &(xo[0]));
Chris@82 171 T1p = VFMA(LDK(KP559016994), T1k, T1j);
Chris@82 172 ST(&(xo[WS(os, 6)]), VFMAI(T1q, T1p), ovs, &(xo[0]));
Chris@82 173 ST(&(xo[WS(os, 14)]), VFNMSI(T1q, T1p), ovs, &(xo[0]));
Chris@82 174 T1l = VFNMS(LDK(KP559016994), T1k, T1j);
Chris@82 175 ST(&(xo[WS(os, 2)]), VFNMSI(T1o, T1l), ovs, &(xo[0]));
Chris@82 176 ST(&(xo[WS(os, 18)]), VFMAI(T1o, T1l), ovs, &(xo[0]));
Chris@82 177 }
Chris@82 178 {
Chris@82 179 V TA, TN, TV, TS, TK, TU, Tl, TR, TI, Tj;
Chris@82 180 TA = VFMA(LDK(KP618033988), Tz, Ts);
Chris@82 181 TN = VFMA(LDK(KP618033988), TM, TL);
Chris@82 182 TV = VFNMS(LDK(KP618033988), TL, TM);
Chris@82 183 TS = VFNMS(LDK(KP618033988), Ts, Tz);
Chris@82 184 TI = VFNMS(LDK(KP250000000), TH, TE);
Chris@82 185 TK = VFMA(LDK(KP559016994), TJ, TI);
Chris@82 186 TU = VFNMS(LDK(KP559016994), TJ, TI);
Chris@82 187 Tj = VFNMS(LDK(KP250000000), Ti, T3);
Chris@82 188 Tl = VFMA(LDK(KP559016994), Tk, Tj);
Chris@82 189 TR = VFNMS(LDK(KP559016994), Tk, Tj);
Chris@82 190 {
Chris@82 191 V TB, TO, TX, TY;
Chris@82 192 TB = VFNMS(LDK(KP951056516), TA, Tl);
Chris@82 193 TO = VFMA(LDK(KP951056516), TN, TK);
Chris@82 194 ST(&(xo[WS(os, 19)]), VFNMSI(TO, TB), ovs, &(xo[WS(os, 1)]));
Chris@82 195 ST(&(xo[WS(os, 1)]), VFMAI(TO, TB), ovs, &(xo[WS(os, 1)]));
Chris@82 196 TX = VFNMS(LDK(KP951056516), TS, TR);
Chris@82 197 TY = VFMA(LDK(KP951056516), TV, TU);
Chris@82 198 ST(&(xo[WS(os, 7)]), VFNMSI(TY, TX), ovs, &(xo[WS(os, 1)]));
Chris@82 199 ST(&(xo[WS(os, 13)]), VFMAI(TY, TX), ovs, &(xo[WS(os, 1)]));
Chris@82 200 }
Chris@82 201 {
Chris@82 202 V TP, TQ, TT, TW;
Chris@82 203 TP = VFMA(LDK(KP951056516), TA, Tl);
Chris@82 204 TQ = VFNMS(LDK(KP951056516), TN, TK);
Chris@82 205 ST(&(xo[WS(os, 11)]), VFNMSI(TQ, TP), ovs, &(xo[WS(os, 1)]));
Chris@82 206 ST(&(xo[WS(os, 9)]), VFMAI(TQ, TP), ovs, &(xo[WS(os, 1)]));
Chris@82 207 TT = VFMA(LDK(KP951056516), TS, TR);
Chris@82 208 TW = VFNMS(LDK(KP951056516), TV, TU);
Chris@82 209 ST(&(xo[WS(os, 3)]), VFNMSI(TW, TT), ovs, &(xo[WS(os, 1)]));
Chris@82 210 ST(&(xo[WS(os, 17)]), VFMAI(TW, TT), ovs, &(xo[WS(os, 1)]));
Chris@82 211 }
Chris@82 212 }
Chris@82 213 }
Chris@82 214 }
Chris@82 215 VLEAVE();
Chris@82 216 }
Chris@82 217
Chris@82 218 static const kdft_desc desc = { 20, XSIMD_STRING("n1bv_20"), {58, 4, 46, 0}, &GENUS, 0, 0, 0, 0 };
Chris@82 219
Chris@82 220 void XSIMD(codelet_n1bv_20) (planner *p) {
Chris@82 221 X(kdft_register) (p, n1bv_20, &desc);
Chris@82 222 }
Chris@82 223
Chris@82 224 #else
Chris@82 225
Chris@82 226 /* Generated by: ../../../genfft/gen_notw_c.native -simd -compact -variables 4 -pipeline-latency 8 -sign 1 -n 20 -name n1bv_20 -include dft/simd/n1b.h */
Chris@82 227
Chris@82 228 /*
Chris@82 229 * This function contains 104 FP additions, 24 FP multiplications,
Chris@82 230 * (or, 92 additions, 12 multiplications, 12 fused multiply/add),
Chris@82 231 * 53 stack variables, 4 constants, and 40 memory accesses
Chris@82 232 */
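/*
 * This non-FMA variant uses the same helper macros sketched above, plus
 * VBYI(x), which (per FFTW's simd-support headers, roughly) multiplies a
 * complex vector by +i; the rotations are expressed with explicit VMUL by
 * the KP... constants rather than fused multiply-adds.
 */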
Chris@82 233 #include "dft/simd/n1b.h"
Chris@82 234
Chris@82 235 static void n1bv_20(const R *ri, const R *ii, R *ro, R *io, stride is, stride os, INT v, INT ivs, INT ovs)
Chris@82 236 {
Chris@82 237 DVK(KP587785252, +0.587785252292473129168705954639072768597652438);
Chris@82 238 DVK(KP951056516, +0.951056516295153572116439333379382143405698634);
Chris@82 239 DVK(KP250000000, +0.250000000000000000000000000000000000000000000);
Chris@82 240 DVK(KP559016994, +0.559016994374947424102293417182819058860154590);
Chris@82 241 {
Chris@82 242 INT i;
Chris@82 243 const R *xi;
Chris@82 244 R *xo;
Chris@82 245 xi = ii;
Chris@82 246 xo = io;
Chris@82 247 for (i = v; i > 0; i = i - VL, xi = xi + (VL * ivs), xo = xo + (VL * ovs), MAKE_VOLATILE_STRIDE(40, is), MAKE_VOLATILE_STRIDE(40, os)) {
Chris@82 248 V T3, T1y, TH, T1i, Ts, TL, TM, Tz, T13, T16, T1j, T1u, T1v, T1w, T1r;
Chris@82 249 V T1s, T1t, T1a, T1d, T1k, Ti, Tk, TE, TI, TZ, T10;
Chris@82 250 {
Chris@82 251 V T1, T2, T1g, TF, TG, T1h;
Chris@82 252 T1 = LD(&(xi[0]), ivs, &(xi[0]));
Chris@82 253 T2 = LD(&(xi[WS(is, 10)]), ivs, &(xi[0]));
Chris@82 254 T1g = VADD(T1, T2);
Chris@82 255 TF = LD(&(xi[WS(is, 5)]), ivs, &(xi[WS(is, 1)]));
Chris@82 256 TG = LD(&(xi[WS(is, 15)]), ivs, &(xi[WS(is, 1)]));
Chris@82 257 T1h = VADD(TF, TG);
Chris@82 258 T3 = VSUB(T1, T2);
Chris@82 259 T1y = VADD(T1g, T1h);
Chris@82 260 TH = VSUB(TF, TG);
Chris@82 261 T1i = VSUB(T1g, T1h);
Chris@82 262 }
Chris@82 263 {
Chris@82 264 V T6, T11, Tv, T19, Ty, T1c, T9, T14, Td, T18, To, T12, Tr, T15, Tg;
Chris@82 265 V T1b;
Chris@82 266 {
Chris@82 267 V T4, T5, Tt, Tu;
Chris@82 268 T4 = LD(&(xi[WS(is, 4)]), ivs, &(xi[0]));
Chris@82 269 T5 = LD(&(xi[WS(is, 14)]), ivs, &(xi[0]));
Chris@82 270 T6 = VSUB(T4, T5);
Chris@82 271 T11 = VADD(T4, T5);
Chris@82 272 Tt = LD(&(xi[WS(is, 13)]), ivs, &(xi[WS(is, 1)]));
Chris@82 273 Tu = LD(&(xi[WS(is, 3)]), ivs, &(xi[WS(is, 1)]));
Chris@82 274 Tv = VSUB(Tt, Tu);
Chris@82 275 T19 = VADD(Tt, Tu);
Chris@82 276 }
Chris@82 277 {
Chris@82 278 V Tw, Tx, T7, T8;
Chris@82 279 Tw = LD(&(xi[WS(is, 17)]), ivs, &(xi[WS(is, 1)]));
Chris@82 280 Tx = LD(&(xi[WS(is, 7)]), ivs, &(xi[WS(is, 1)]));
Chris@82 281 Ty = VSUB(Tw, Tx);
Chris@82 282 T1c = VADD(Tw, Tx);
Chris@82 283 T7 = LD(&(xi[WS(is, 16)]), ivs, &(xi[0]));
Chris@82 284 T8 = LD(&(xi[WS(is, 6)]), ivs, &(xi[0]));
Chris@82 285 T9 = VSUB(T7, T8);
Chris@82 286 T14 = VADD(T7, T8);
Chris@82 287 }
Chris@82 288 {
Chris@82 289 V Tb, Tc, Tm, Tn;
Chris@82 290 Tb = LD(&(xi[WS(is, 8)]), ivs, &(xi[0]));
Chris@82 291 Tc = LD(&(xi[WS(is, 18)]), ivs, &(xi[0]));
Chris@82 292 Td = VSUB(Tb, Tc);
Chris@82 293 T18 = VADD(Tb, Tc);
Chris@82 294 Tm = LD(&(xi[WS(is, 9)]), ivs, &(xi[WS(is, 1)]));
Chris@82 295 Tn = LD(&(xi[WS(is, 19)]), ivs, &(xi[WS(is, 1)]));
Chris@82 296 To = VSUB(Tm, Tn);
Chris@82 297 T12 = VADD(Tm, Tn);
Chris@82 298 }
Chris@82 299 {
Chris@82 300 V Tp, Tq, Te, Tf;
Chris@82 301 Tp = LD(&(xi[WS(is, 1)]), ivs, &(xi[WS(is, 1)]));
Chris@82 302 Tq = LD(&(xi[WS(is, 11)]), ivs, &(xi[WS(is, 1)]));
Chris@82 303 Tr = VSUB(Tp, Tq);
Chris@82 304 T15 = VADD(Tp, Tq);
Chris@82 305 Te = LD(&(xi[WS(is, 12)]), ivs, &(xi[0]));
Chris@82 306 Tf = LD(&(xi[WS(is, 2)]), ivs, &(xi[0]));
Chris@82 307 Tg = VSUB(Te, Tf);
Chris@82 308 T1b = VADD(Te, Tf);
Chris@82 309 }
Chris@82 310 Ts = VSUB(To, Tr);
Chris@82 311 TL = VSUB(T6, T9);
Chris@82 312 TM = VSUB(Td, Tg);
Chris@82 313 Tz = VSUB(Tv, Ty);
Chris@82 314 T13 = VSUB(T11, T12);
Chris@82 315 T16 = VSUB(T14, T15);
Chris@82 316 T1j = VADD(T13, T16);
Chris@82 317 T1u = VADD(T18, T19);
Chris@82 318 T1v = VADD(T1b, T1c);
Chris@82 319 T1w = VADD(T1u, T1v);
Chris@82 320 T1r = VADD(T11, T12);
Chris@82 321 T1s = VADD(T14, T15);
Chris@82 322 T1t = VADD(T1r, T1s);
Chris@82 323 T1a = VSUB(T18, T19);
Chris@82 324 T1d = VSUB(T1b, T1c);
Chris@82 325 T1k = VADD(T1a, T1d);
Chris@82 326 {
Chris@82 327 V Ta, Th, TC, TD;
Chris@82 328 Ta = VADD(T6, T9);
Chris@82 329 Th = VADD(Td, Tg);
Chris@82 330 Ti = VADD(Ta, Th);
Chris@82 331 Tk = VMUL(LDK(KP559016994), VSUB(Ta, Th));
Chris@82 332 TC = VADD(To, Tr);
Chris@82 333 TD = VADD(Tv, Ty);
Chris@82 334 TE = VMUL(LDK(KP559016994), VSUB(TC, TD));
Chris@82 335 TI = VADD(TC, TD);
Chris@82 336 }
Chris@82 337 }
Chris@82 338 TZ = VADD(T3, Ti);
Chris@82 339 T10 = VBYI(VADD(TH, TI));
Chris@82 340 ST(&(xo[WS(os, 15)]), VSUB(TZ, T10), ovs, &(xo[WS(os, 1)]));
Chris@82 341 ST(&(xo[WS(os, 5)]), VADD(TZ, T10), ovs, &(xo[WS(os, 1)]));
Chris@82 342 {
Chris@82 343 V T1x, T1z, T1A, T1E, T1G, T1C, T1D, T1F, T1B;
Chris@82 344 T1x = VMUL(LDK(KP559016994), VSUB(T1t, T1w));
Chris@82 345 T1z = VADD(T1t, T1w);
Chris@82 346 T1A = VFNMS(LDK(KP250000000), T1z, T1y);
Chris@82 347 T1C = VSUB(T1r, T1s);
Chris@82 348 T1D = VSUB(T1u, T1v);
Chris@82 349 T1E = VBYI(VFMA(LDK(KP951056516), T1C, VMUL(LDK(KP587785252), T1D)));
Chris@82 350 T1G = VBYI(VFNMS(LDK(KP951056516), T1D, VMUL(LDK(KP587785252), T1C)));
Chris@82 351 ST(&(xo[0]), VADD(T1y, T1z), ovs, &(xo[0]));
Chris@82 352 T1F = VSUB(T1A, T1x);
Chris@82 353 ST(&(xo[WS(os, 8)]), VSUB(T1F, T1G), ovs, &(xo[0]));
Chris@82 354 ST(&(xo[WS(os, 12)]), VADD(T1G, T1F), ovs, &(xo[0]));
Chris@82 355 T1B = VADD(T1x, T1A);
Chris@82 356 ST(&(xo[WS(os, 4)]), VSUB(T1B, T1E), ovs, &(xo[0]));
Chris@82 357 ST(&(xo[WS(os, 16)]), VADD(T1E, T1B), ovs, &(xo[0]));
Chris@82 358 }
Chris@82 359 {
Chris@82 360 V T1n, T1l, T1m, T1f, T1p, T17, T1e, T1q, T1o;
Chris@82 361 T1n = VMUL(LDK(KP559016994), VSUB(T1j, T1k));
Chris@82 362 T1l = VADD(T1j, T1k);
Chris@82 363 T1m = VFNMS(LDK(KP250000000), T1l, T1i);
Chris@82 364 T17 = VSUB(T13, T16);
Chris@82 365 T1e = VSUB(T1a, T1d);
Chris@82 366 T1f = VBYI(VFNMS(LDK(KP951056516), T1e, VMUL(LDK(KP587785252), T17)));
Chris@82 367 T1p = VBYI(VFMA(LDK(KP951056516), T17, VMUL(LDK(KP587785252), T1e)));
Chris@82 368 ST(&(xo[WS(os, 10)]), VADD(T1i, T1l), ovs, &(xo[0]));
Chris@82 369 T1q = VADD(T1n, T1m);
Chris@82 370 ST(&(xo[WS(os, 6)]), VADD(T1p, T1q), ovs, &(xo[0]));
Chris@82 371 ST(&(xo[WS(os, 14)]), VSUB(T1q, T1p), ovs, &(xo[0]));
Chris@82 372 T1o = VSUB(T1m, T1n);
Chris@82 373 ST(&(xo[WS(os, 2)]), VADD(T1f, T1o), ovs, &(xo[0]));
Chris@82 374 ST(&(xo[WS(os, 18)]), VSUB(T1o, T1f), ovs, &(xo[0]));
Chris@82 375 }
Chris@82 376 {
Chris@82 377 V TA, TN, TU, TS, TK, TV, Tl, TR, TJ, Tj;
Chris@82 378 TA = VFNMS(LDK(KP951056516), Tz, VMUL(LDK(KP587785252), Ts));
Chris@82 379 TN = VFNMS(LDK(KP951056516), TM, VMUL(LDK(KP587785252), TL));
Chris@82 380 TU = VFMA(LDK(KP951056516), TL, VMUL(LDK(KP587785252), TM));
Chris@82 381 TS = VFMA(LDK(KP951056516), Ts, VMUL(LDK(KP587785252), Tz));
Chris@82 382 TJ = VFNMS(LDK(KP250000000), TI, TH);
Chris@82 383 TK = VSUB(TE, TJ);
Chris@82 384 TV = VADD(TE, TJ);
Chris@82 385 Tj = VFNMS(LDK(KP250000000), Ti, T3);
Chris@82 386 Tl = VSUB(Tj, Tk);
Chris@82 387 TR = VADD(Tk, Tj);
Chris@82 388 {
Chris@82 389 V TB, TO, TX, TY;
Chris@82 390 TB = VSUB(Tl, TA);
Chris@82 391 TO = VBYI(VSUB(TK, TN));
Chris@82 392 ST(&(xo[WS(os, 17)]), VSUB(TB, TO), ovs, &(xo[WS(os, 1)]));
Chris@82 393 ST(&(xo[WS(os, 3)]), VADD(TB, TO), ovs, &(xo[WS(os, 1)]));
Chris@82 394 TX = VADD(TR, TS);
Chris@82 395 TY = VBYI(VSUB(TV, TU));
Chris@82 396 ST(&(xo[WS(os, 11)]), VSUB(TX, TY), ovs, &(xo[WS(os, 1)]));
Chris@82 397 ST(&(xo[WS(os, 9)]), VADD(TX, TY), ovs, &(xo[WS(os, 1)]));
Chris@82 398 }
Chris@82 399 {
Chris@82 400 V TP, TQ, TT, TW;
Chris@82 401 TP = VADD(Tl, TA);
Chris@82 402 TQ = VBYI(VADD(TN, TK));
Chris@82 403 ST(&(xo[WS(os, 13)]), VSUB(TP, TQ), ovs, &(xo[WS(os, 1)]));
Chris@82 404 ST(&(xo[WS(os, 7)]), VADD(TP, TQ), ovs, &(xo[WS(os, 1)]));
Chris@82 405 TT = VSUB(TR, TS);
Chris@82 406 TW = VBYI(VADD(TU, TV));
Chris@82 407 ST(&(xo[WS(os, 19)]), VSUB(TT, TW), ovs, &(xo[WS(os, 1)]));
Chris@82 408 ST(&(xo[WS(os, 1)]), VADD(TT, TW), ovs, &(xo[WS(os, 1)]));
Chris@82 409 }
Chris@82 410 }
Chris@82 411 }
Chris@82 412 }
Chris@82 413 VLEAVE();
Chris@82 414 }
Chris@82 415
Chris@82 416 static const kdft_desc desc = { 20, XSIMD_STRING("n1bv_20"), {92, 12, 12, 0}, &GENUS, 0, 0, 0, 0 };
Chris@82 417
Chris@82 418 void XSIMD(codelet_n1bv_20) (planner *p) {
Chris@82 419 X(kdft_register) (p, n1bv_20, &desc);
Chris@82 420 }
Chris@82 421
Chris@82 422 #endif