annotate src/fftw-3.3.8/dft/simd/common/n1fv_20.c @ 83:ae30d91d2ffe

Replace these with versions built using an older toolset (so as to avoid ABI incompatibilities when linking on Ubuntu 14.04 for packaging purposes)
author Chris Cannam
date Fri, 07 Feb 2020 11:51:13 +0000
parents d0c2a83c1364
children
/*
 * Copyright (c) 2003, 2007-14 Matteo Frigo
 * Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 *
 */

/* This file was automatically generated --- DO NOT EDIT */
/* Generated on Thu May 24 08:04:53 EDT 2018 */

#include "dft/codelet-dft.h"

#if defined(ARCH_PREFERS_FMA) || defined(ISA_EXTENSION_PREFERS_FMA)

/* Generated by: ../../../genfft/gen_notw_c.native -fma -simd -compact -variables 4 -pipeline-latency 8 -n 20 -name n1fv_20 -include dft/simd/n1f.h */

/*
 * This function contains 104 FP additions, 50 FP multiplications,
 * (or, 58 additions, 4 multiplications, 46 fused multiply/add),
 * 53 stack variables, 4 constants, and 40 memory accesses
 */
#include "dft/simd/n1f.h"

static void n1fv_20(const R *ri, const R *ii, R *ro, R *io, stride is, stride os, INT v, INT ivs, INT ovs)
{
     DVK(KP559016994, +0.559016994374947424102293417182819058860154590);
     DVK(KP618033988, +0.618033988749894848204586834365638117720309180);
     DVK(KP951056516, +0.951056516295153572116439333379382143405698634);
     DVK(KP250000000, +0.250000000000000000000000000000000000000000000);
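     /* Trigonometric constants of the radix-5 butterflies in the size-20
        transform: KP559016994 = sqrt(5)/4, KP618033988 = (sqrt(5) - 1)/2,
        KP951056516 = sin(2*pi/5), KP250000000 = 1/4. */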
     {
          INT i;
          const R *xi;
          R *xo;
          xi = ri;
          xo = ro;
          for (i = v; i > 0; i = i - VL, xi = xi + (VL * ivs), xo = xo + (VL * ovs), MAKE_VOLATILE_STRIDE(40, is), MAKE_VOLATILE_STRIDE(40, os)) {
               V T3, T1r, Tm, T13, TG, TN, TO, TH, T16, T19, T1a, T1v, T1w, T1x, T1s;
               V T1t, T1u, T1d, T1g, T1h, Ti, TE, TB, TL, Tj, TC;
               {
                    V T1, T2, T11, Tk, Tl, T12;
                    T1 = LD(&(xi[0]), ivs, &(xi[0]));
                    T2 = LD(&(xi[WS(is, 10)]), ivs, &(xi[0]));
                    T11 = VADD(T1, T2);
                    Tk = LD(&(xi[WS(is, 5)]), ivs, &(xi[WS(is, 1)]));
                    Tl = LD(&(xi[WS(is, 15)]), ivs, &(xi[WS(is, 1)]));
                    T12 = VADD(Tk, Tl);
                    T3 = VSUB(T1, T2);
                    T1r = VADD(T11, T12);
                    Tm = VSUB(Tk, Tl);
                    T13 = VSUB(T11, T12);
               }
               {
                    V T6, T14, Tw, T1c, Tz, T1f, T9, T17, Td, T1b, Tp, T15, Ts, T18, Tg;
                    V T1e;
                    {
                         V T4, T5, Tu, Tv;
                         T4 = LD(&(xi[WS(is, 4)]), ivs, &(xi[0]));
                         T5 = LD(&(xi[WS(is, 14)]), ivs, &(xi[0]));
                         T6 = VSUB(T4, T5);
                         T14 = VADD(T4, T5);
                         Tu = LD(&(xi[WS(is, 13)]), ivs, &(xi[WS(is, 1)]));
                         Tv = LD(&(xi[WS(is, 3)]), ivs, &(xi[WS(is, 1)]));
                         Tw = VSUB(Tu, Tv);
                         T1c = VADD(Tu, Tv);
                    }
                    {
                         V Tx, Ty, T7, T8;
                         Tx = LD(&(xi[WS(is, 17)]), ivs, &(xi[WS(is, 1)]));
                         Ty = LD(&(xi[WS(is, 7)]), ivs, &(xi[WS(is, 1)]));
                         Tz = VSUB(Tx, Ty);
                         T1f = VADD(Tx, Ty);
                         T7 = LD(&(xi[WS(is, 16)]), ivs, &(xi[0]));
                         T8 = LD(&(xi[WS(is, 6)]), ivs, &(xi[0]));
                         T9 = VSUB(T7, T8);
                         T17 = VADD(T7, T8);
                    }
                    {
                         V Tb, Tc, Tn, To;
                         Tb = LD(&(xi[WS(is, 8)]), ivs, &(xi[0]));
                         Tc = LD(&(xi[WS(is, 18)]), ivs, &(xi[0]));
                         Td = VSUB(Tb, Tc);
                         T1b = VADD(Tb, Tc);
                         Tn = LD(&(xi[WS(is, 9)]), ivs, &(xi[WS(is, 1)]));
                         To = LD(&(xi[WS(is, 19)]), ivs, &(xi[WS(is, 1)]));
                         Tp = VSUB(Tn, To);
                         T15 = VADD(Tn, To);
                    }
                    {
                         V Tq, Tr, Te, Tf;
                         Tq = LD(&(xi[WS(is, 1)]), ivs, &(xi[WS(is, 1)]));
                         Tr = LD(&(xi[WS(is, 11)]), ivs, &(xi[WS(is, 1)]));
                         Ts = VSUB(Tq, Tr);
                         T18 = VADD(Tq, Tr);
                         Te = LD(&(xi[WS(is, 12)]), ivs, &(xi[0]));
                         Tf = LD(&(xi[WS(is, 2)]), ivs, &(xi[0]));
                         Tg = VSUB(Te, Tf);
                         T1e = VADD(Te, Tf);
                    }
                    TG = VSUB(Ts, Tp);
                    TN = VSUB(T6, T9);
                    TO = VSUB(Td, Tg);
                    TH = VSUB(Tz, Tw);
                    T16 = VSUB(T14, T15);
                    T19 = VSUB(T17, T18);
                    T1a = VADD(T16, T19);
                    T1v = VADD(T1b, T1c);
                    T1w = VADD(T1e, T1f);
                    T1x = VADD(T1v, T1w);
                    T1s = VADD(T14, T15);
                    T1t = VADD(T17, T18);
                    T1u = VADD(T1s, T1t);
                    T1d = VSUB(T1b, T1c);
                    T1g = VSUB(T1e, T1f);
                    T1h = VADD(T1d, T1g);
                    {
                         V Ta, Th, Tt, TA;
                         Ta = VADD(T6, T9);
                         Th = VADD(Td, Tg);
                         Ti = VADD(Ta, Th);
                         TE = VSUB(Ta, Th);
                         Tt = VADD(Tp, Ts);
                         TA = VADD(Tw, Tz);
                         TB = VADD(Tt, TA);
                         TL = VSUB(TA, Tt);
                    }
               }
               Tj = VADD(T3, Ti);
               TC = VADD(Tm, TB);
               ST(&(xo[WS(os, 5)]), VFNMSI(TC, Tj), ovs, &(xo[WS(os, 1)]));
               ST(&(xo[WS(os, 15)]), VFMAI(TC, Tj), ovs, &(xo[WS(os, 1)]));
               {
                    V T1A, T1y, T1z, T1E, T1G, T1C, T1D, T1F, T1B;
                    T1A = VSUB(T1u, T1x);
                    T1y = VADD(T1u, T1x);
                    T1z = VFNMS(LDK(KP250000000), T1y, T1r);
                    T1C = VSUB(T1s, T1t);
                    T1D = VSUB(T1v, T1w);
                    T1E = VMUL(LDK(KP951056516), VFMA(LDK(KP618033988), T1D, T1C));
                    T1G = VMUL(LDK(KP951056516), VFNMS(LDK(KP618033988), T1C, T1D));
                    ST(&(xo[0]), VADD(T1r, T1y), ovs, &(xo[0]));
                    T1F = VFNMS(LDK(KP559016994), T1A, T1z);
                    ST(&(xo[WS(os, 8)]), VFNMSI(T1G, T1F), ovs, &(xo[0]));
                    ST(&(xo[WS(os, 12)]), VFMAI(T1G, T1F), ovs, &(xo[0]));
                    T1B = VFMA(LDK(KP559016994), T1A, T1z);
                    ST(&(xo[WS(os, 4)]), VFMAI(T1E, T1B), ovs, &(xo[0]));
                    ST(&(xo[WS(os, 16)]), VFNMSI(T1E, T1B), ovs, &(xo[0]));
               }
               {
                    V T1k, T1i, T1j, T1o, T1q, T1m, T1n, T1p, T1l;
                    T1k = VSUB(T1a, T1h);
                    T1i = VADD(T1a, T1h);
                    T1j = VFNMS(LDK(KP250000000), T1i, T13);
                    T1m = VSUB(T1d, T1g);
                    T1n = VSUB(T16, T19);
                    T1o = VMUL(LDK(KP951056516), VFNMS(LDK(KP618033988), T1n, T1m));
                    T1q = VMUL(LDK(KP951056516), VFMA(LDK(KP618033988), T1m, T1n));
                    ST(&(xo[WS(os, 10)]), VADD(T13, T1i), ovs, &(xo[0]));
                    T1p = VFMA(LDK(KP559016994), T1k, T1j);
                    ST(&(xo[WS(os, 6)]), VFNMSI(T1q, T1p), ovs, &(xo[0]));
                    ST(&(xo[WS(os, 14)]), VFMAI(T1q, T1p), ovs, &(xo[0]));
                    T1l = VFNMS(LDK(KP559016994), T1k, T1j);
                    ST(&(xo[WS(os, 2)]), VFMAI(T1o, T1l), ovs, &(xo[0]));
                    ST(&(xo[WS(os, 18)]), VFNMSI(T1o, T1l), ovs, &(xo[0]));
               }
               {
                    V TI, TP, TX, TU, TM, TW, TF, TT, TK, TD;
                    TI = VFMA(LDK(KP618033988), TH, TG);
                    TP = VFMA(LDK(KP618033988), TO, TN);
                    TX = VFNMS(LDK(KP618033988), TN, TO);
                    TU = VFNMS(LDK(KP618033988), TG, TH);
                    TK = VFNMS(LDK(KP250000000), TB, Tm);
                    TM = VFNMS(LDK(KP559016994), TL, TK);
                    TW = VFMA(LDK(KP559016994), TL, TK);
                    TD = VFNMS(LDK(KP250000000), Ti, T3);
                    TF = VFMA(LDK(KP559016994), TE, TD);
                    TT = VFNMS(LDK(KP559016994), TE, TD);
                    {
                         V TJ, TQ, TZ, T10;
                         TJ = VFMA(LDK(KP951056516), TI, TF);
                         TQ = VFMA(LDK(KP951056516), TP, TM);
                         ST(&(xo[WS(os, 1)]), VFNMSI(TQ, TJ), ovs, &(xo[WS(os, 1)]));
                         ST(&(xo[WS(os, 19)]), VFMAI(TQ, TJ), ovs, &(xo[WS(os, 1)]));
                         TZ = VFMA(LDK(KP951056516), TU, TT);
                         T10 = VFMA(LDK(KP951056516), TX, TW);
                         ST(&(xo[WS(os, 13)]), VFNMSI(T10, TZ), ovs, &(xo[WS(os, 1)]));
                         ST(&(xo[WS(os, 7)]), VFMAI(T10, TZ), ovs, &(xo[WS(os, 1)]));
                    }
                    {
                         V TR, TS, TV, TY;
                         TR = VFNMS(LDK(KP951056516), TI, TF);
                         TS = VFNMS(LDK(KP951056516), TP, TM);
                         ST(&(xo[WS(os, 9)]), VFNMSI(TS, TR), ovs, &(xo[WS(os, 1)]));
                         ST(&(xo[WS(os, 11)]), VFMAI(TS, TR), ovs, &(xo[WS(os, 1)]));
                         TV = VFNMS(LDK(KP951056516), TU, TT);
                         TY = VFNMS(LDK(KP951056516), TX, TW);
                         ST(&(xo[WS(os, 17)]), VFNMSI(TY, TV), ovs, &(xo[WS(os, 1)]));
                         ST(&(xo[WS(os, 3)]), VFMAI(TY, TV), ovs, &(xo[WS(os, 1)]));
                    }
               }
          }
     }
     VLEAVE();
}

static const kdft_desc desc = { 20, XSIMD_STRING("n1fv_20"), {58, 4, 46, 0}, &GENUS, 0, 0, 0, 0 };
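/* The operation-count block {58, 4, 46, 0} in the descriptor above lists
   additions, multiplications, fused multiply-adds, and other operations,
   matching the figures quoted in the comment before n1fv_20(). */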

void XSIMD(codelet_n1fv_20) (planner *p) {
     X(kdft_register) (p, n1fv_20, &desc);
}

#else

/* Generated by: ../../../genfft/gen_notw_c.native -simd -compact -variables 4 -pipeline-latency 8 -n 20 -name n1fv_20 -include dft/simd/n1f.h */

/*
 * This function contains 104 FP additions, 24 FP multiplications,
 * (or, 92 additions, 12 multiplications, 12 fused multiply/add),
 * 53 stack variables, 4 constants, and 40 memory accesses
 */
#include "dft/simd/n1f.h"

static void n1fv_20(const R *ri, const R *ii, R *ro, R *io, stride is, stride os, INT v, INT ivs, INT ovs)
{
     DVK(KP587785252, +0.587785252292473129168705954639072768597652438);
     DVK(KP951056516, +0.951056516295153572116439333379382143405698634);
     DVK(KP250000000, +0.250000000000000000000000000000000000000000000);
     DVK(KP559016994, +0.559016994374947424102293417182819058860154590);
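     /* Radix-5 constants for the non-FMA variant: KP587785252 = sin(pi/5),
        KP951056516 = sin(2*pi/5), KP250000000 = 1/4, KP559016994 = sqrt(5)/4.
        The FMA variant above instead folds sin(pi/5) into the ratio
        KP618033988 = sin(pi/5)/sin(2*pi/5). */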
     {
          INT i;
          const R *xi;
          R *xo;
          xi = ri;
          xo = ro;
          for (i = v; i > 0; i = i - VL, xi = xi + (VL * ivs), xo = xo + (VL * ovs), MAKE_VOLATILE_STRIDE(40, is), MAKE_VOLATILE_STRIDE(40, os)) {
               V T3, T1B, Tm, T1i, TG, TN, TO, TH, T13, T16, T1k, T1u, T1v, T1z, T1r;
               V T1s, T1y, T1a, T1d, T1j, Ti, TD, TB, TL, Tj, TC;
               {
                    V T1, T2, T1g, Tk, Tl, T1h;
                    T1 = LD(&(xi[0]), ivs, &(xi[0]));
                    T2 = LD(&(xi[WS(is, 10)]), ivs, &(xi[0]));
                    T1g = VADD(T1, T2);
                    Tk = LD(&(xi[WS(is, 5)]), ivs, &(xi[WS(is, 1)]));
                    Tl = LD(&(xi[WS(is, 15)]), ivs, &(xi[WS(is, 1)]));
                    T1h = VADD(Tk, Tl);
                    T3 = VSUB(T1, T2);
                    T1B = VADD(T1g, T1h);
                    Tm = VSUB(Tk, Tl);
                    T1i = VSUB(T1g, T1h);
               }
               {
                    V T6, T18, Tw, T12, Tz, T15, T9, T1b, Td, T11, Tp, T19, Ts, T1c, Tg;
                    V T14;
                    {
                         V T4, T5, Tu, Tv;
                         T4 = LD(&(xi[WS(is, 4)]), ivs, &(xi[0]));
                         T5 = LD(&(xi[WS(is, 14)]), ivs, &(xi[0]));
                         T6 = VSUB(T4, T5);
                         T18 = VADD(T4, T5);
                         Tu = LD(&(xi[WS(is, 13)]), ivs, &(xi[WS(is, 1)]));
                         Tv = LD(&(xi[WS(is, 3)]), ivs, &(xi[WS(is, 1)]));
                         Tw = VSUB(Tu, Tv);
                         T12 = VADD(Tu, Tv);
                    }
                    {
                         V Tx, Ty, T7, T8;
                         Tx = LD(&(xi[WS(is, 17)]), ivs, &(xi[WS(is, 1)]));
                         Ty = LD(&(xi[WS(is, 7)]), ivs, &(xi[WS(is, 1)]));
                         Tz = VSUB(Tx, Ty);
                         T15 = VADD(Tx, Ty);
                         T7 = LD(&(xi[WS(is, 16)]), ivs, &(xi[0]));
                         T8 = LD(&(xi[WS(is, 6)]), ivs, &(xi[0]));
                         T9 = VSUB(T7, T8);
                         T1b = VADD(T7, T8);
                    }
                    {
                         V Tb, Tc, Tn, To;
                         Tb = LD(&(xi[WS(is, 8)]), ivs, &(xi[0]));
                         Tc = LD(&(xi[WS(is, 18)]), ivs, &(xi[0]));
                         Td = VSUB(Tb, Tc);
                         T11 = VADD(Tb, Tc);
                         Tn = LD(&(xi[WS(is, 9)]), ivs, &(xi[WS(is, 1)]));
                         To = LD(&(xi[WS(is, 19)]), ivs, &(xi[WS(is, 1)]));
                         Tp = VSUB(Tn, To);
                         T19 = VADD(Tn, To);
                    }
                    {
                         V Tq, Tr, Te, Tf;
                         Tq = LD(&(xi[WS(is, 1)]), ivs, &(xi[WS(is, 1)]));
                         Tr = LD(&(xi[WS(is, 11)]), ivs, &(xi[WS(is, 1)]));
                         Ts = VSUB(Tq, Tr);
                         T1c = VADD(Tq, Tr);
                         Te = LD(&(xi[WS(is, 12)]), ivs, &(xi[0]));
                         Tf = LD(&(xi[WS(is, 2)]), ivs, &(xi[0]));
                         Tg = VSUB(Te, Tf);
                         T14 = VADD(Te, Tf);
                    }
                    TG = VSUB(Ts, Tp);
                    TN = VSUB(T6, T9);
                    TO = VSUB(Td, Tg);
                    TH = VSUB(Tz, Tw);
                    T13 = VSUB(T11, T12);
                    T16 = VSUB(T14, T15);
                    T1k = VADD(T13, T16);
                    T1u = VADD(T11, T12);
                    T1v = VADD(T14, T15);
                    T1z = VADD(T1u, T1v);
                    T1r = VADD(T18, T19);
                    T1s = VADD(T1b, T1c);
                    T1y = VADD(T1r, T1s);
                    T1a = VSUB(T18, T19);
                    T1d = VSUB(T1b, T1c);
                    T1j = VADD(T1a, T1d);
                    {
                         V Ta, Th, Tt, TA;
                         Ta = VADD(T6, T9);
                         Th = VADD(Td, Tg);
                         Ti = VADD(Ta, Th);
                         TD = VMUL(LDK(KP559016994), VSUB(Ta, Th));
                         Tt = VADD(Tp, Ts);
                         TA = VADD(Tw, Tz);
                         TB = VADD(Tt, TA);
                         TL = VMUL(LDK(KP559016994), VSUB(TA, Tt));
                    }
               }
               Tj = VADD(T3, Ti);
               TC = VBYI(VADD(Tm, TB));
               ST(&(xo[WS(os, 5)]), VSUB(Tj, TC), ovs, &(xo[WS(os, 1)]));
               ST(&(xo[WS(os, 15)]), VADD(Tj, TC), ovs, &(xo[WS(os, 1)]));
               {
                    V T1A, T1C, T1D, T1x, T1G, T1t, T1w, T1F, T1E;
                    T1A = VMUL(LDK(KP559016994), VSUB(T1y, T1z));
                    T1C = VADD(T1y, T1z);
                    T1D = VFNMS(LDK(KP250000000), T1C, T1B);
                    T1t = VSUB(T1r, T1s);
                    T1w = VSUB(T1u, T1v);
                    T1x = VBYI(VFMA(LDK(KP951056516), T1t, VMUL(LDK(KP587785252), T1w)));
                    T1G = VBYI(VFNMS(LDK(KP587785252), T1t, VMUL(LDK(KP951056516), T1w)));
                    ST(&(xo[0]), VADD(T1B, T1C), ovs, &(xo[0]));
                    T1F = VSUB(T1D, T1A);
                    ST(&(xo[WS(os, 8)]), VSUB(T1F, T1G), ovs, &(xo[0]));
                    ST(&(xo[WS(os, 12)]), VADD(T1G, T1F), ovs, &(xo[0]));
                    T1E = VADD(T1A, T1D);
                    ST(&(xo[WS(os, 4)]), VADD(T1x, T1E), ovs, &(xo[0]));
                    ST(&(xo[WS(os, 16)]), VSUB(T1E, T1x), ovs, &(xo[0]));
               }
               {
                    V T1n, T1l, T1m, T1f, T1q, T17, T1e, T1p, T1o;
                    T1n = VMUL(LDK(KP559016994), VSUB(T1j, T1k));
                    T1l = VADD(T1j, T1k);
                    T1m = VFNMS(LDK(KP250000000), T1l, T1i);
                    T17 = VSUB(T13, T16);
                    T1e = VSUB(T1a, T1d);
                    T1f = VBYI(VFNMS(LDK(KP587785252), T1e, VMUL(LDK(KP951056516), T17)));
                    T1q = VBYI(VFMA(LDK(KP951056516), T1e, VMUL(LDK(KP587785252), T17)));
                    ST(&(xo[WS(os, 10)]), VADD(T1i, T1l), ovs, &(xo[0]));
                    T1p = VADD(T1n, T1m);
                    ST(&(xo[WS(os, 6)]), VSUB(T1p, T1q), ovs, &(xo[0]));
                    ST(&(xo[WS(os, 14)]), VADD(T1q, T1p), ovs, &(xo[0]));
                    T1o = VSUB(T1m, T1n);
                    ST(&(xo[WS(os, 2)]), VADD(T1f, T1o), ovs, &(xo[0]));
                    ST(&(xo[WS(os, 18)]), VSUB(T1o, T1f), ovs, &(xo[0]));
               }
               {
                    V TI, TP, TX, TU, TM, TW, TF, TT, TK, TE;
                    TI = VFMA(LDK(KP951056516), TG, VMUL(LDK(KP587785252), TH));
                    TP = VFMA(LDK(KP951056516), TN, VMUL(LDK(KP587785252), TO));
                    TX = VFNMS(LDK(KP587785252), TN, VMUL(LDK(KP951056516), TO));
                    TU = VFNMS(LDK(KP587785252), TG, VMUL(LDK(KP951056516), TH));
                    TK = VFMS(LDK(KP250000000), TB, Tm);
                    TM = VADD(TK, TL);
                    TW = VSUB(TL, TK);
                    TE = VFNMS(LDK(KP250000000), Ti, T3);
                    TF = VADD(TD, TE);
                    TT = VSUB(TE, TD);
                    {
                         V TJ, TQ, TZ, T10;
                         TJ = VADD(TF, TI);
                         TQ = VBYI(VSUB(TM, TP));
                         ST(&(xo[WS(os, 19)]), VSUB(TJ, TQ), ovs, &(xo[WS(os, 1)]));
                         ST(&(xo[WS(os, 1)]), VADD(TJ, TQ), ovs, &(xo[WS(os, 1)]));
                         TZ = VADD(TT, TU);
                         T10 = VBYI(VADD(TX, TW));
                         ST(&(xo[WS(os, 13)]), VSUB(TZ, T10), ovs, &(xo[WS(os, 1)]));
                         ST(&(xo[WS(os, 7)]), VADD(TZ, T10), ovs, &(xo[WS(os, 1)]));
                    }
                    {
                         V TR, TS, TV, TY;
                         TR = VSUB(TF, TI);
                         TS = VBYI(VADD(TP, TM));
                         ST(&(xo[WS(os, 11)]), VSUB(TR, TS), ovs, &(xo[WS(os, 1)]));
                         ST(&(xo[WS(os, 9)]), VADD(TR, TS), ovs, &(xo[WS(os, 1)]));
                         TV = VSUB(TT, TU);
                         TY = VBYI(VSUB(TW, TX));
                         ST(&(xo[WS(os, 17)]), VSUB(TV, TY), ovs, &(xo[WS(os, 1)]));
                         ST(&(xo[WS(os, 3)]), VADD(TV, TY), ovs, &(xo[WS(os, 1)]));
                    }
               }
          }
     }
     VLEAVE();
}

static const kdft_desc desc = { 20, XSIMD_STRING("n1fv_20"), {92, 12, 12, 0}, &GENUS, 0, 0, 0, 0 };

void XSIMD(codelet_n1fv_20) (planner *p) {
     X(kdft_register) (p, n1fv_20, &desc);
}

#endif
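
The codelet above is not called directly by applications: codelet_n1fv_20 registers it with FFTW's planner, which may select it when a size-20 single-precision sub-transform is needed. The sketch below is a minimal, illustrative example (not part of the generated file) of planning such a transform through the public fftwf API; whether the planner actually dispatches to n1fv_20 depends on the build's SIMD configuration and planning flags.

/* Minimal usage sketch: single-precision complex DFT of length 20.
 * Compile and link against the single-precision FFTW library (-lfftw3f). */
#include <fftw3.h>

int main(void)
{
     int i;
     fftwf_complex *in = fftwf_malloc(sizeof(fftwf_complex) * 20);
     fftwf_complex *out = fftwf_malloc(sizeof(fftwf_complex) * 20);
     fftwf_plan p = fftwf_plan_dft_1d(20, in, out, FFTW_FORWARD, FFTW_ESTIMATE);

     /* Impulse input: the DFT of a unit impulse is all ones. */
     for (i = 0; i < 20; ++i) {
          in[i][0] = (i == 0) ? 1.0f : 0.0f;
          in[i][1] = 0.0f;
     }
     fftwf_execute(p);

     fftwf_destroy_plan(p);
     fftwf_free(in);
     fftwf_free(out);
     return 0;
}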