annotate src/fftw-3.3.8/dft/simd/common/n1fv_32.c @ 83:ae30d91d2ffe

Replace these with versions built using an older toolset (so as to avoid ABI incompatibilities when linking on Ubuntu 14.04 for packaging purposes)
author Chris Cannam
date Fri, 07 Feb 2020 11:51:13 +0000
parents d0c2a83c1364
children
rev   line source
Chris@82 1 /*
Chris@82 2 * Copyright (c) 2003, 2007-14 Matteo Frigo
Chris@82 3 * Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
Chris@82 4 *
Chris@82 5 * This program is free software; you can redistribute it and/or modify
Chris@82 6 * it under the terms of the GNU General Public License as published by
Chris@82 7 * the Free Software Foundation; either version 2 of the License, or
Chris@82 8 * (at your option) any later version.
Chris@82 9 *
Chris@82 10 * This program is distributed in the hope that it will be useful,
Chris@82 11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
Chris@82 12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
Chris@82 13 * GNU General Public License for more details.
Chris@82 14 *
Chris@82 15 * You should have received a copy of the GNU General Public License
Chris@82 16 * along with this program; if not, write to the Free Software
Chris@82 17 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
Chris@82 18 *
Chris@82 19 */
Chris@82 20
Chris@82 21 /* This file was automatically generated --- DO NOT EDIT */
Chris@82 22 /* Generated on Thu May 24 08:04:52 EDT 2018 */
Chris@82 23
Chris@82 24 #include "dft/codelet-dft.h"
Chris@82 25
Chris@82 26 #if defined(ARCH_PREFERS_FMA) || defined(ISA_EXTENSION_PREFERS_FMA)
Chris@82 27
Chris@82 28 /* Generated by: ../../../genfft/gen_notw_c.native -fma -simd -compact -variables 4 -pipeline-latency 8 -n 32 -name n1fv_32 -include dft/simd/n1f.h */
Chris@82 29
Chris@82 30 /*
Chris@82 31 * This function contains 186 FP additions, 98 FP multiplications,
Chris@82 32 * (or, 88 additions, 0 multiplications, 98 fused multiply/add),
Chris@82 33 * 58 stack variables, 7 constants, and 64 memory accesses
Chris@82 34 */
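/*
 * Editorial note (not part of the generated source; a hedged gloss on the
 * generated code below): in genfft output the fused macros follow the
 * convention FMA(a, b, c) = a*b + c and FNMS(a, b, c) = c - a*b, so e.g.
 * VFNMS(LDK(KP707106781), T15, T14) computes T14 - KP707106781*T15, and
 * VFMAI/VFNMSI combine a packed complex value with plus/minus i times
 * another at the output stores.  The KP* constants are trigonometric values
 * for the 32-point transform: KP707106781 = cos(pi/4), KP923879532 =
 * cos(pi/8), KP980785280 = cos(pi/16), KP831469612 = cos(3*pi/16), and
 * KP414213562 = tan(pi/8), KP198912367 = tan(pi/16), KP668178637 =
 * tan(3*pi/16); the tangents arise because this FMA variant factors the
 * matching cosine out of each rotation and applies it in a later fused
 * multiply (e.g. KP414213562 inside T2H/T2O, then KP923879532 in T2Y/T30).
 */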
Chris@82 35 #include "dft/simd/n1f.h"
Chris@82 36
Chris@82 37 static void n1fv_32(const R *ri, const R *ii, R *ro, R *io, stride is, stride os, INT v, INT ivs, INT ovs)
Chris@82 38 {
Chris@82 39 DVK(KP980785280, +0.980785280403230449126182236134239036973933731);
Chris@82 40 DVK(KP198912367, +0.198912367379658006911597622644676228597850501);
Chris@82 41 DVK(KP831469612, +0.831469612302545237078788377617905756738560812);
Chris@82 42 DVK(KP668178637, +0.668178637919298919997757686523080761552472251);
Chris@82 43 DVK(KP923879532, +0.923879532511286756128183189396788286822416626);
Chris@82 44 DVK(KP414213562, +0.414213562373095048801688724209698078569671875);
Chris@82 45 DVK(KP707106781, +0.707106781186547524400844362104849039284835938);
Chris@82 46 {
Chris@82 47 INT i;
Chris@82 48 const R *xi;
Chris@82 49 R *xo;
Chris@82 50 xi = ri;
Chris@82 51 xo = ro;
Chris@82 52 for (i = v; i > 0; i = i - VL, xi = xi + (VL * ivs), xo = xo + (VL * ovs), MAKE_VOLATILE_STRIDE(64, is), MAKE_VOLATILE_STRIDE(64, os)) {
Chris@82 53 V T1T, T1W, T2K, T2x, T16, T1A, Tb, T1p, TT, T1v, TY, T1w, T27, T2a, T2b;
Chris@82 54 V T2H, T2N, TC, T1s, TH, T1t, T20, T23, T24, T2E, T2O, T2g, T2j, Tq, T1B;
Chris@82 55 V T19, T1q, T2A, T2L;
Chris@82 56 {
Chris@82 57 V T3, T1R, T14, T1S, T6, T1U, T9, T1V, T15, Ta;
Chris@82 58 {
Chris@82 59 V T1, T2, T12, T13;
Chris@82 60 T1 = LD(&(xi[0]), ivs, &(xi[0]));
Chris@82 61 T2 = LD(&(xi[WS(is, 16)]), ivs, &(xi[0]));
Chris@82 62 T3 = VSUB(T1, T2);
Chris@82 63 T1R = VADD(T1, T2);
Chris@82 64 T12 = LD(&(xi[WS(is, 8)]), ivs, &(xi[0]));
Chris@82 65 T13 = LD(&(xi[WS(is, 24)]), ivs, &(xi[0]));
Chris@82 66 T14 = VSUB(T12, T13);
Chris@82 67 T1S = VADD(T12, T13);
Chris@82 68 }
Chris@82 69 {
Chris@82 70 V T4, T5, T7, T8;
Chris@82 71 T4 = LD(&(xi[WS(is, 4)]), ivs, &(xi[0]));
Chris@82 72 T5 = LD(&(xi[WS(is, 20)]), ivs, &(xi[0]));
Chris@82 73 T6 = VSUB(T4, T5);
Chris@82 74 T1U = VADD(T4, T5);
Chris@82 75 T7 = LD(&(xi[WS(is, 28)]), ivs, &(xi[0]));
Chris@82 76 T8 = LD(&(xi[WS(is, 12)]), ivs, &(xi[0]));
Chris@82 77 T9 = VSUB(T7, T8);
Chris@82 78 T1V = VADD(T7, T8);
Chris@82 79 }
Chris@82 80 T1T = VADD(T1R, T1S);
Chris@82 81 T1W = VADD(T1U, T1V);
Chris@82 82 T2K = VSUB(T1V, T1U);
Chris@82 83 T2x = VSUB(T1R, T1S);
Chris@82 84 T15 = VSUB(T9, T6);
Chris@82 85 T16 = VFNMS(LDK(KP707106781), T15, T14);
Chris@82 86 T1A = VFMA(LDK(KP707106781), T15, T14);
Chris@82 87 Ta = VADD(T6, T9);
Chris@82 88 Tb = VFMA(LDK(KP707106781), Ta, T3);
Chris@82 89 T1p = VFNMS(LDK(KP707106781), Ta, T3);
Chris@82 90 }
Chris@82 91 {
Chris@82 92 V TL, T25, TW, T26, TO, T28, TR, T29;
Chris@82 93 {
Chris@82 94 V TJ, TK, TU, TV;
Chris@82 95 TJ = LD(&(xi[WS(is, 31)]), ivs, &(xi[WS(is, 1)]));
Chris@82 96 TK = LD(&(xi[WS(is, 15)]), ivs, &(xi[WS(is, 1)]));
Chris@82 97 TL = VSUB(TJ, TK);
Chris@82 98 T25 = VADD(TJ, TK);
Chris@82 99 TU = LD(&(xi[WS(is, 23)]), ivs, &(xi[WS(is, 1)]));
Chris@82 100 TV = LD(&(xi[WS(is, 7)]), ivs, &(xi[WS(is, 1)]));
Chris@82 101 TW = VSUB(TU, TV);
Chris@82 102 T26 = VADD(TV, TU);
Chris@82 103 }
Chris@82 104 {
Chris@82 105 V TM, TN, TP, TQ;
Chris@82 106 TM = LD(&(xi[WS(is, 3)]), ivs, &(xi[WS(is, 1)]));
Chris@82 107 TN = LD(&(xi[WS(is, 19)]), ivs, &(xi[WS(is, 1)]));
Chris@82 108 TO = VSUB(TM, TN);
Chris@82 109 T28 = VADD(TM, TN);
Chris@82 110 TP = LD(&(xi[WS(is, 27)]), ivs, &(xi[WS(is, 1)]));
Chris@82 111 TQ = LD(&(xi[WS(is, 11)]), ivs, &(xi[WS(is, 1)]));
Chris@82 112 TR = VSUB(TP, TQ);
Chris@82 113 T29 = VADD(TP, TQ);
Chris@82 114 }
Chris@82 115 {
Chris@82 116 V TS, TX, T2F, T2G;
Chris@82 117 TS = VADD(TO, TR);
Chris@82 118 TT = VFMA(LDK(KP707106781), TS, TL);
Chris@82 119 T1v = VFNMS(LDK(KP707106781), TS, TL);
Chris@82 120 TX = VSUB(TR, TO);
Chris@82 121 TY = VFMA(LDK(KP707106781), TX, TW);
Chris@82 122 T1w = VFNMS(LDK(KP707106781), TX, TW);
Chris@82 123 T27 = VADD(T25, T26);
Chris@82 124 T2a = VADD(T28, T29);
Chris@82 125 T2b = VSUB(T27, T2a);
Chris@82 126 T2F = VSUB(T25, T26);
Chris@82 127 T2G = VSUB(T29, T28);
Chris@82 128 T2H = VFNMS(LDK(KP414213562), T2G, T2F);
Chris@82 129 T2N = VFMA(LDK(KP414213562), T2F, T2G);
Chris@82 130 }
Chris@82 131 }
Chris@82 132 {
Chris@82 133 V Tu, T1Y, TF, T1Z, Tx, T21, TA, T22;
Chris@82 134 {
Chris@82 135 V Ts, Tt, TD, TE;
Chris@82 136 Ts = LD(&(xi[WS(is, 1)]), ivs, &(xi[WS(is, 1)]));
Chris@82 137 Tt = LD(&(xi[WS(is, 17)]), ivs, &(xi[WS(is, 1)]));
Chris@82 138 Tu = VSUB(Ts, Tt);
Chris@82 139 T1Y = VADD(Ts, Tt);
Chris@82 140 TD = LD(&(xi[WS(is, 9)]), ivs, &(xi[WS(is, 1)]));
Chris@82 141 TE = LD(&(xi[WS(is, 25)]), ivs, &(xi[WS(is, 1)]));
Chris@82 142 TF = VSUB(TD, TE);
Chris@82 143 T1Z = VADD(TD, TE);
Chris@82 144 }
Chris@82 145 {
Chris@82 146 V Tv, Tw, Ty, Tz;
Chris@82 147 Tv = LD(&(xi[WS(is, 5)]), ivs, &(xi[WS(is, 1)]));
Chris@82 148 Tw = LD(&(xi[WS(is, 21)]), ivs, &(xi[WS(is, 1)]));
Chris@82 149 Tx = VSUB(Tv, Tw);
Chris@82 150 T21 = VADD(Tv, Tw);
Chris@82 151 Ty = LD(&(xi[WS(is, 29)]), ivs, &(xi[WS(is, 1)]));
Chris@82 152 Tz = LD(&(xi[WS(is, 13)]), ivs, &(xi[WS(is, 1)]));
Chris@82 153 TA = VSUB(Ty, Tz);
Chris@82 154 T22 = VADD(Ty, Tz);
Chris@82 155 }
Chris@82 156 {
Chris@82 157 V TB, TG, T2C, T2D;
Chris@82 158 TB = VADD(Tx, TA);
Chris@82 159 TC = VFMA(LDK(KP707106781), TB, Tu);
Chris@82 160 T1s = VFNMS(LDK(KP707106781), TB, Tu);
Chris@82 161 TG = VSUB(Tx, TA);
Chris@82 162 TH = VFMA(LDK(KP707106781), TG, TF);
Chris@82 163 T1t = VFNMS(LDK(KP707106781), TG, TF);
Chris@82 164 T20 = VADD(T1Y, T1Z);
Chris@82 165 T23 = VADD(T21, T22);
Chris@82 166 T24 = VSUB(T20, T23);
Chris@82 167 T2C = VSUB(T1Y, T1Z);
Chris@82 168 T2D = VSUB(T21, T22);
Chris@82 169 T2E = VFNMS(LDK(KP414213562), T2D, T2C);
Chris@82 170 T2O = VFMA(LDK(KP414213562), T2C, T2D);
Chris@82 171 }
Chris@82 172 }
Chris@82 173 {
Chris@82 174 V Te, T2h, To, T2f, Th, T2i, Tl, T2e, Ti, Tp;
Chris@82 175 {
Chris@82 176 V Tc, Td, Tm, Tn;
Chris@82 177 Tc = LD(&(xi[WS(is, 2)]), ivs, &(xi[0]));
Chris@82 178 Td = LD(&(xi[WS(is, 18)]), ivs, &(xi[0]));
Chris@82 179 Te = VSUB(Tc, Td);
Chris@82 180 T2h = VADD(Tc, Td);
Chris@82 181 Tm = LD(&(xi[WS(is, 22)]), ivs, &(xi[0]));
Chris@82 182 Tn = LD(&(xi[WS(is, 6)]), ivs, &(xi[0]));
Chris@82 183 To = VSUB(Tm, Tn);
Chris@82 184 T2f = VADD(Tn, Tm);
Chris@82 185 }
Chris@82 186 {
Chris@82 187 V Tf, Tg, Tj, Tk;
Chris@82 188 Tf = LD(&(xi[WS(is, 10)]), ivs, &(xi[0]));
Chris@82 189 Tg = LD(&(xi[WS(is, 26)]), ivs, &(xi[0]));
Chris@82 190 Th = VSUB(Tf, Tg);
Chris@82 191 T2i = VADD(Tf, Tg);
Chris@82 192 Tj = LD(&(xi[WS(is, 30)]), ivs, &(xi[0]));
Chris@82 193 Tk = LD(&(xi[WS(is, 14)]), ivs, &(xi[0]));
Chris@82 194 Tl = VSUB(Tj, Tk);
Chris@82 195 T2e = VADD(Tj, Tk);
Chris@82 196 }
Chris@82 197 T2g = VADD(T2e, T2f);
Chris@82 198 T2j = VADD(T2h, T2i);
Chris@82 199 Ti = VFNMS(LDK(KP414213562), Th, Te);
Chris@82 200 Tp = VFNMS(LDK(KP414213562), To, Tl);
Chris@82 201 Tq = VADD(Ti, Tp);
Chris@82 202 T1B = VSUB(Tp, Ti);
Chris@82 203 {
Chris@82 204 V T17, T18, T2y, T2z;
Chris@82 205 T17 = VFMA(LDK(KP414213562), Te, Th);
Chris@82 206 T18 = VFMA(LDK(KP414213562), Tl, To);
Chris@82 207 T19 = VSUB(T17, T18);
Chris@82 208 T1q = VADD(T17, T18);
Chris@82 209 T2y = VSUB(T2h, T2i);
Chris@82 210 T2z = VSUB(T2e, T2f);
Chris@82 211 T2A = VADD(T2y, T2z);
Chris@82 212 T2L = VSUB(T2z, T2y);
Chris@82 213 }
Chris@82 214 }
Chris@82 215 {
Chris@82 216 V T2d, T2n, T2m, T2o;
Chris@82 217 {
Chris@82 218 V T1X, T2c, T2k, T2l;
Chris@82 219 T1X = VSUB(T1T, T1W);
Chris@82 220 T2c = VADD(T24, T2b);
Chris@82 221 T2d = VFNMS(LDK(KP707106781), T2c, T1X);
Chris@82 222 T2n = VFMA(LDK(KP707106781), T2c, T1X);
Chris@82 223 T2k = VSUB(T2g, T2j);
Chris@82 224 T2l = VSUB(T2b, T24);
Chris@82 225 T2m = VFNMS(LDK(KP707106781), T2l, T2k);
Chris@82 226 T2o = VFMA(LDK(KP707106781), T2l, T2k);
Chris@82 227 }
Chris@82 228 ST(&(xo[WS(os, 12)]), VFNMSI(T2m, T2d), ovs, &(xo[0]));
Chris@82 229 ST(&(xo[WS(os, 4)]), VFMAI(T2o, T2n), ovs, &(xo[0]));
Chris@82 230 ST(&(xo[WS(os, 20)]), VFMAI(T2m, T2d), ovs, &(xo[0]));
Chris@82 231 ST(&(xo[WS(os, 28)]), VFNMSI(T2o, T2n), ovs, &(xo[0]));
Chris@82 232 }
Chris@82 233 {
Chris@82 234 V T2r, T2v, T2u, T2w;
Chris@82 235 {
Chris@82 236 V T2p, T2q, T2s, T2t;
Chris@82 237 T2p = VADD(T1T, T1W);
Chris@82 238 T2q = VADD(T2j, T2g);
Chris@82 239 T2r = VADD(T2p, T2q);
Chris@82 240 T2v = VSUB(T2p, T2q);
Chris@82 241 T2s = VADD(T20, T23);
Chris@82 242 T2t = VADD(T27, T2a);
Chris@82 243 T2u = VADD(T2s, T2t);
Chris@82 244 T2w = VSUB(T2t, T2s);
Chris@82 245 }
Chris@82 246 ST(&(xo[WS(os, 16)]), VSUB(T2r, T2u), ovs, &(xo[0]));
Chris@82 247 ST(&(xo[WS(os, 8)]), VFMAI(T2w, T2v), ovs, &(xo[0]));
Chris@82 248 ST(&(xo[0]), VADD(T2r, T2u), ovs, &(xo[0]));
Chris@82 249 ST(&(xo[WS(os, 24)]), VFNMSI(T2w, T2v), ovs, &(xo[0]));
Chris@82 250 }
Chris@82 251 {
Chris@82 252 V T2V, T2Z, T2Y, T30;
Chris@82 253 {
Chris@82 254 V T2T, T2U, T2W, T2X;
Chris@82 255 T2T = VFNMS(LDK(KP707106781), T2A, T2x);
Chris@82 256 T2U = VADD(T2O, T2N);
Chris@82 257 T2V = VFNMS(LDK(KP923879532), T2U, T2T);
Chris@82 258 T2Z = VFMA(LDK(KP923879532), T2U, T2T);
Chris@82 259 T2W = VFNMS(LDK(KP707106781), T2L, T2K);
Chris@82 260 T2X = VSUB(T2H, T2E);
Chris@82 261 T2Y = VFMA(LDK(KP923879532), T2X, T2W);
Chris@82 262 T30 = VFNMS(LDK(KP923879532), T2X, T2W);
Chris@82 263 }
Chris@82 264 ST(&(xo[WS(os, 10)]), VFMAI(T2Y, T2V), ovs, &(xo[0]));
Chris@82 265 ST(&(xo[WS(os, 26)]), VFMAI(T30, T2Z), ovs, &(xo[0]));
Chris@82 266 ST(&(xo[WS(os, 22)]), VFNMSI(T2Y, T2V), ovs, &(xo[0]));
Chris@82 267 ST(&(xo[WS(os, 6)]), VFNMSI(T30, T2Z), ovs, &(xo[0]));
Chris@82 268 }
Chris@82 269 {
Chris@82 270 V T2J, T2R, T2Q, T2S;
Chris@82 271 {
Chris@82 272 V T2B, T2I, T2M, T2P;
Chris@82 273 T2B = VFMA(LDK(KP707106781), T2A, T2x);
Chris@82 274 T2I = VADD(T2E, T2H);
Chris@82 275 T2J = VFNMS(LDK(KP923879532), T2I, T2B);
Chris@82 276 T2R = VFMA(LDK(KP923879532), T2I, T2B);
Chris@82 277 T2M = VFMA(LDK(KP707106781), T2L, T2K);
Chris@82 278 T2P = VSUB(T2N, T2O);
Chris@82 279 T2Q = VFNMS(LDK(KP923879532), T2P, T2M);
Chris@82 280 T2S = VFMA(LDK(KP923879532), T2P, T2M);
Chris@82 281 }
Chris@82 282 ST(&(xo[WS(os, 14)]), VFNMSI(T2Q, T2J), ovs, &(xo[0]));
Chris@82 283 ST(&(xo[WS(os, 2)]), VFMAI(T2S, T2R), ovs, &(xo[0]));
Chris@82 284 ST(&(xo[WS(os, 18)]), VFMAI(T2Q, T2J), ovs, &(xo[0]));
Chris@82 285 ST(&(xo[WS(os, 30)]), VFNMSI(T2S, T2R), ovs, &(xo[0]));
Chris@82 286 }
Chris@82 287 {
Chris@82 288 V T1r, T1C, T1M, T1J, T1F, T1K, T1y, T1N;
Chris@82 289 T1r = VFMA(LDK(KP923879532), T1q, T1p);
Chris@82 290 T1C = VFMA(LDK(KP923879532), T1B, T1A);
Chris@82 291 T1M = VFNMS(LDK(KP923879532), T1B, T1A);
Chris@82 292 T1J = VFNMS(LDK(KP923879532), T1q, T1p);
Chris@82 293 {
Chris@82 294 V T1D, T1E, T1u, T1x;
Chris@82 295 T1D = VFNMS(LDK(KP668178637), T1s, T1t);
Chris@82 296 T1E = VFNMS(LDK(KP668178637), T1v, T1w);
Chris@82 297 T1F = VSUB(T1D, T1E);
Chris@82 298 T1K = VADD(T1D, T1E);
Chris@82 299 T1u = VFMA(LDK(KP668178637), T1t, T1s);
Chris@82 300 T1x = VFMA(LDK(KP668178637), T1w, T1v);
Chris@82 301 T1y = VADD(T1u, T1x);
Chris@82 302 T1N = VSUB(T1x, T1u);
Chris@82 303 }
Chris@82 304 {
Chris@82 305 V T1z, T1G, T1P, T1Q;
Chris@82 306 T1z = VFNMS(LDK(KP831469612), T1y, T1r);
Chris@82 307 T1G = VFNMS(LDK(KP831469612), T1F, T1C);
Chris@82 308 ST(&(xo[WS(os, 13)]), VFNMSI(T1G, T1z), ovs, &(xo[WS(os, 1)]));
Chris@82 309 ST(&(xo[WS(os, 19)]), VFMAI(T1G, T1z), ovs, &(xo[WS(os, 1)]));
Chris@82 310 T1P = VFNMS(LDK(KP831469612), T1K, T1J);
Chris@82 311 T1Q = VFNMS(LDK(KP831469612), T1N, T1M);
Chris@82 312 ST(&(xo[WS(os, 5)]), VFNMSI(T1Q, T1P), ovs, &(xo[WS(os, 1)]));
Chris@82 313 ST(&(xo[WS(os, 27)]), VFMAI(T1Q, T1P), ovs, &(xo[WS(os, 1)]));
Chris@82 314 }
Chris@82 315 {
Chris@82 316 V T1H, T1I, T1L, T1O;
Chris@82 317 T1H = VFMA(LDK(KP831469612), T1y, T1r);
Chris@82 318 T1I = VFMA(LDK(KP831469612), T1F, T1C);
Chris@82 319 ST(&(xo[WS(os, 29)]), VFNMSI(T1I, T1H), ovs, &(xo[WS(os, 1)]));
Chris@82 320 ST(&(xo[WS(os, 3)]), VFMAI(T1I, T1H), ovs, &(xo[WS(os, 1)]));
Chris@82 321 T1L = VFMA(LDK(KP831469612), T1K, T1J);
Chris@82 322 T1O = VFMA(LDK(KP831469612), T1N, T1M);
Chris@82 323 ST(&(xo[WS(os, 11)]), VFMAI(T1O, T1L), ovs, &(xo[WS(os, 1)]));
Chris@82 324 ST(&(xo[WS(os, 21)]), VFNMSI(T1O, T1L), ovs, &(xo[WS(os, 1)]));
Chris@82 325 }
Chris@82 326 }
Chris@82 327 {
Chris@82 328 V Tr, T1a, T1k, T1h, T1d, T1i, T10, T1l;
Chris@82 329 Tr = VFMA(LDK(KP923879532), Tq, Tb);
Chris@82 330 T1a = VFMA(LDK(KP923879532), T19, T16);
Chris@82 331 T1k = VFNMS(LDK(KP923879532), T19, T16);
Chris@82 332 T1h = VFNMS(LDK(KP923879532), Tq, Tb);
Chris@82 333 {
Chris@82 334 V T1b, T1c, TI, TZ;
Chris@82 335 T1b = VFMA(LDK(KP198912367), TC, TH);
Chris@82 336 T1c = VFMA(LDK(KP198912367), TT, TY);
Chris@82 337 T1d = VSUB(T1b, T1c);
Chris@82 338 T1i = VADD(T1b, T1c);
Chris@82 339 TI = VFNMS(LDK(KP198912367), TH, TC);
Chris@82 340 TZ = VFNMS(LDK(KP198912367), TY, TT);
Chris@82 341 T10 = VADD(TI, TZ);
Chris@82 342 T1l = VSUB(TZ, TI);
Chris@82 343 }
Chris@82 344 {
Chris@82 345 V T11, T1e, T1n, T1o;
Chris@82 346 T11 = VFNMS(LDK(KP980785280), T10, Tr);
Chris@82 347 T1e = VFNMS(LDK(KP980785280), T1d, T1a);
Chris@82 348 ST(&(xo[WS(os, 17)]), VFNMSI(T1e, T11), ovs, &(xo[WS(os, 1)]));
Chris@82 349 ST(&(xo[WS(os, 15)]), VFMAI(T1e, T11), ovs, &(xo[WS(os, 1)]));
Chris@82 350 T1n = VFMA(LDK(KP980785280), T1i, T1h);
Chris@82 351 T1o = VFMA(LDK(KP980785280), T1l, T1k);
Chris@82 352 ST(&(xo[WS(os, 7)]), VFMAI(T1o, T1n), ovs, &(xo[WS(os, 1)]));
Chris@82 353 ST(&(xo[WS(os, 25)]), VFNMSI(T1o, T1n), ovs, &(xo[WS(os, 1)]));
Chris@82 354 }
Chris@82 355 {
Chris@82 356 V T1f, T1g, T1j, T1m;
Chris@82 357 T1f = VFMA(LDK(KP980785280), T10, Tr);
Chris@82 358 T1g = VFMA(LDK(KP980785280), T1d, T1a);
Chris@82 359 ST(&(xo[WS(os, 1)]), VFNMSI(T1g, T1f), ovs, &(xo[WS(os, 1)]));
Chris@82 360 ST(&(xo[WS(os, 31)]), VFMAI(T1g, T1f), ovs, &(xo[WS(os, 1)]));
Chris@82 361 T1j = VFNMS(LDK(KP980785280), T1i, T1h);
Chris@82 362 T1m = VFNMS(LDK(KP980785280), T1l, T1k);
Chris@82 363 ST(&(xo[WS(os, 9)]), VFNMSI(T1m, T1j), ovs, &(xo[WS(os, 1)]));
Chris@82 364 ST(&(xo[WS(os, 23)]), VFMAI(T1m, T1j), ovs, &(xo[WS(os, 1)]));
Chris@82 365 }
Chris@82 366 }
Chris@82 367 }
Chris@82 368 }
Chris@82 369 VLEAVE();
Chris@82 370 }
Chris@82 371
Chris@82 372 static const kdft_desc desc = { 32, XSIMD_STRING("n1fv_32"), {88, 0, 98, 0}, &GENUS, 0, 0, 0, 0 };
Chris@82 373
Chris@82 374 void XSIMD(codelet_n1fv_32) (planner *p) {
Chris@82 375 X(kdft_register) (p, n1fv_32, &desc);
Chris@82 376 }
Chris@82 377
Chris@82 378 #else
Chris@82 379
Chris@82 380 /* Generated by: ../../../genfft/gen_notw_c.native -simd -compact -variables 4 -pipeline-latency 8 -n 32 -name n1fv_32 -include dft/simd/n1f.h */
Chris@82 381
Chris@82 382 /*
Chris@82 383 * This function contains 186 FP additions, 42 FP multiplications,
Chris@82 384 * (or, 170 additions, 26 multiplications, 16 fused multiply/add),
Chris@82 385 * 58 stack variables, 7 constants, and 64 memory accesses
Chris@82 386 */
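/*
 * Editorial note (not part of the generated source; a hedged gloss): this
 * non-FMA variant spells the same 32-point butterflies out as explicit
 * VMUL/VADD/VSUB operations on sine/cosine constants: KP707106781 =
 * cos(pi/4), KP923879532 = cos(pi/8), KP382683432 = sin(pi/8),
 * KP831469612 = cos(3*pi/16), KP555570233 = sin(3*pi/16), KP980785280 =
 * cos(pi/16) and KP195090322 = sin(pi/16).  VBYI(x) multiplies x by the
 * imaginary unit, so paired stores such as VADD(T2d, T2m) for bin 4 and
 * VSUB(T2d, T2m) for bin 28 reuse the same real-arithmetic intermediates.
 */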
Chris@82 387 #include "dft/simd/n1f.h"
Chris@82 388
Chris@82 389 static void n1fv_32(const R *ri, const R *ii, R *ro, R *io, stride is, stride os, INT v, INT ivs, INT ovs)
Chris@82 390 {
Chris@82 391 DVK(KP195090322, +0.195090322016128267848284868477022240927691618);
Chris@82 392 DVK(KP980785280, +0.980785280403230449126182236134239036973933731);
Chris@82 393 DVK(KP555570233, +0.555570233019602224742830813948532874374937191);
Chris@82 394 DVK(KP831469612, +0.831469612302545237078788377617905756738560812);
Chris@82 395 DVK(KP382683432, +0.382683432365089771728459984030398866761344562);
Chris@82 396 DVK(KP923879532, +0.923879532511286756128183189396788286822416626);
Chris@82 397 DVK(KP707106781, +0.707106781186547524400844362104849039284835938);
Chris@82 398 {
Chris@82 399 INT i;
Chris@82 400 const R *xi;
Chris@82 401 R *xo;
Chris@82 402 xi = ri;
Chris@82 403 xo = ro;
Chris@82 404 for (i = v; i > 0; i = i - VL, xi = xi + (VL * ivs), xo = xo + (VL * ovs), MAKE_VOLATILE_STRIDE(64, is), MAKE_VOLATILE_STRIDE(64, os)) {
Chris@82 405 V T1T, T1W, T2K, T2x, T16, T1A, Tb, T1p, TT, T1v, TY, T1w, T27, T2a, T2b;
Chris@82 406 V T2H, T2O, TC, T1s, TH, T1t, T20, T23, T24, T2E, T2N, T2g, T2j, Tq, T1B;
Chris@82 407 V T19, T1q, T2A, T2L;
Chris@82 408 {
Chris@82 409 V T3, T1R, T15, T1S, T6, T1U, T9, T1V, T12, Ta;
Chris@82 410 {
Chris@82 411 V T1, T2, T13, T14;
Chris@82 412 T1 = LD(&(xi[0]), ivs, &(xi[0]));
Chris@82 413 T2 = LD(&(xi[WS(is, 16)]), ivs, &(xi[0]));
Chris@82 414 T3 = VSUB(T1, T2);
Chris@82 415 T1R = VADD(T1, T2);
Chris@82 416 T13 = LD(&(xi[WS(is, 8)]), ivs, &(xi[0]));
Chris@82 417 T14 = LD(&(xi[WS(is, 24)]), ivs, &(xi[0]));
Chris@82 418 T15 = VSUB(T13, T14);
Chris@82 419 T1S = VADD(T13, T14);
Chris@82 420 }
Chris@82 421 {
Chris@82 422 V T4, T5, T7, T8;
Chris@82 423 T4 = LD(&(xi[WS(is, 4)]), ivs, &(xi[0]));
Chris@82 424 T5 = LD(&(xi[WS(is, 20)]), ivs, &(xi[0]));
Chris@82 425 T6 = VSUB(T4, T5);
Chris@82 426 T1U = VADD(T4, T5);
Chris@82 427 T7 = LD(&(xi[WS(is, 28)]), ivs, &(xi[0]));
Chris@82 428 T8 = LD(&(xi[WS(is, 12)]), ivs, &(xi[0]));
Chris@82 429 T9 = VSUB(T7, T8);
Chris@82 430 T1V = VADD(T7, T8);
Chris@82 431 }
Chris@82 432 T1T = VADD(T1R, T1S);
Chris@82 433 T1W = VADD(T1U, T1V);
Chris@82 434 T2K = VSUB(T1V, T1U);
Chris@82 435 T2x = VSUB(T1R, T1S);
Chris@82 436 T12 = VMUL(LDK(KP707106781), VSUB(T9, T6));
Chris@82 437 T16 = VSUB(T12, T15);
Chris@82 438 T1A = VADD(T15, T12);
Chris@82 439 Ta = VMUL(LDK(KP707106781), VADD(T6, T9));
Chris@82 440 Tb = VADD(T3, Ta);
Chris@82 441 T1p = VSUB(T3, Ta);
Chris@82 442 }
Chris@82 443 {
Chris@82 444 V TL, T25, TX, T26, TO, T28, TR, T29;
Chris@82 445 {
Chris@82 446 V TJ, TK, TV, TW;
Chris@82 447 TJ = LD(&(xi[WS(is, 31)]), ivs, &(xi[WS(is, 1)]));
Chris@82 448 TK = LD(&(xi[WS(is, 15)]), ivs, &(xi[WS(is, 1)]));
Chris@82 449 TL = VSUB(TJ, TK);
Chris@82 450 T25 = VADD(TJ, TK);
Chris@82 451 TV = LD(&(xi[WS(is, 7)]), ivs, &(xi[WS(is, 1)]));
Chris@82 452 TW = LD(&(xi[WS(is, 23)]), ivs, &(xi[WS(is, 1)]));
Chris@82 453 TX = VSUB(TV, TW);
Chris@82 454 T26 = VADD(TV, TW);
Chris@82 455 }
Chris@82 456 {
Chris@82 457 V TM, TN, TP, TQ;
Chris@82 458 TM = LD(&(xi[WS(is, 3)]), ivs, &(xi[WS(is, 1)]));
Chris@82 459 TN = LD(&(xi[WS(is, 19)]), ivs, &(xi[WS(is, 1)]));
Chris@82 460 TO = VSUB(TM, TN);
Chris@82 461 T28 = VADD(TM, TN);
Chris@82 462 TP = LD(&(xi[WS(is, 27)]), ivs, &(xi[WS(is, 1)]));
Chris@82 463 TQ = LD(&(xi[WS(is, 11)]), ivs, &(xi[WS(is, 1)]));
Chris@82 464 TR = VSUB(TP, TQ);
Chris@82 465 T29 = VADD(TP, TQ);
Chris@82 466 }
Chris@82 467 {
Chris@82 468 V TS, TU, T2F, T2G;
Chris@82 469 TS = VMUL(LDK(KP707106781), VADD(TO, TR));
Chris@82 470 TT = VADD(TL, TS);
Chris@82 471 T1v = VSUB(TL, TS);
Chris@82 472 TU = VMUL(LDK(KP707106781), VSUB(TR, TO));
Chris@82 473 TY = VSUB(TU, TX);
Chris@82 474 T1w = VADD(TX, TU);
Chris@82 475 T27 = VADD(T25, T26);
Chris@82 476 T2a = VADD(T28, T29);
Chris@82 477 T2b = VSUB(T27, T2a);
Chris@82 478 T2F = VSUB(T25, T26);
Chris@82 479 T2G = VSUB(T29, T28);
Chris@82 480 T2H = VFNMS(LDK(KP382683432), T2G, VMUL(LDK(KP923879532), T2F));
Chris@82 481 T2O = VFMA(LDK(KP382683432), T2F, VMUL(LDK(KP923879532), T2G));
Chris@82 482 }
Chris@82 483 }
Chris@82 484 {
Chris@82 485 V Tu, T1Y, TG, T1Z, Tx, T21, TA, T22;
Chris@82 486 {
Chris@82 487 V Ts, Tt, TE, TF;
Chris@82 488 Ts = LD(&(xi[WS(is, 1)]), ivs, &(xi[WS(is, 1)]));
Chris@82 489 Tt = LD(&(xi[WS(is, 17)]), ivs, &(xi[WS(is, 1)]));
Chris@82 490 Tu = VSUB(Ts, Tt);
Chris@82 491 T1Y = VADD(Ts, Tt);
Chris@82 492 TE = LD(&(xi[WS(is, 9)]), ivs, &(xi[WS(is, 1)]));
Chris@82 493 TF = LD(&(xi[WS(is, 25)]), ivs, &(xi[WS(is, 1)]));
Chris@82 494 TG = VSUB(TE, TF);
Chris@82 495 T1Z = VADD(TE, TF);
Chris@82 496 }
Chris@82 497 {
Chris@82 498 V Tv, Tw, Ty, Tz;
Chris@82 499 Tv = LD(&(xi[WS(is, 5)]), ivs, &(xi[WS(is, 1)]));
Chris@82 500 Tw = LD(&(xi[WS(is, 21)]), ivs, &(xi[WS(is, 1)]));
Chris@82 501 Tx = VSUB(Tv, Tw);
Chris@82 502 T21 = VADD(Tv, Tw);
Chris@82 503 Ty = LD(&(xi[WS(is, 29)]), ivs, &(xi[WS(is, 1)]));
Chris@82 504 Tz = LD(&(xi[WS(is, 13)]), ivs, &(xi[WS(is, 1)]));
Chris@82 505 TA = VSUB(Ty, Tz);
Chris@82 506 T22 = VADD(Ty, Tz);
Chris@82 507 }
Chris@82 508 {
Chris@82 509 V TB, TD, T2C, T2D;
Chris@82 510 TB = VMUL(LDK(KP707106781), VADD(Tx, TA));
Chris@82 511 TC = VADD(Tu, TB);
Chris@82 512 T1s = VSUB(Tu, TB);
Chris@82 513 TD = VMUL(LDK(KP707106781), VSUB(TA, Tx));
Chris@82 514 TH = VSUB(TD, TG);
Chris@82 515 T1t = VADD(TG, TD);
Chris@82 516 T20 = VADD(T1Y, T1Z);
Chris@82 517 T23 = VADD(T21, T22);
Chris@82 518 T24 = VSUB(T20, T23);
Chris@82 519 T2C = VSUB(T1Y, T1Z);
Chris@82 520 T2D = VSUB(T22, T21);
Chris@82 521 T2E = VFMA(LDK(KP923879532), T2C, VMUL(LDK(KP382683432), T2D));
Chris@82 522 T2N = VFNMS(LDK(KP382683432), T2C, VMUL(LDK(KP923879532), T2D));
Chris@82 523 }
Chris@82 524 }
Chris@82 525 {
Chris@82 526 V Te, T2h, To, T2f, Th, T2i, Tl, T2e, Ti, Tp;
Chris@82 527 {
Chris@82 528 V Tc, Td, Tm, Tn;
Chris@82 529 Tc = LD(&(xi[WS(is, 2)]), ivs, &(xi[0]));
Chris@82 530 Td = LD(&(xi[WS(is, 18)]), ivs, &(xi[0]));
Chris@82 531 Te = VSUB(Tc, Td);
Chris@82 532 T2h = VADD(Tc, Td);
Chris@82 533 Tm = LD(&(xi[WS(is, 6)]), ivs, &(xi[0]));
Chris@82 534 Tn = LD(&(xi[WS(is, 22)]), ivs, &(xi[0]));
Chris@82 535 To = VSUB(Tm, Tn);
Chris@82 536 T2f = VADD(Tm, Tn);
Chris@82 537 }
Chris@82 538 {
Chris@82 539 V Tf, Tg, Tj, Tk;
Chris@82 540 Tf = LD(&(xi[WS(is, 10)]), ivs, &(xi[0]));
Chris@82 541 Tg = LD(&(xi[WS(is, 26)]), ivs, &(xi[0]));
Chris@82 542 Th = VSUB(Tf, Tg);
Chris@82 543 T2i = VADD(Tf, Tg);
Chris@82 544 Tj = LD(&(xi[WS(is, 30)]), ivs, &(xi[0]));
Chris@82 545 Tk = LD(&(xi[WS(is, 14)]), ivs, &(xi[0]));
Chris@82 546 Tl = VSUB(Tj, Tk);
Chris@82 547 T2e = VADD(Tj, Tk);
Chris@82 548 }
Chris@82 549 T2g = VADD(T2e, T2f);
Chris@82 550 T2j = VADD(T2h, T2i);
Chris@82 551 Ti = VFNMS(LDK(KP382683432), Th, VMUL(LDK(KP923879532), Te));
Chris@82 552 Tp = VFMA(LDK(KP923879532), Tl, VMUL(LDK(KP382683432), To));
Chris@82 553 Tq = VADD(Ti, Tp);
Chris@82 554 T1B = VSUB(Tp, Ti);
Chris@82 555 {
Chris@82 556 V T17, T18, T2y, T2z;
Chris@82 557 T17 = VFNMS(LDK(KP923879532), To, VMUL(LDK(KP382683432), Tl));
Chris@82 558 T18 = VFMA(LDK(KP382683432), Te, VMUL(LDK(KP923879532), Th));
Chris@82 559 T19 = VSUB(T17, T18);
Chris@82 560 T1q = VADD(T18, T17);
Chris@82 561 T2y = VSUB(T2h, T2i);
Chris@82 562 T2z = VSUB(T2e, T2f);
Chris@82 563 T2A = VMUL(LDK(KP707106781), VADD(T2y, T2z));
Chris@82 564 T2L = VMUL(LDK(KP707106781), VSUB(T2z, T2y));
Chris@82 565 }
Chris@82 566 }
Chris@82 567 {
Chris@82 568 V T2d, T2n, T2m, T2o;
Chris@82 569 {
Chris@82 570 V T1X, T2c, T2k, T2l;
Chris@82 571 T1X = VSUB(T1T, T1W);
Chris@82 572 T2c = VMUL(LDK(KP707106781), VADD(T24, T2b));
Chris@82 573 T2d = VADD(T1X, T2c);
Chris@82 574 T2n = VSUB(T1X, T2c);
Chris@82 575 T2k = VSUB(T2g, T2j);
Chris@82 576 T2l = VMUL(LDK(KP707106781), VSUB(T2b, T24));
Chris@82 577 T2m = VBYI(VADD(T2k, T2l));
Chris@82 578 T2o = VBYI(VSUB(T2l, T2k));
Chris@82 579 }
Chris@82 580 ST(&(xo[WS(os, 28)]), VSUB(T2d, T2m), ovs, &(xo[0]));
Chris@82 581 ST(&(xo[WS(os, 12)]), VADD(T2n, T2o), ovs, &(xo[0]));
Chris@82 582 ST(&(xo[WS(os, 4)]), VADD(T2d, T2m), ovs, &(xo[0]));
Chris@82 583 ST(&(xo[WS(os, 20)]), VSUB(T2n, T2o), ovs, &(xo[0]));
Chris@82 584 }
Chris@82 585 {
Chris@82 586 V T2r, T2v, T2u, T2w;
Chris@82 587 {
Chris@82 588 V T2p, T2q, T2s, T2t;
Chris@82 589 T2p = VADD(T1T, T1W);
Chris@82 590 T2q = VADD(T2j, T2g);
Chris@82 591 T2r = VADD(T2p, T2q);
Chris@82 592 T2v = VSUB(T2p, T2q);
Chris@82 593 T2s = VADD(T20, T23);
Chris@82 594 T2t = VADD(T27, T2a);
Chris@82 595 T2u = VADD(T2s, T2t);
Chris@82 596 T2w = VBYI(VSUB(T2t, T2s));
Chris@82 597 }
Chris@82 598 ST(&(xo[WS(os, 16)]), VSUB(T2r, T2u), ovs, &(xo[0]));
Chris@82 599 ST(&(xo[WS(os, 8)]), VADD(T2v, T2w), ovs, &(xo[0]));
Chris@82 600 ST(&(xo[0]), VADD(T2r, T2u), ovs, &(xo[0]));
Chris@82 601 ST(&(xo[WS(os, 24)]), VSUB(T2v, T2w), ovs, &(xo[0]));
Chris@82 602 }
Chris@82 603 {
Chris@82 604 V T2V, T2Z, T2Y, T30;
Chris@82 605 {
Chris@82 606 V T2T, T2U, T2W, T2X;
Chris@82 607 T2T = VSUB(T2H, T2E);
Chris@82 608 T2U = VSUB(T2L, T2K);
Chris@82 609 T2V = VBYI(VSUB(T2T, T2U));
Chris@82 610 T2Z = VBYI(VADD(T2U, T2T));
Chris@82 611 T2W = VSUB(T2x, T2A);
Chris@82 612 T2X = VSUB(T2O, T2N);
Chris@82 613 T2Y = VSUB(T2W, T2X);
Chris@82 614 T30 = VADD(T2W, T2X);
Chris@82 615 }
Chris@82 616 ST(&(xo[WS(os, 10)]), VADD(T2V, T2Y), ovs, &(xo[0]));
Chris@82 617 ST(&(xo[WS(os, 26)]), VSUB(T30, T2Z), ovs, &(xo[0]));
Chris@82 618 ST(&(xo[WS(os, 22)]), VSUB(T2Y, T2V), ovs, &(xo[0]));
Chris@82 619 ST(&(xo[WS(os, 6)]), VADD(T2Z, T30), ovs, &(xo[0]));
Chris@82 620 }
Chris@82 621 {
Chris@82 622 V T2J, T2R, T2Q, T2S;
Chris@82 623 {
Chris@82 624 V T2B, T2I, T2M, T2P;
Chris@82 625 T2B = VADD(T2x, T2A);
Chris@82 626 T2I = VADD(T2E, T2H);
Chris@82 627 T2J = VADD(T2B, T2I);
Chris@82 628 T2R = VSUB(T2B, T2I);
Chris@82 629 T2M = VADD(T2K, T2L);
Chris@82 630 T2P = VADD(T2N, T2O);
Chris@82 631 T2Q = VBYI(VADD(T2M, T2P));
Chris@82 632 T2S = VBYI(VSUB(T2P, T2M));
Chris@82 633 }
Chris@82 634 ST(&(xo[WS(os, 30)]), VSUB(T2J, T2Q), ovs, &(xo[0]));
Chris@82 635 ST(&(xo[WS(os, 14)]), VADD(T2R, T2S), ovs, &(xo[0]));
Chris@82 636 ST(&(xo[WS(os, 2)]), VADD(T2J, T2Q), ovs, &(xo[0]));
Chris@82 637 ST(&(xo[WS(os, 18)]), VSUB(T2R, T2S), ovs, &(xo[0]));
Chris@82 638 }
Chris@82 639 {
Chris@82 640 V T1r, T1C, T1M, T1K, T1F, T1N, T1y, T1J;
Chris@82 641 T1r = VADD(T1p, T1q);
Chris@82 642 T1C = VADD(T1A, T1B);
Chris@82 643 T1M = VSUB(T1p, T1q);
Chris@82 644 T1K = VSUB(T1B, T1A);
Chris@82 645 {
Chris@82 646 V T1D, T1E, T1u, T1x;
Chris@82 647 T1D = VFNMS(LDK(KP555570233), T1s, VMUL(LDK(KP831469612), T1t));
Chris@82 648 T1E = VFMA(LDK(KP555570233), T1v, VMUL(LDK(KP831469612), T1w));
Chris@82 649 T1F = VADD(T1D, T1E);
Chris@82 650 T1N = VSUB(T1E, T1D);
Chris@82 651 T1u = VFMA(LDK(KP831469612), T1s, VMUL(LDK(KP555570233), T1t));
Chris@82 652 T1x = VFNMS(LDK(KP555570233), T1w, VMUL(LDK(KP831469612), T1v));
Chris@82 653 T1y = VADD(T1u, T1x);
Chris@82 654 T1J = VSUB(T1x, T1u);
Chris@82 655 }
Chris@82 656 {
Chris@82 657 V T1z, T1G, T1P, T1Q;
Chris@82 658 T1z = VADD(T1r, T1y);
Chris@82 659 T1G = VBYI(VADD(T1C, T1F));
Chris@82 660 ST(&(xo[WS(os, 29)]), VSUB(T1z, T1G), ovs, &(xo[WS(os, 1)]));
Chris@82 661 ST(&(xo[WS(os, 3)]), VADD(T1z, T1G), ovs, &(xo[WS(os, 1)]));
Chris@82 662 T1P = VBYI(VADD(T1K, T1J));
Chris@82 663 T1Q = VADD(T1M, T1N);
Chris@82 664 ST(&(xo[WS(os, 5)]), VADD(T1P, T1Q), ovs, &(xo[WS(os, 1)]));
Chris@82 665 ST(&(xo[WS(os, 27)]), VSUB(T1Q, T1P), ovs, &(xo[WS(os, 1)]));
Chris@82 666 }
Chris@82 667 {
Chris@82 668 V T1H, T1I, T1L, T1O;
Chris@82 669 T1H = VSUB(T1r, T1y);
Chris@82 670 T1I = VBYI(VSUB(T1F, T1C));
Chris@82 671 ST(&(xo[WS(os, 19)]), VSUB(T1H, T1I), ovs, &(xo[WS(os, 1)]));
Chris@82 672 ST(&(xo[WS(os, 13)]), VADD(T1H, T1I), ovs, &(xo[WS(os, 1)]));
Chris@82 673 T1L = VBYI(VSUB(T1J, T1K));
Chris@82 674 T1O = VSUB(T1M, T1N);
Chris@82 675 ST(&(xo[WS(os, 11)]), VADD(T1L, T1O), ovs, &(xo[WS(os, 1)]));
Chris@82 676 ST(&(xo[WS(os, 21)]), VSUB(T1O, T1L), ovs, &(xo[WS(os, 1)]));
Chris@82 677 }
Chris@82 678 }
Chris@82 679 {
Chris@82 680 V Tr, T1a, T1k, T1i, T1d, T1l, T10, T1h;
Chris@82 681 Tr = VADD(Tb, Tq);
Chris@82 682 T1a = VADD(T16, T19);
Chris@82 683 T1k = VSUB(Tb, Tq);
Chris@82 684 T1i = VSUB(T19, T16);
Chris@82 685 {
Chris@82 686 V T1b, T1c, TI, TZ;
Chris@82 687 T1b = VFNMS(LDK(KP195090322), TC, VMUL(LDK(KP980785280), TH));
Chris@82 688 T1c = VFMA(LDK(KP195090322), TT, VMUL(LDK(KP980785280), TY));
Chris@82 689 T1d = VADD(T1b, T1c);
Chris@82 690 T1l = VSUB(T1c, T1b);
Chris@82 691 TI = VFMA(LDK(KP980785280), TC, VMUL(LDK(KP195090322), TH));
Chris@82 692 TZ = VFNMS(LDK(KP195090322), TY, VMUL(LDK(KP980785280), TT));
Chris@82 693 T10 = VADD(TI, TZ);
Chris@82 694 T1h = VSUB(TZ, TI);
Chris@82 695 }
Chris@82 696 {
Chris@82 697 V T11, T1e, T1n, T1o;
Chris@82 698 T11 = VADD(Tr, T10);
Chris@82 699 T1e = VBYI(VADD(T1a, T1d));
Chris@82 700 ST(&(xo[WS(os, 31)]), VSUB(T11, T1e), ovs, &(xo[WS(os, 1)]));
Chris@82 701 ST(&(xo[WS(os, 1)]), VADD(T11, T1e), ovs, &(xo[WS(os, 1)]));
Chris@82 702 T1n = VBYI(VADD(T1i, T1h));
Chris@82 703 T1o = VADD(T1k, T1l);
Chris@82 704 ST(&(xo[WS(os, 7)]), VADD(T1n, T1o), ovs, &(xo[WS(os, 1)]));
Chris@82 705 ST(&(xo[WS(os, 25)]), VSUB(T1o, T1n), ovs, &(xo[WS(os, 1)]));
Chris@82 706 }
Chris@82 707 {
Chris@82 708 V T1f, T1g, T1j, T1m;
Chris@82 709 T1f = VSUB(Tr, T10);
Chris@82 710 T1g = VBYI(VSUB(T1d, T1a));
Chris@82 711 ST(&(xo[WS(os, 17)]), VSUB(T1f, T1g), ovs, &(xo[WS(os, 1)]));
Chris@82 712 ST(&(xo[WS(os, 15)]), VADD(T1f, T1g), ovs, &(xo[WS(os, 1)]));
Chris@82 713 T1j = VBYI(VSUB(T1h, T1i));
Chris@82 714 T1m = VSUB(T1k, T1l);
Chris@82 715 ST(&(xo[WS(os, 9)]), VADD(T1j, T1m), ovs, &(xo[WS(os, 1)]));
Chris@82 716 ST(&(xo[WS(os, 23)]), VSUB(T1m, T1j), ovs, &(xo[WS(os, 1)]));
Chris@82 717 }
Chris@82 718 }
Chris@82 719 }
Chris@82 720 }
Chris@82 721 VLEAVE();
Chris@82 722 }
Chris@82 723
Chris@82 724 static const kdft_desc desc = { 32, XSIMD_STRING("n1fv_32"), {170, 26, 16, 0}, &GENUS, 0, 0, 0, 0 };
Chris@82 725
Chris@82 726 void XSIMD(codelet_n1fv_32) (planner *p) {
Chris@82 727 X(kdft_register) (p, n1fv_32, &desc);
Chris@82 728 }
Chris@82 729
Chris@82 730 #endif
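
For reference, FFTW's forward transform follows the negative-exponent convention, out[k] = sum over j of in[j] * exp(-2*pi*i*j*k/n). The sketch below is a hypothetical standalone check, not part of FFTW or of this codelet: a direct O(n^2) 32-point complex DFT in plain C99 that can be used to sanity-check the output of any n = 32 forward codelet such as n1fv_32.

/* dft32_ref.c -- illustrative only; build with: cc -std=c99 dft32_ref.c -lm */
#include <complex.h>
#include <math.h>
#include <stdio.h>

#define N 32
static const double TWO_PI = 6.283185307179586476925286766559;

/* Direct O(N^2) forward DFT, using FFTW's sign convention (negative exponent). */
static void dft32_ref(const double complex *in, double complex *out)
{
    for (int k = 0; k < N; ++k) {
        double complex acc = 0.0;
        for (int j = 0; j < N; ++j)
            acc += in[j] * cexp(-I * TWO_PI * (double)(j * k) / (double)N);
        out[k] = acc;
    }
}

int main(void)
{
    double complex in[N], out[N];
    for (int j = 0; j < N; ++j)          /* a pure real tone in bin 3 */
        in[j] = cos(TWO_PI * 3.0 * (double)j / (double)N);
    dft32_ref(in, out);
    for (int k = 0; k < N; ++k)          /* expect ~16 in bins 3 and 29, ~0 elsewhere */
        printf("%2d  % .6f  % .6f\n", k, creal(out[k]), cimag(out[k]));
    return 0;
}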