annotate src/fftw-3.3.8/dft/simd/common/n1bv_32.c @ 83:ae30d91d2ffe

Replace these with versions built using an older toolset (so as to avoid ABI incompatibilities when linking on Ubuntu 14.04 for packaging purposes)
author Chris Cannam
date Fri, 07 Feb 2020 11:51:13 +0000
parents d0c2a83c1364
children
rev   line source
Chris@82 1 /*
Chris@82 2 * Copyright (c) 2003, 2007-14 Matteo Frigo
Chris@82 3 * Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
Chris@82 4 *
Chris@82 5 * This program is free software; you can redistribute it and/or modify
Chris@82 6 * it under the terms of the GNU General Public License as published by
Chris@82 7 * the Free Software Foundation; either version 2 of the License, or
Chris@82 8 * (at your option) any later version.
Chris@82 9 *
Chris@82 10 * This program is distributed in the hope that it will be useful,
Chris@82 11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
Chris@82 12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
Chris@82 13 * GNU General Public License for more details.
Chris@82 14 *
Chris@82 15 * You should have received a copy of the GNU General Public License
Chris@82 16 * along with this program; if not, write to the Free Software
Chris@82 17 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
Chris@82 18 *
Chris@82 19 */
Chris@82 20
Chris@82 21 /* This file was automatically generated --- DO NOT EDIT */
Chris@82 22 /* Generated on Thu May 24 08:04:58 EDT 2018 */
Chris@82 23
Chris@82 24 #include "dft/codelet-dft.h"
Chris@82 25
Chris@82 26 #if defined(ARCH_PREFERS_FMA) || defined(ISA_EXTENSION_PREFERS_FMA)
Chris@82 27
Chris@82 28 /* Generated by: ../../../genfft/gen_notw_c.native -fma -simd -compact -variables 4 -pipeline-latency 8 -sign 1 -n 32 -name n1bv_32 -include dft/simd/n1b.h */
Chris@82 29
Chris@82 30 /*
Chris@82 31 * This function contains 186 FP additions, 98 FP multiplications,
Chris@82 32 * (or, 88 additions, 0 multiplications, 98 fused multiply/add),
Chris@82 33 * 58 stack variables, 7 constants, and 64 memory accesses
Chris@82 34 */
Chris@82 35 #include "dft/simd/n1b.h"
Chris@82 36
Chris@82 37 static void n1bv_32(const R *ri, const R *ii, R *ro, R *io, stride is, stride os, INT v, INT ivs, INT ovs)
Chris@82 38 {
Chris@82 39 DVK(KP980785280, +0.980785280403230449126182236134239036973933731);
Chris@82 40 DVK(KP198912367, +0.198912367379658006911597622644676228597850501);
Chris@82 41 DVK(KP831469612, +0.831469612302545237078788377617905756738560812);
Chris@82 42 DVK(KP668178637, +0.668178637919298919997757686523080761552472251);
Chris@82 43 DVK(KP923879532, +0.923879532511286756128183189396788286822416626);
Chris@82 44 DVK(KP414213562, +0.414213562373095048801688724209698078569671875);
Chris@82 45 DVK(KP707106781, +0.707106781186547524400844362104849039284835938);
Chris@82 46 {
Chris@82 47 INT i;
Chris@82 48 const R *xi;
Chris@82 49 R *xo;
Chris@82 50 xi = ii;
Chris@82 51 xo = io;
Chris@82 52 for (i = v; i > 0; i = i - VL, xi = xi + (VL * ivs), xo = xo + (VL * ovs), MAKE_VOLATILE_STRIDE(64, is), MAKE_VOLATILE_STRIDE(64, os)) {
Chris@82 53 V T1T, T1W, T2K, T2x, T16, T1A, Tb, T1p, TT, T1v, TY, T1w, T27, T2a, T2b;
Chris@82 54 V T2H, T2O, TC, T1s, TH, T1t, T20, T23, T24, T2E, T2N, T2g, T2j, Tq, T1B;
Chris@82 55 V T19, T1q, T2A, T2L;
Chris@82 56 {
Chris@82 57 V T3, T1R, T14, T1S, T6, T1U, T9, T1V, T15, Ta;
Chris@82 58 {
Chris@82 59 V T1, T2, T12, T13;
Chris@82 60 T1 = LD(&(xi[0]), ivs, &(xi[0]));
Chris@82 61 T2 = LD(&(xi[WS(is, 16)]), ivs, &(xi[0]));
Chris@82 62 T3 = VSUB(T1, T2);
Chris@82 63 T1R = VADD(T1, T2);
Chris@82 64 T12 = LD(&(xi[WS(is, 8)]), ivs, &(xi[0]));
Chris@82 65 T13 = LD(&(xi[WS(is, 24)]), ivs, &(xi[0]));
Chris@82 66 T14 = VSUB(T12, T13);
Chris@82 67 T1S = VADD(T12, T13);
Chris@82 68 }
Chris@82 69 {
Chris@82 70 V T4, T5, T7, T8;
Chris@82 71 T4 = LD(&(xi[WS(is, 4)]), ivs, &(xi[0]));
Chris@82 72 T5 = LD(&(xi[WS(is, 20)]), ivs, &(xi[0]));
Chris@82 73 T6 = VSUB(T4, T5);
Chris@82 74 T1U = VADD(T4, T5);
Chris@82 75 T7 = LD(&(xi[WS(is, 28)]), ivs, &(xi[0]));
Chris@82 76 T8 = LD(&(xi[WS(is, 12)]), ivs, &(xi[0]));
Chris@82 77 T9 = VSUB(T7, T8);
Chris@82 78 T1V = VADD(T7, T8);
Chris@82 79 }
Chris@82 80 T1T = VADD(T1R, T1S);
Chris@82 81 T1W = VADD(T1U, T1V);
Chris@82 82 T2K = VSUB(T1U, T1V);
Chris@82 83 T2x = VSUB(T1R, T1S);
Chris@82 84 T15 = VSUB(T6, T9);
Chris@82 85 T16 = VFMA(LDK(KP707106781), T15, T14);
Chris@82 86 T1A = VFNMS(LDK(KP707106781), T15, T14);
Chris@82 87 Ta = VADD(T6, T9);
Chris@82 88 Tb = VFMA(LDK(KP707106781), Ta, T3);
Chris@82 89 T1p = VFNMS(LDK(KP707106781), Ta, T3);
Chris@82 90 }
Chris@82 91 {
Chris@82 92 V TL, T25, TW, T26, TO, T28, TR, T29;
Chris@82 93 {
Chris@82 94 V TJ, TK, TU, TV;
Chris@82 95 TJ = LD(&(xi[WS(is, 31)]), ivs, &(xi[WS(is, 1)]));
Chris@82 96 TK = LD(&(xi[WS(is, 15)]), ivs, &(xi[WS(is, 1)]));
Chris@82 97 TL = VSUB(TJ, TK);
Chris@82 98 T25 = VADD(TJ, TK);
Chris@82 99 TU = LD(&(xi[WS(is, 23)]), ivs, &(xi[WS(is, 1)]));
Chris@82 100 TV = LD(&(xi[WS(is, 7)]), ivs, &(xi[WS(is, 1)]));
Chris@82 101 TW = VSUB(TU, TV);
Chris@82 102 T26 = VADD(TV, TU);
Chris@82 103 }
Chris@82 104 {
Chris@82 105 V TM, TN, TP, TQ;
Chris@82 106 TM = LD(&(xi[WS(is, 3)]), ivs, &(xi[WS(is, 1)]));
Chris@82 107 TN = LD(&(xi[WS(is, 19)]), ivs, &(xi[WS(is, 1)]));
Chris@82 108 TO = VSUB(TM, TN);
Chris@82 109 T28 = VADD(TM, TN);
Chris@82 110 TP = LD(&(xi[WS(is, 27)]), ivs, &(xi[WS(is, 1)]));
Chris@82 111 TQ = LD(&(xi[WS(is, 11)]), ivs, &(xi[WS(is, 1)]));
Chris@82 112 TR = VSUB(TP, TQ);
Chris@82 113 T29 = VADD(TP, TQ);
Chris@82 114 }
Chris@82 115 {
Chris@82 116 V TS, TX, T2F, T2G;
Chris@82 117 TS = VADD(TO, TR);
Chris@82 118 TT = VFMA(LDK(KP707106781), TS, TL);
Chris@82 119 T1v = VFNMS(LDK(KP707106781), TS, TL);
Chris@82 120 TX = VSUB(TR, TO);
Chris@82 121 TY = VFMA(LDK(KP707106781), TX, TW);
Chris@82 122 T1w = VFNMS(LDK(KP707106781), TX, TW);
Chris@82 123 T27 = VADD(T25, T26);
Chris@82 124 T2a = VADD(T28, T29);
Chris@82 125 T2b = VSUB(T27, T2a);
Chris@82 126 T2F = VSUB(T25, T26);
Chris@82 127 T2G = VSUB(T29, T28);
Chris@82 128 T2H = VFNMS(LDK(KP414213562), T2G, T2F);
Chris@82 129 T2O = VFMA(LDK(KP414213562), T2F, T2G);
Chris@82 130 }
Chris@82 131 }
Chris@82 132 {
Chris@82 133 V Tu, T1Y, TF, T1Z, Tx, T21, TA, T22;
Chris@82 134 {
Chris@82 135 V Ts, Tt, TD, TE;
Chris@82 136 Ts = LD(&(xi[WS(is, 1)]), ivs, &(xi[WS(is, 1)]));
Chris@82 137 Tt = LD(&(xi[WS(is, 17)]), ivs, &(xi[WS(is, 1)]));
Chris@82 138 Tu = VSUB(Ts, Tt);
Chris@82 139 T1Y = VADD(Ts, Tt);
Chris@82 140 TD = LD(&(xi[WS(is, 9)]), ivs, &(xi[WS(is, 1)]));
Chris@82 141 TE = LD(&(xi[WS(is, 25)]), ivs, &(xi[WS(is, 1)]));
Chris@82 142 TF = VSUB(TD, TE);
Chris@82 143 T1Z = VADD(TD, TE);
Chris@82 144 }
Chris@82 145 {
Chris@82 146 V Tv, Tw, Ty, Tz;
Chris@82 147 Tv = LD(&(xi[WS(is, 5)]), ivs, &(xi[WS(is, 1)]));
Chris@82 148 Tw = LD(&(xi[WS(is, 21)]), ivs, &(xi[WS(is, 1)]));
Chris@82 149 Tx = VSUB(Tv, Tw);
Chris@82 150 T21 = VADD(Tv, Tw);
Chris@82 151 Ty = LD(&(xi[WS(is, 29)]), ivs, &(xi[WS(is, 1)]));
Chris@82 152 Tz = LD(&(xi[WS(is, 13)]), ivs, &(xi[WS(is, 1)]));
Chris@82 153 TA = VSUB(Ty, Tz);
Chris@82 154 T22 = VADD(Ty, Tz);
Chris@82 155 }
Chris@82 156 {
Chris@82 157 V TB, TG, T2C, T2D;
Chris@82 158 TB = VADD(Tx, TA);
Chris@82 159 TC = VFMA(LDK(KP707106781), TB, Tu);
Chris@82 160 T1s = VFNMS(LDK(KP707106781), TB, Tu);
Chris@82 161 TG = VSUB(Tx, TA);
Chris@82 162 TH = VFMA(LDK(KP707106781), TG, TF);
Chris@82 163 T1t = VFNMS(LDK(KP707106781), TG, TF);
Chris@82 164 T20 = VADD(T1Y, T1Z);
Chris@82 165 T23 = VADD(T21, T22);
Chris@82 166 T24 = VSUB(T20, T23);
Chris@82 167 T2C = VSUB(T1Y, T1Z);
Chris@82 168 T2D = VSUB(T21, T22);
Chris@82 169 T2E = VFNMS(LDK(KP414213562), T2D, T2C);
Chris@82 170 T2N = VFMA(LDK(KP414213562), T2C, T2D);
Chris@82 171 }
Chris@82 172 }
Chris@82 173 {
Chris@82 174 V Te, T2e, To, T2i, Th, T2f, Tl, T2h, Ti, Tp;
Chris@82 175 {
Chris@82 176 V Tc, Td, Tm, Tn;
Chris@82 177 Tc = LD(&(xi[WS(is, 2)]), ivs, &(xi[0]));
Chris@82 178 Td = LD(&(xi[WS(is, 18)]), ivs, &(xi[0]));
Chris@82 179 Te = VSUB(Tc, Td);
Chris@82 180 T2e = VADD(Tc, Td);
Chris@82 181 Tm = LD(&(xi[WS(is, 22)]), ivs, &(xi[0]));
Chris@82 182 Tn = LD(&(xi[WS(is, 6)]), ivs, &(xi[0]));
Chris@82 183 To = VSUB(Tm, Tn);
Chris@82 184 T2i = VADD(Tn, Tm);
Chris@82 185 }
Chris@82 186 {
Chris@82 187 V Tf, Tg, Tj, Tk;
Chris@82 188 Tf = LD(&(xi[WS(is, 10)]), ivs, &(xi[0]));
Chris@82 189 Tg = LD(&(xi[WS(is, 26)]), ivs, &(xi[0]));
Chris@82 190 Th = VSUB(Tf, Tg);
Chris@82 191 T2f = VADD(Tf, Tg);
Chris@82 192 Tj = LD(&(xi[WS(is, 30)]), ivs, &(xi[0]));
Chris@82 193 Tk = LD(&(xi[WS(is, 14)]), ivs, &(xi[0]));
Chris@82 194 Tl = VSUB(Tj, Tk);
Chris@82 195 T2h = VADD(Tj, Tk);
Chris@82 196 }
Chris@82 197 T2g = VADD(T2e, T2f);
Chris@82 198 T2j = VADD(T2h, T2i);
Chris@82 199 Ti = VFNMS(LDK(KP414213562), Th, Te);
Chris@82 200 Tp = VFNMS(LDK(KP414213562), To, Tl);
Chris@82 201 Tq = VADD(Ti, Tp);
Chris@82 202 T1B = VSUB(Ti, Tp);
Chris@82 203 {
Chris@82 204 V T17, T18, T2y, T2z;
Chris@82 205 T17 = VFMA(LDK(KP414213562), Te, Th);
Chris@82 206 T18 = VFMA(LDK(KP414213562), Tl, To);
Chris@82 207 T19 = VSUB(T17, T18);
Chris@82 208 T1q = VADD(T17, T18);
Chris@82 209 T2y = VSUB(T2e, T2f);
Chris@82 210 T2z = VSUB(T2h, T2i);
Chris@82 211 T2A = VADD(T2y, T2z);
Chris@82 212 T2L = VSUB(T2y, T2z);
Chris@82 213 }
Chris@82 214 }
Chris@82 215 {
Chris@82 216 V T2d, T2n, T2m, T2o;
Chris@82 217 {
Chris@82 218 V T1X, T2c, T2k, T2l;
Chris@82 219 T1X = VSUB(T1T, T1W);
Chris@82 220 T2c = VADD(T24, T2b);
Chris@82 221 T2d = VFNMS(LDK(KP707106781), T2c, T1X);
Chris@82 222 T2n = VFMA(LDK(KP707106781), T2c, T1X);
Chris@82 223 T2k = VSUB(T2g, T2j);
Chris@82 224 T2l = VSUB(T24, T2b);
Chris@82 225 T2m = VFNMS(LDK(KP707106781), T2l, T2k);
Chris@82 226 T2o = VFMA(LDK(KP707106781), T2l, T2k);
Chris@82 227 }
Chris@82 228 ST(&(xo[WS(os, 12)]), VFNMSI(T2m, T2d), ovs, &(xo[0]));
Chris@82 229 ST(&(xo[WS(os, 28)]), VFNMSI(T2o, T2n), ovs, &(xo[0]));
Chris@82 230 ST(&(xo[WS(os, 20)]), VFMAI(T2m, T2d), ovs, &(xo[0]));
Chris@82 231 ST(&(xo[WS(os, 4)]), VFMAI(T2o, T2n), ovs, &(xo[0]));
Chris@82 232 }
Chris@82 233 {
Chris@82 234 V T2r, T2v, T2u, T2w;
Chris@82 235 {
Chris@82 236 V T2p, T2q, T2s, T2t;
Chris@82 237 T2p = VADD(T1T, T1W);
Chris@82 238 T2q = VADD(T2g, T2j);
Chris@82 239 T2r = VSUB(T2p, T2q);
Chris@82 240 T2v = VADD(T2p, T2q);
Chris@82 241 T2s = VADD(T20, T23);
Chris@82 242 T2t = VADD(T27, T2a);
Chris@82 243 T2u = VSUB(T2s, T2t);
Chris@82 244 T2w = VADD(T2s, T2t);
Chris@82 245 }
Chris@82 246 ST(&(xo[WS(os, 24)]), VFNMSI(T2u, T2r), ovs, &(xo[0]));
Chris@82 247 ST(&(xo[0]), VADD(T2v, T2w), ovs, &(xo[0]));
Chris@82 248 ST(&(xo[WS(os, 8)]), VFMAI(T2u, T2r), ovs, &(xo[0]));
Chris@82 249 ST(&(xo[WS(os, 16)]), VSUB(T2v, T2w), ovs, &(xo[0]));
Chris@82 250 }
Chris@82 251 {
Chris@82 252 V T2V, T2Z, T2Y, T30;
Chris@82 253 {
Chris@82 254 V T2T, T2U, T2W, T2X;
Chris@82 255 T2T = VFNMS(LDK(KP707106781), T2A, T2x);
Chris@82 256 T2U = VADD(T2N, T2O);
Chris@82 257 T2V = VFNMS(LDK(KP923879532), T2U, T2T);
Chris@82 258 T2Z = VFMA(LDK(KP923879532), T2U, T2T);
Chris@82 259 T2W = VFNMS(LDK(KP707106781), T2L, T2K);
Chris@82 260 T2X = VSUB(T2E, T2H);
Chris@82 261 T2Y = VFMA(LDK(KP923879532), T2X, T2W);
Chris@82 262 T30 = VFNMS(LDK(KP923879532), T2X, T2W);
Chris@82 263 }
Chris@82 264 ST(&(xo[WS(os, 10)]), VFMAI(T2Y, T2V), ovs, &(xo[0]));
Chris@82 265 ST(&(xo[WS(os, 26)]), VFMAI(T30, T2Z), ovs, &(xo[0]));
Chris@82 266 ST(&(xo[WS(os, 22)]), VFNMSI(T2Y, T2V), ovs, &(xo[0]));
Chris@82 267 ST(&(xo[WS(os, 6)]), VFNMSI(T30, T2Z), ovs, &(xo[0]));
Chris@82 268 }
Chris@82 269 {
Chris@82 270 V T2J, T2R, T2Q, T2S;
Chris@82 271 {
Chris@82 272 V T2B, T2I, T2M, T2P;
Chris@82 273 T2B = VFMA(LDK(KP707106781), T2A, T2x);
Chris@82 274 T2I = VADD(T2E, T2H);
Chris@82 275 T2J = VFNMS(LDK(KP923879532), T2I, T2B);
Chris@82 276 T2R = VFMA(LDK(KP923879532), T2I, T2B);
Chris@82 277 T2M = VFMA(LDK(KP707106781), T2L, T2K);
Chris@82 278 T2P = VSUB(T2N, T2O);
Chris@82 279 T2Q = VFNMS(LDK(KP923879532), T2P, T2M);
Chris@82 280 T2S = VFMA(LDK(KP923879532), T2P, T2M);
Chris@82 281 }
Chris@82 282 ST(&(xo[WS(os, 14)]), VFNMSI(T2Q, T2J), ovs, &(xo[0]));
Chris@82 283 ST(&(xo[WS(os, 2)]), VFMAI(T2S, T2R), ovs, &(xo[0]));
Chris@82 284 ST(&(xo[WS(os, 18)]), VFMAI(T2Q, T2J), ovs, &(xo[0]));
Chris@82 285 ST(&(xo[WS(os, 30)]), VFNMSI(T2S, T2R), ovs, &(xo[0]));
Chris@82 286 }
Chris@82 287 {
Chris@82 288 V T1r, T1C, T1M, T1J, T1F, T1K, T1y, T1N;
Chris@82 289 T1r = VFMA(LDK(KP923879532), T1q, T1p);
Chris@82 290 T1C = VFNMS(LDK(KP923879532), T1B, T1A);
Chris@82 291 T1M = VFMA(LDK(KP923879532), T1B, T1A);
Chris@82 292 T1J = VFNMS(LDK(KP923879532), T1q, T1p);
Chris@82 293 {
Chris@82 294 V T1D, T1E, T1u, T1x;
Chris@82 295 T1D = VFNMS(LDK(KP668178637), T1s, T1t);
Chris@82 296 T1E = VFNMS(LDK(KP668178637), T1v, T1w);
Chris@82 297 T1F = VSUB(T1D, T1E);
Chris@82 298 T1K = VADD(T1D, T1E);
Chris@82 299 T1u = VFMA(LDK(KP668178637), T1t, T1s);
Chris@82 300 T1x = VFMA(LDK(KP668178637), T1w, T1v);
Chris@82 301 T1y = VADD(T1u, T1x);
Chris@82 302 T1N = VSUB(T1u, T1x);
Chris@82 303 }
Chris@82 304 {
Chris@82 305 V T1z, T1G, T1P, T1Q;
Chris@82 306 T1z = VFNMS(LDK(KP831469612), T1y, T1r);
Chris@82 307 T1G = VFNMS(LDK(KP831469612), T1F, T1C);
Chris@82 308 ST(&(xo[WS(os, 19)]), VFNMSI(T1G, T1z), ovs, &(xo[WS(os, 1)]));
Chris@82 309 ST(&(xo[WS(os, 13)]), VFMAI(T1G, T1z), ovs, &(xo[WS(os, 1)]));
Chris@82 310 T1P = VFNMS(LDK(KP831469612), T1K, T1J);
Chris@82 311 T1Q = VFMA(LDK(KP831469612), T1N, T1M);
Chris@82 312 ST(&(xo[WS(os, 5)]), VFMAI(T1Q, T1P), ovs, &(xo[WS(os, 1)]));
Chris@82 313 ST(&(xo[WS(os, 27)]), VFNMSI(T1Q, T1P), ovs, &(xo[WS(os, 1)]));
Chris@82 314 }
Chris@82 315 {
Chris@82 316 V T1H, T1I, T1L, T1O;
Chris@82 317 T1H = VFMA(LDK(KP831469612), T1y, T1r);
Chris@82 318 T1I = VFMA(LDK(KP831469612), T1F, T1C);
Chris@82 319 ST(&(xo[WS(os, 3)]), VFNMSI(T1I, T1H), ovs, &(xo[WS(os, 1)]));
Chris@82 320 ST(&(xo[WS(os, 29)]), VFMAI(T1I, T1H), ovs, &(xo[WS(os, 1)]));
Chris@82 321 T1L = VFMA(LDK(KP831469612), T1K, T1J);
Chris@82 322 T1O = VFNMS(LDK(KP831469612), T1N, T1M);
Chris@82 323 ST(&(xo[WS(os, 11)]), VFNMSI(T1O, T1L), ovs, &(xo[WS(os, 1)]));
Chris@82 324 ST(&(xo[WS(os, 21)]), VFMAI(T1O, T1L), ovs, &(xo[WS(os, 1)]));
Chris@82 325 }
Chris@82 326 }
Chris@82 327 {
Chris@82 328 V Tr, T1a, T1k, T1h, T1d, T1i, T10, T1l;
Chris@82 329 Tr = VFMA(LDK(KP923879532), Tq, Tb);
Chris@82 330 T1a = VFMA(LDK(KP923879532), T19, T16);
Chris@82 331 T1k = VFNMS(LDK(KP923879532), T19, T16);
Chris@82 332 T1h = VFNMS(LDK(KP923879532), Tq, Tb);
Chris@82 333 {
Chris@82 334 V T1b, T1c, TI, TZ;
Chris@82 335 T1b = VFMA(LDK(KP198912367), TC, TH);
Chris@82 336 T1c = VFMA(LDK(KP198912367), TT, TY);
Chris@82 337 T1d = VSUB(T1b, T1c);
Chris@82 338 T1i = VADD(T1b, T1c);
Chris@82 339 TI = VFNMS(LDK(KP198912367), TH, TC);
Chris@82 340 TZ = VFNMS(LDK(KP198912367), TY, TT);
Chris@82 341 T10 = VADD(TI, TZ);
Chris@82 342 T1l = VSUB(TI, TZ);
Chris@82 343 }
Chris@82 344 {
Chris@82 345 V T11, T1e, T1n, T1o;
Chris@82 346 T11 = VFNMS(LDK(KP980785280), T10, Tr);
Chris@82 347 T1e = VFNMS(LDK(KP980785280), T1d, T1a);
Chris@82 348 ST(&(xo[WS(os, 15)]), VFNMSI(T1e, T11), ovs, &(xo[WS(os, 1)]));
Chris@82 349 ST(&(xo[WS(os, 17)]), VFMAI(T1e, T11), ovs, &(xo[WS(os, 1)]));
Chris@82 350 T1n = VFMA(LDK(KP980785280), T1i, T1h);
Chris@82 351 T1o = VFNMS(LDK(KP980785280), T1l, T1k);
Chris@82 352 ST(&(xo[WS(os, 7)]), VFNMSI(T1o, T1n), ovs, &(xo[WS(os, 1)]));
Chris@82 353 ST(&(xo[WS(os, 25)]), VFMAI(T1o, T1n), ovs, &(xo[WS(os, 1)]));
Chris@82 354 }
Chris@82 355 {
Chris@82 356 V T1f, T1g, T1j, T1m;
Chris@82 357 T1f = VFMA(LDK(KP980785280), T10, Tr);
Chris@82 358 T1g = VFMA(LDK(KP980785280), T1d, T1a);
Chris@82 359 ST(&(xo[WS(os, 31)]), VFNMSI(T1g, T1f), ovs, &(xo[WS(os, 1)]));
Chris@82 360 ST(&(xo[WS(os, 1)]), VFMAI(T1g, T1f), ovs, &(xo[WS(os, 1)]));
Chris@82 361 T1j = VFNMS(LDK(KP980785280), T1i, T1h);
Chris@82 362 T1m = VFMA(LDK(KP980785280), T1l, T1k);
Chris@82 363 ST(&(xo[WS(os, 9)]), VFMAI(T1m, T1j), ovs, &(xo[WS(os, 1)]));
Chris@82 364 ST(&(xo[WS(os, 23)]), VFNMSI(T1m, T1j), ovs, &(xo[WS(os, 1)]));
Chris@82 365 }
Chris@82 366 }
Chris@82 367 }
Chris@82 368 }
Chris@82 369 VLEAVE();
Chris@82 370 }
Chris@82 371
Chris@82 372 static const kdft_desc desc = { 32, XSIMD_STRING("n1bv_32"), {88, 0, 98, 0}, &GENUS, 0, 0, 0, 0 };
Chris@82 373
Chris@82 374 void XSIMD(codelet_n1bv_32) (planner *p) {
Chris@82 375 X(kdft_register) (p, n1bv_32, &desc);
Chris@82 376 }
Chris@82 377
Chris@82 378 #else
Chris@82 379
Chris@82 380 /* Generated by: ../../../genfft/gen_notw_c.native -simd -compact -variables 4 -pipeline-latency 8 -sign 1 -n 32 -name n1bv_32 -include dft/simd/n1b.h */
Chris@82 381
Chris@82 382 /*
Chris@82 383 * This function contains 186 FP additions, 42 FP multiplications,
Chris@82 384 * (or, 170 additions, 26 multiplications, 16 fused multiply/add),
Chris@82 385 * 58 stack variables, 7 constants, and 64 memory accesses
Chris@82 386 */
Chris@82 387 #include "dft/simd/n1b.h"
Chris@82 388
Chris@82 389 static void n1bv_32(const R *ri, const R *ii, R *ro, R *io, stride is, stride os, INT v, INT ivs, INT ovs)
Chris@82 390 {
Chris@82 391 DVK(KP555570233, +0.555570233019602224742830813948532874374937191);
Chris@82 392 DVK(KP831469612, +0.831469612302545237078788377617905756738560812);
Chris@82 393 DVK(KP195090322, +0.195090322016128267848284868477022240927691618);
Chris@82 394 DVK(KP980785280, +0.980785280403230449126182236134239036973933731);
Chris@82 395 DVK(KP382683432, +0.382683432365089771728459984030398866761344562);
Chris@82 396 DVK(KP923879532, +0.923879532511286756128183189396788286822416626);
Chris@82 397 DVK(KP707106781, +0.707106781186547524400844362104849039284835938);
Chris@82 398 {
Chris@82 399 INT i;
Chris@82 400 const R *xi;
Chris@82 401 R *xo;
Chris@82 402 xi = ii;
Chris@82 403 xo = io;
Chris@82 404 for (i = v; i > 0; i = i - VL, xi = xi + (VL * ivs), xo = xo + (VL * ovs), MAKE_VOLATILE_STRIDE(64, is), MAKE_VOLATILE_STRIDE(64, os)) {
Chris@82 405 V T2f, T2k, T2N, T2M, T19, T1B, Tb, T1p, TT, T1v, TY, T1w, T2E, T2F, T2G;
Chris@82 406 V T24, T2o, TC, T1s, TH, T1t, T2B, T2C, T2D, T1X, T2n, T2I, T2J, Tq, T1A;
Chris@82 407 V T14, T1q, T2c, T2l;
Chris@82 408 {
Chris@82 409 V T3, T2i, T18, T2j, T6, T2d, T9, T2e, T15, Ta;
Chris@82 410 {
Chris@82 411 V T1, T2, T16, T17;
Chris@82 412 T1 = LD(&(xi[0]), ivs, &(xi[0]));
Chris@82 413 T2 = LD(&(xi[WS(is, 16)]), ivs, &(xi[0]));
Chris@82 414 T3 = VSUB(T1, T2);
Chris@82 415 T2i = VADD(T1, T2);
Chris@82 416 T16 = LD(&(xi[WS(is, 8)]), ivs, &(xi[0]));
Chris@82 417 T17 = LD(&(xi[WS(is, 24)]), ivs, &(xi[0]));
Chris@82 418 T18 = VSUB(T16, T17);
Chris@82 419 T2j = VADD(T16, T17);
Chris@82 420 }
Chris@82 421 {
Chris@82 422 V T4, T5, T7, T8;
Chris@82 423 T4 = LD(&(xi[WS(is, 4)]), ivs, &(xi[0]));
Chris@82 424 T5 = LD(&(xi[WS(is, 20)]), ivs, &(xi[0]));
Chris@82 425 T6 = VSUB(T4, T5);
Chris@82 426 T2d = VADD(T4, T5);
Chris@82 427 T7 = LD(&(xi[WS(is, 28)]), ivs, &(xi[0]));
Chris@82 428 T8 = LD(&(xi[WS(is, 12)]), ivs, &(xi[0]));
Chris@82 429 T9 = VSUB(T7, T8);
Chris@82 430 T2e = VADD(T7, T8);
Chris@82 431 }
Chris@82 432 T2f = VSUB(T2d, T2e);
Chris@82 433 T2k = VSUB(T2i, T2j);
Chris@82 434 T2N = VADD(T2d, T2e);
Chris@82 435 T2M = VADD(T2i, T2j);
Chris@82 436 T15 = VMUL(LDK(KP707106781), VSUB(T6, T9));
Chris@82 437 T19 = VSUB(T15, T18);
Chris@82 438 T1B = VADD(T18, T15);
Chris@82 439 Ta = VMUL(LDK(KP707106781), VADD(T6, T9));
Chris@82 440 Tb = VSUB(T3, Ta);
Chris@82 441 T1p = VADD(T3, Ta);
Chris@82 442 }
Chris@82 443 {
Chris@82 444 V TL, T21, TW, T1Y, TO, T22, TS, T1Z;
Chris@82 445 {
Chris@82 446 V TJ, TK, TU, TV;
Chris@82 447 TJ = LD(&(xi[WS(is, 3)]), ivs, &(xi[WS(is, 1)]));
Chris@82 448 TK = LD(&(xi[WS(is, 19)]), ivs, &(xi[WS(is, 1)]));
Chris@82 449 TL = VSUB(TJ, TK);
Chris@82 450 T21 = VADD(TJ, TK);
Chris@82 451 TU = LD(&(xi[WS(is, 31)]), ivs, &(xi[WS(is, 1)]));
Chris@82 452 TV = LD(&(xi[WS(is, 15)]), ivs, &(xi[WS(is, 1)]));
Chris@82 453 TW = VSUB(TU, TV);
Chris@82 454 T1Y = VADD(TU, TV);
Chris@82 455 }
Chris@82 456 {
Chris@82 457 V TM, TN, TQ, TR;
Chris@82 458 TM = LD(&(xi[WS(is, 27)]), ivs, &(xi[WS(is, 1)]));
Chris@82 459 TN = LD(&(xi[WS(is, 11)]), ivs, &(xi[WS(is, 1)]));
Chris@82 460 TO = VSUB(TM, TN);
Chris@82 461 T22 = VADD(TM, TN);
Chris@82 462 TQ = LD(&(xi[WS(is, 7)]), ivs, &(xi[WS(is, 1)]));
Chris@82 463 TR = LD(&(xi[WS(is, 23)]), ivs, &(xi[WS(is, 1)]));
Chris@82 464 TS = VSUB(TQ, TR);
Chris@82 465 T1Z = VADD(TQ, TR);
Chris@82 466 }
Chris@82 467 {
Chris@82 468 V TP, TX, T20, T23;
Chris@82 469 TP = VMUL(LDK(KP707106781), VSUB(TL, TO));
Chris@82 470 TT = VSUB(TP, TS);
Chris@82 471 T1v = VADD(TS, TP);
Chris@82 472 TX = VMUL(LDK(KP707106781), VADD(TL, TO));
Chris@82 473 TY = VSUB(TW, TX);
Chris@82 474 T1w = VADD(TW, TX);
Chris@82 475 T2E = VADD(T1Y, T1Z);
Chris@82 476 T2F = VADD(T21, T22);
Chris@82 477 T2G = VSUB(T2E, T2F);
Chris@82 478 T20 = VSUB(T1Y, T1Z);
Chris@82 479 T23 = VSUB(T21, T22);
Chris@82 480 T24 = VFMA(LDK(KP923879532), T20, VMUL(LDK(KP382683432), T23));
Chris@82 481 T2o = VFNMS(LDK(KP382683432), T20, VMUL(LDK(KP923879532), T23));
Chris@82 482 }
Chris@82 483 }
Chris@82 484 {
Chris@82 485 V Tu, T1U, TF, T1R, Tx, T1V, TB, T1S;
Chris@82 486 {
Chris@82 487 V Ts, Tt, TD, TE;
Chris@82 488 Ts = LD(&(xi[WS(is, 5)]), ivs, &(xi[WS(is, 1)]));
Chris@82 489 Tt = LD(&(xi[WS(is, 21)]), ivs, &(xi[WS(is, 1)]));
Chris@82 490 Tu = VSUB(Ts, Tt);
Chris@82 491 T1U = VADD(Ts, Tt);
Chris@82 492 TD = LD(&(xi[WS(is, 1)]), ivs, &(xi[WS(is, 1)]));
Chris@82 493 TE = LD(&(xi[WS(is, 17)]), ivs, &(xi[WS(is, 1)]));
Chris@82 494 TF = VSUB(TD, TE);
Chris@82 495 T1R = VADD(TD, TE);
Chris@82 496 }
Chris@82 497 {
Chris@82 498 V Tv, Tw, Tz, TA;
Chris@82 499 Tv = LD(&(xi[WS(is, 29)]), ivs, &(xi[WS(is, 1)]));
Chris@82 500 Tw = LD(&(xi[WS(is, 13)]), ivs, &(xi[WS(is, 1)]));
Chris@82 501 Tx = VSUB(Tv, Tw);
Chris@82 502 T1V = VADD(Tv, Tw);
Chris@82 503 Tz = LD(&(xi[WS(is, 9)]), ivs, &(xi[WS(is, 1)]));
Chris@82 504 TA = LD(&(xi[WS(is, 25)]), ivs, &(xi[WS(is, 1)]));
Chris@82 505 TB = VSUB(Tz, TA);
Chris@82 506 T1S = VADD(Tz, TA);
Chris@82 507 }
Chris@82 508 {
Chris@82 509 V Ty, TG, T1T, T1W;
Chris@82 510 Ty = VMUL(LDK(KP707106781), VSUB(Tu, Tx));
Chris@82 511 TC = VSUB(Ty, TB);
Chris@82 512 T1s = VADD(TB, Ty);
Chris@82 513 TG = VMUL(LDK(KP707106781), VADD(Tu, Tx));
Chris@82 514 TH = VSUB(TF, TG);
Chris@82 515 T1t = VADD(TF, TG);
Chris@82 516 T2B = VADD(T1R, T1S);
Chris@82 517 T2C = VADD(T1U, T1V);
Chris@82 518 T2D = VSUB(T2B, T2C);
Chris@82 519 T1T = VSUB(T1R, T1S);
Chris@82 520 T1W = VSUB(T1U, T1V);
Chris@82 521 T1X = VFNMS(LDK(KP382683432), T1W, VMUL(LDK(KP923879532), T1T));
Chris@82 522 T2n = VFMA(LDK(KP382683432), T1T, VMUL(LDK(KP923879532), T1W));
Chris@82 523 }
Chris@82 524 }
Chris@82 525 {
Chris@82 526 V Te, T26, To, T29, Th, T27, Tl, T2a, Ti, Tp;
Chris@82 527 {
Chris@82 528 V Tc, Td, Tm, Tn;
Chris@82 529 Tc = LD(&(xi[WS(is, 2)]), ivs, &(xi[0]));
Chris@82 530 Td = LD(&(xi[WS(is, 18)]), ivs, &(xi[0]));
Chris@82 531 Te = VSUB(Tc, Td);
Chris@82 532 T26 = VADD(Tc, Td);
Chris@82 533 Tm = LD(&(xi[WS(is, 30)]), ivs, &(xi[0]));
Chris@82 534 Tn = LD(&(xi[WS(is, 14)]), ivs, &(xi[0]));
Chris@82 535 To = VSUB(Tm, Tn);
Chris@82 536 T29 = VADD(Tm, Tn);
Chris@82 537 }
Chris@82 538 {
Chris@82 539 V Tf, Tg, Tj, Tk;
Chris@82 540 Tf = LD(&(xi[WS(is, 10)]), ivs, &(xi[0]));
Chris@82 541 Tg = LD(&(xi[WS(is, 26)]), ivs, &(xi[0]));
Chris@82 542 Th = VSUB(Tf, Tg);
Chris@82 543 T27 = VADD(Tf, Tg);
Chris@82 544 Tj = LD(&(xi[WS(is, 6)]), ivs, &(xi[0]));
Chris@82 545 Tk = LD(&(xi[WS(is, 22)]), ivs, &(xi[0]));
Chris@82 546 Tl = VSUB(Tj, Tk);
Chris@82 547 T2a = VADD(Tj, Tk);
Chris@82 548 }
Chris@82 549 T2I = VADD(T26, T27);
Chris@82 550 T2J = VADD(T29, T2a);
Chris@82 551 Ti = VFMA(LDK(KP382683432), Te, VMUL(LDK(KP923879532), Th));
Chris@82 552 Tp = VFNMS(LDK(KP382683432), To, VMUL(LDK(KP923879532), Tl));
Chris@82 553 Tq = VSUB(Ti, Tp);
Chris@82 554 T1A = VADD(Ti, Tp);
Chris@82 555 {
Chris@82 556 V T12, T13, T28, T2b;
Chris@82 557 T12 = VFNMS(LDK(KP382683432), Th, VMUL(LDK(KP923879532), Te));
Chris@82 558 T13 = VFMA(LDK(KP923879532), To, VMUL(LDK(KP382683432), Tl));
Chris@82 559 T14 = VSUB(T12, T13);
Chris@82 560 T1q = VADD(T12, T13);
Chris@82 561 T28 = VSUB(T26, T27);
Chris@82 562 T2b = VSUB(T29, T2a);
Chris@82 563 T2c = VMUL(LDK(KP707106781), VSUB(T28, T2b));
Chris@82 564 T2l = VMUL(LDK(KP707106781), VADD(T28, T2b));
Chris@82 565 }
Chris@82 566 }
Chris@82 567 {
Chris@82 568 V T2L, T2R, T2Q, T2S;
Chris@82 569 {
Chris@82 570 V T2H, T2K, T2O, T2P;
Chris@82 571 T2H = VMUL(LDK(KP707106781), VSUB(T2D, T2G));
Chris@82 572 T2K = VSUB(T2I, T2J);
Chris@82 573 T2L = VBYI(VSUB(T2H, T2K));
Chris@82 574 T2R = VBYI(VADD(T2K, T2H));
Chris@82 575 T2O = VSUB(T2M, T2N);
Chris@82 576 T2P = VMUL(LDK(KP707106781), VADD(T2D, T2G));
Chris@82 577 T2Q = VSUB(T2O, T2P);
Chris@82 578 T2S = VADD(T2O, T2P);
Chris@82 579 }
Chris@82 580 ST(&(xo[WS(os, 12)]), VADD(T2L, T2Q), ovs, &(xo[0]));
Chris@82 581 ST(&(xo[WS(os, 28)]), VSUB(T2S, T2R), ovs, &(xo[0]));
Chris@82 582 ST(&(xo[WS(os, 20)]), VSUB(T2Q, T2L), ovs, &(xo[0]));
Chris@82 583 ST(&(xo[WS(os, 4)]), VADD(T2R, T2S), ovs, &(xo[0]));
Chris@82 584 }
Chris@82 585 {
Chris@82 586 V T2h, T2r, T2q, T2s;
Chris@82 587 {
Chris@82 588 V T25, T2g, T2m, T2p;
Chris@82 589 T25 = VSUB(T1X, T24);
Chris@82 590 T2g = VSUB(T2c, T2f);
Chris@82 591 T2h = VBYI(VSUB(T25, T2g));
Chris@82 592 T2r = VBYI(VADD(T2g, T25));
Chris@82 593 T2m = VSUB(T2k, T2l);
Chris@82 594 T2p = VSUB(T2n, T2o);
Chris@82 595 T2q = VSUB(T2m, T2p);
Chris@82 596 T2s = VADD(T2m, T2p);
Chris@82 597 }
Chris@82 598 ST(&(xo[WS(os, 10)]), VADD(T2h, T2q), ovs, &(xo[0]));
Chris@82 599 ST(&(xo[WS(os, 26)]), VSUB(T2s, T2r), ovs, &(xo[0]));
Chris@82 600 ST(&(xo[WS(os, 22)]), VSUB(T2q, T2h), ovs, &(xo[0]));
Chris@82 601 ST(&(xo[WS(os, 6)]), VADD(T2r, T2s), ovs, &(xo[0]));
Chris@82 602 }
Chris@82 603 {
Chris@82 604 V T2V, T2Z, T2Y, T30;
Chris@82 605 {
Chris@82 606 V T2T, T2U, T2W, T2X;
Chris@82 607 T2T = VADD(T2M, T2N);
Chris@82 608 T2U = VADD(T2I, T2J);
Chris@82 609 T2V = VSUB(T2T, T2U);
Chris@82 610 T2Z = VADD(T2T, T2U);
Chris@82 611 T2W = VADD(T2B, T2C);
Chris@82 612 T2X = VADD(T2E, T2F);
Chris@82 613 T2Y = VBYI(VSUB(T2W, T2X));
Chris@82 614 T30 = VADD(T2W, T2X);
Chris@82 615 }
Chris@82 616 ST(&(xo[WS(os, 24)]), VSUB(T2V, T2Y), ovs, &(xo[0]));
Chris@82 617 ST(&(xo[0]), VADD(T2Z, T30), ovs, &(xo[0]));
Chris@82 618 ST(&(xo[WS(os, 8)]), VADD(T2V, T2Y), ovs, &(xo[0]));
Chris@82 619 ST(&(xo[WS(os, 16)]), VSUB(T2Z, T30), ovs, &(xo[0]));
Chris@82 620 }
Chris@82 621 {
Chris@82 622 V T2v, T2z, T2y, T2A;
Chris@82 623 {
Chris@82 624 V T2t, T2u, T2w, T2x;
Chris@82 625 T2t = VADD(T2k, T2l);
Chris@82 626 T2u = VADD(T1X, T24);
Chris@82 627 T2v = VADD(T2t, T2u);
Chris@82 628 T2z = VSUB(T2t, T2u);
Chris@82 629 T2w = VADD(T2f, T2c);
Chris@82 630 T2x = VADD(T2n, T2o);
Chris@82 631 T2y = VBYI(VADD(T2w, T2x));
Chris@82 632 T2A = VBYI(VSUB(T2x, T2w));
Chris@82 633 }
Chris@82 634 ST(&(xo[WS(os, 30)]), VSUB(T2v, T2y), ovs, &(xo[0]));
Chris@82 635 ST(&(xo[WS(os, 14)]), VADD(T2z, T2A), ovs, &(xo[0]));
Chris@82 636 ST(&(xo[WS(os, 2)]), VADD(T2v, T2y), ovs, &(xo[0]));
Chris@82 637 ST(&(xo[WS(os, 18)]), VSUB(T2z, T2A), ovs, &(xo[0]));
Chris@82 638 }
Chris@82 639 {
Chris@82 640 V T1r, T1C, T1M, T1K, T1F, T1N, T1y, T1J;
Chris@82 641 T1r = VSUB(T1p, T1q);
Chris@82 642 T1C = VSUB(T1A, T1B);
Chris@82 643 T1M = VADD(T1p, T1q);
Chris@82 644 T1K = VADD(T1B, T1A);
Chris@82 645 {
Chris@82 646 V T1D, T1E, T1u, T1x;
Chris@82 647 T1D = VFNMS(LDK(KP195090322), T1s, VMUL(LDK(KP980785280), T1t));
Chris@82 648 T1E = VFMA(LDK(KP195090322), T1v, VMUL(LDK(KP980785280), T1w));
Chris@82 649 T1F = VSUB(T1D, T1E);
Chris@82 650 T1N = VADD(T1D, T1E);
Chris@82 651 T1u = VFMA(LDK(KP980785280), T1s, VMUL(LDK(KP195090322), T1t));
Chris@82 652 T1x = VFNMS(LDK(KP195090322), T1w, VMUL(LDK(KP980785280), T1v));
Chris@82 653 T1y = VSUB(T1u, T1x);
Chris@82 654 T1J = VADD(T1u, T1x);
Chris@82 655 }
Chris@82 656 {
Chris@82 657 V T1z, T1G, T1P, T1Q;
Chris@82 658 T1z = VADD(T1r, T1y);
Chris@82 659 T1G = VBYI(VADD(T1C, T1F));
Chris@82 660 ST(&(xo[WS(os, 25)]), VSUB(T1z, T1G), ovs, &(xo[WS(os, 1)]));
Chris@82 661 ST(&(xo[WS(os, 7)]), VADD(T1z, T1G), ovs, &(xo[WS(os, 1)]));
Chris@82 662 T1P = VBYI(VADD(T1K, T1J));
Chris@82 663 T1Q = VADD(T1M, T1N);
Chris@82 664 ST(&(xo[WS(os, 1)]), VADD(T1P, T1Q), ovs, &(xo[WS(os, 1)]));
Chris@82 665 ST(&(xo[WS(os, 31)]), VSUB(T1Q, T1P), ovs, &(xo[WS(os, 1)]));
Chris@82 666 }
Chris@82 667 {
Chris@82 668 V T1H, T1I, T1L, T1O;
Chris@82 669 T1H = VSUB(T1r, T1y);
Chris@82 670 T1I = VBYI(VSUB(T1F, T1C));
Chris@82 671 ST(&(xo[WS(os, 23)]), VSUB(T1H, T1I), ovs, &(xo[WS(os, 1)]));
Chris@82 672 ST(&(xo[WS(os, 9)]), VADD(T1H, T1I), ovs, &(xo[WS(os, 1)]));
Chris@82 673 T1L = VBYI(VSUB(T1J, T1K));
Chris@82 674 T1O = VSUB(T1M, T1N);
Chris@82 675 ST(&(xo[WS(os, 15)]), VADD(T1L, T1O), ovs, &(xo[WS(os, 1)]));
Chris@82 676 ST(&(xo[WS(os, 17)]), VSUB(T1O, T1L), ovs, &(xo[WS(os, 1)]));
Chris@82 677 }
Chris@82 678 }
Chris@82 679 {
Chris@82 680 V Tr, T1a, T1k, T1i, T1d, T1l, T10, T1h;
Chris@82 681 Tr = VSUB(Tb, Tq);
Chris@82 682 T1a = VSUB(T14, T19);
Chris@82 683 T1k = VADD(Tb, Tq);
Chris@82 684 T1i = VADD(T19, T14);
Chris@82 685 {
Chris@82 686 V T1b, T1c, TI, TZ;
Chris@82 687 T1b = VFNMS(LDK(KP555570233), TC, VMUL(LDK(KP831469612), TH));
Chris@82 688 T1c = VFMA(LDK(KP555570233), TT, VMUL(LDK(KP831469612), TY));
Chris@82 689 T1d = VSUB(T1b, T1c);
Chris@82 690 T1l = VADD(T1b, T1c);
Chris@82 691 TI = VFMA(LDK(KP831469612), TC, VMUL(LDK(KP555570233), TH));
Chris@82 692 TZ = VFNMS(LDK(KP555570233), TY, VMUL(LDK(KP831469612), TT));
Chris@82 693 T10 = VSUB(TI, TZ);
Chris@82 694 T1h = VADD(TI, TZ);
Chris@82 695 }
Chris@82 696 {
Chris@82 697 V T11, T1e, T1n, T1o;
Chris@82 698 T11 = VADD(Tr, T10);
Chris@82 699 T1e = VBYI(VADD(T1a, T1d));
Chris@82 700 ST(&(xo[WS(os, 27)]), VSUB(T11, T1e), ovs, &(xo[WS(os, 1)]));
Chris@82 701 ST(&(xo[WS(os, 5)]), VADD(T11, T1e), ovs, &(xo[WS(os, 1)]));
Chris@82 702 T1n = VBYI(VADD(T1i, T1h));
Chris@82 703 T1o = VADD(T1k, T1l);
Chris@82 704 ST(&(xo[WS(os, 3)]), VADD(T1n, T1o), ovs, &(xo[WS(os, 1)]));
Chris@82 705 ST(&(xo[WS(os, 29)]), VSUB(T1o, T1n), ovs, &(xo[WS(os, 1)]));
Chris@82 706 }
Chris@82 707 {
Chris@82 708 V T1f, T1g, T1j, T1m;
Chris@82 709 T1f = VSUB(Tr, T10);
Chris@82 710 T1g = VBYI(VSUB(T1d, T1a));
Chris@82 711 ST(&(xo[WS(os, 21)]), VSUB(T1f, T1g), ovs, &(xo[WS(os, 1)]));
Chris@82 712 ST(&(xo[WS(os, 11)]), VADD(T1f, T1g), ovs, &(xo[WS(os, 1)]));
Chris@82 713 T1j = VBYI(VSUB(T1h, T1i));
Chris@82 714 T1m = VSUB(T1k, T1l);
Chris@82 715 ST(&(xo[WS(os, 13)]), VADD(T1j, T1m), ovs, &(xo[WS(os, 1)]));
Chris@82 716 ST(&(xo[WS(os, 19)]), VSUB(T1m, T1j), ovs, &(xo[WS(os, 1)]));
Chris@82 717 }
Chris@82 718 }
Chris@82 719 }
Chris@82 720 }
Chris@82 721 VLEAVE();
Chris@82 722 }
Chris@82 723
Chris@82 724 static const kdft_desc desc = { 32, XSIMD_STRING("n1bv_32"), {170, 26, 16, 0}, &GENUS, 0, 0, 0, 0 };
Chris@82 725
Chris@82 726 void XSIMD(codelet_n1bv_32) (planner *p) {
Chris@82 727 X(kdft_register) (p, n1bv_32, &desc);
Chris@82 728 }
Chris@82 729
Chris@82 730 #endif
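
This codelet is not called directly; the FFTW planner registers it via XSIMD(codelet_n1bv_32) and may select it when planning a size-32 backward complex transform. As a minimal sketch only, the following shows how such a transform is requested through the public FFTW 3 API, assuming the standard double-precision build (whether the planner actually dispatches to n1bv_32 depends on the SIMD extensions enabled at configure time and detected at run time):

/* Sketch: plan and execute a size-32 backward (sign = +1, matching the
   "-sign 1" genfft flag above) complex DFT through the public API.
   This only exercises the planner; codelet selection is up to FFTW. */
#include <fftw3.h>

int main(void)
{
    const int n = 32;
    fftw_complex *in  = fftw_malloc(sizeof(fftw_complex) * n);
    fftw_complex *out = fftw_malloc(sizeof(fftw_complex) * n);

    for (int k = 0; k < n; ++k) {   /* arbitrary test input */
        in[k][0] = (double) k;      /* real part */
        in[k][1] = 0.0;             /* imaginary part */
    }

    fftw_plan p = fftw_plan_dft_1d(n, in, out, FFTW_BACKWARD, FFTW_ESTIMATE);
    fftw_execute(p);

    fftw_destroy_plan(p);
    fftw_free(in);
    fftw_free(out);
    return 0;
}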