annotate src/fftw-3.3.5/dft/simd/common/n1bv_32.c @ 83:ae30d91d2ffe

Replace these with versions built using an older toolset (so as to avoid ABI incompatibilities when linking on Ubuntu 14.04 for packaging purposes)
author Chris Cannam
date Fri, 07 Feb 2020 11:51:13 +0000
parents 2cd0e3b3e1fd
children
rev   line source
Chris@42 1 /*
Chris@42 2 * Copyright (c) 2003, 2007-14 Matteo Frigo
Chris@42 3 * Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
Chris@42 4 *
Chris@42 5 * This program is free software; you can redistribute it and/or modify
Chris@42 6 * it under the terms of the GNU General Public License as published by
Chris@42 7 * the Free Software Foundation; either version 2 of the License, or
Chris@42 8 * (at your option) any later version.
Chris@42 9 *
Chris@42 10 * This program is distributed in the hope that it will be useful,
Chris@42 11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
Chris@42 12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
Chris@42 13 * GNU General Public License for more details.
Chris@42 14 *
Chris@42 15 * You should have received a copy of the GNU General Public License
Chris@42 16 * along with this program; if not, write to the Free Software
Chris@42 17 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
Chris@42 18 *
Chris@42 19 */
Chris@42 20
Chris@42 21 /* This file was automatically generated --- DO NOT EDIT */
Chris@42 22 /* Generated on Sat Jul 30 16:39:22 EDT 2016 */
Chris@42 23
Chris@42 24 #include "codelet-dft.h"
Chris@42 25
Chris@42 26 #ifdef HAVE_FMA
Chris@42 27
Chris@42 28 /* Generated by: ../../../genfft/gen_notw_c.native -fma -reorder-insns -schedule-for-pipeline -simd -compact -variables 4 -pipeline-latency 8 -sign 1 -n 32 -name n1bv_32 -include n1b.h */
Chris@42 29
Chris@42 30 /*
Chris@42 31 * This function contains 186 FP additions, 98 FP multiplications,
Chris@42 32 * (or, 88 additions, 0 multiplications, 98 fused multiply/add),
Chris@42 33 * 104 stack variables, 7 constants, and 64 memory accesses
Chris@42 34 */
Chris@42 35 #include "n1b.h"
Chris@42 36
/* NOTE(review): machine-generated codelet (see "DO NOT EDIT" above); the
 * comments added below document behaviour only -- any functional change
 * must be made in the genfft generator, not in this file.
 * Computes VL-wide batches of a 32-point complex DFT with sign +1
 * (per the "-sign 1 -n 32" genfft command line above), FMA variant. */
Chris@42 37 static void n1bv_32(const R *ri, const R *ii, R *ro, R *io, stride is, stride os, INT v, INT ivs, INT ovs)
Chris@42 38 {
/* Vector constants: KP707106781 = cos(pi/4), KP923879532 = cos(pi/8),
 * KP980785280 = cos(pi/16), KP831469612 = cos(3*pi/16); KP414213562,
 * KP198912367 and KP668178637 are tan(pi/8), tan(pi/16), tan(3*pi/16). */
Chris@42 39 DVK(KP980785280, +0.980785280403230449126182236134239036973933731);
Chris@42 40 DVK(KP831469612, +0.831469612302545237078788377617905756738560812);
Chris@42 41 DVK(KP198912367, +0.198912367379658006911597622644676228597850501);
Chris@42 42 DVK(KP668178637, +0.668178637919298919997757686523080761552472251);
Chris@42 43 DVK(KP923879532, +0.923879532511286756128183189396788286822416626);
Chris@42 44 DVK(KP414213562, +0.414213562373095048801688724209698078569671875);
Chris@42 45 DVK(KP707106781, +0.707106781186547524400844362104849039284835938);
Chris@42 46 {
Chris@42 47 INT i;
Chris@42 48 const R *xi;
Chris@42 49 R *xo;
/* Sign = +1 transform: the codelet binds its working pointers to the
 * imaginary-part arrays (xi = ii, xo = io); presumably the re/im swap
 * implements the inverse-transform sign convention -- see n1b.h.
 * TODO(review): confirm against the n1b.h genus definition. */
Chris@42 50 xi = ii;
Chris@42 51 xo = io;
/* Process VL transforms per iteration over the batch of v transforms,
 * stepping input/output by the vector strides ivs/ovs;
 * MAKE_VOLATILE_STRIDE defeats over-aggressive stride optimization. */
Chris@42 52 for (i = v; i > 0; i = i - VL, xi = xi + (VL * ivs), xo = xo + (VL * ovs), MAKE_VOLATILE_STRIDE(64, is), MAKE_VOLATILE_STRIDE(64, os)) {
Chris@42 53 V T1h, Tr, T1a, T1k, TI, T1b, T1L, T1P, T1I, T1G, T1O, T1Q, T1H, T1z, T1c;
Chris@42 54 V TZ;
Chris@42 55 {
Chris@42 56 V T2x, T1T, T2K, T1W, T1p, Tb, T1A, T16, Tu, TF, T2O, T2H, T2b, T2t, TY;
Chris@42 57 V T1w, TT, T1v, T20, T2C, Tj, Te, T2e, To, T2i, T23, T2D, TB, TG, Th;
Chris@42 58 V T2f, Tk;
Chris@42 59 {
Chris@42 60 V TL, TW, TP, TQ, T2F, T27, T28, TO;
Chris@42 61 {
Chris@42 62 V T1, T2, T12, T13, T4, T5, T7, T8;
Chris@42 63 T1 = LD(&(xi[0]), ivs, &(xi[0]));
Chris@42 64 T2 = LD(&(xi[WS(is, 16)]), ivs, &(xi[0]));
Chris@42 65 T12 = LD(&(xi[WS(is, 8)]), ivs, &(xi[0]));
Chris@42 66 T13 = LD(&(xi[WS(is, 24)]), ivs, &(xi[0]));
Chris@42 67 T4 = LD(&(xi[WS(is, 4)]), ivs, &(xi[0]));
Chris@42 68 T5 = LD(&(xi[WS(is, 20)]), ivs, &(xi[0]));
Chris@42 69 T7 = LD(&(xi[WS(is, 28)]), ivs, &(xi[0]));
Chris@42 70 T8 = LD(&(xi[WS(is, 12)]), ivs, &(xi[0]));
Chris@42 71 {
Chris@42 72 V TM, T25, T26, TN;
Chris@42 73 {
Chris@42 74 V TJ, T3, T14, T1U, T6, T1V, T9, TK, TU, TV, T1R, T1S, Ta, T15;
Chris@42 75 TJ = LD(&(xi[WS(is, 31)]), ivs, &(xi[WS(is, 1)]));
Chris@42 76 T1R = VADD(T1, T2);
Chris@42 77 T3 = VSUB(T1, T2);
Chris@42 78 T1S = VADD(T12, T13);
Chris@42 79 T14 = VSUB(T12, T13);
Chris@42 80 T1U = VADD(T4, T5);
Chris@42 81 T6 = VSUB(T4, T5);
Chris@42 82 T1V = VADD(T7, T8);
Chris@42 83 T9 = VSUB(T7, T8);
Chris@42 84 TK = LD(&(xi[WS(is, 15)]), ivs, &(xi[WS(is, 1)]));
Chris@42 85 TU = LD(&(xi[WS(is, 23)]), ivs, &(xi[WS(is, 1)]));
Chris@42 86 T2x = VSUB(T1R, T1S);
Chris@42 87 T1T = VADD(T1R, T1S);
Chris@42 88 TV = LD(&(xi[WS(is, 7)]), ivs, &(xi[WS(is, 1)]));
Chris@42 89 TM = LD(&(xi[WS(is, 3)]), ivs, &(xi[WS(is, 1)]));
Chris@42 90 T2K = VSUB(T1U, T1V);
Chris@42 91 T1W = VADD(T1U, T1V);
Chris@42 92 Ta = VADD(T6, T9);
Chris@42 93 T15 = VSUB(T6, T9);
Chris@42 94 T25 = VADD(TJ, TK);
Chris@42 95 TL = VSUB(TJ, TK);
Chris@42 96 T26 = VADD(TV, TU);
Chris@42 97 TW = VSUB(TU, TV);
Chris@42 98 TN = LD(&(xi[WS(is, 19)]), ivs, &(xi[WS(is, 1)]));
Chris@42 99 TP = LD(&(xi[WS(is, 27)]), ivs, &(xi[WS(is, 1)]));
Chris@42 100 T1p = VFNMS(LDK(KP707106781), Ta, T3);
Chris@42 101 Tb = VFMA(LDK(KP707106781), Ta, T3);
Chris@42 102 T1A = VFNMS(LDK(KP707106781), T15, T14);
Chris@42 103 T16 = VFMA(LDK(KP707106781), T15, T14);
Chris@42 104 TQ = LD(&(xi[WS(is, 11)]), ivs, &(xi[WS(is, 1)]));
Chris@42 105 }
Chris@42 106 T2F = VSUB(T25, T26);
Chris@42 107 T27 = VADD(T25, T26);
Chris@42 108 T28 = VADD(TM, TN);
Chris@42 109 TO = VSUB(TM, TN);
Chris@42 110 }
Chris@42 111 }
Chris@42 112 {
Chris@42 113 V Ty, T21, Tx, Tz, T1Y, T1Z;
Chris@42 114 {
Chris@42 115 V Ts, Tt, TD, T29, TR, TE, Tv, Tw;
Chris@42 116 Ts = LD(&(xi[WS(is, 1)]), ivs, &(xi[WS(is, 1)]));
Chris@42 117 Tt = LD(&(xi[WS(is, 17)]), ivs, &(xi[WS(is, 1)]));
Chris@42 118 TD = LD(&(xi[WS(is, 9)]), ivs, &(xi[WS(is, 1)]));
Chris@42 119 T29 = VADD(TP, TQ);
Chris@42 120 TR = VSUB(TP, TQ);
Chris@42 121 TE = LD(&(xi[WS(is, 25)]), ivs, &(xi[WS(is, 1)]));
Chris@42 122 Tv = LD(&(xi[WS(is, 5)]), ivs, &(xi[WS(is, 1)]));
Chris@42 123 Tw = LD(&(xi[WS(is, 21)]), ivs, &(xi[WS(is, 1)]));
Chris@42 124 Ty = LD(&(xi[WS(is, 29)]), ivs, &(xi[WS(is, 1)]));
Chris@42 125 T1Y = VADD(Ts, Tt);
Chris@42 126 Tu = VSUB(Ts, Tt);
Chris@42 127 {
Chris@42 128 V T2G, T2a, TX, TS;
Chris@42 129 T2G = VSUB(T29, T28);
Chris@42 130 T2a = VADD(T28, T29);
Chris@42 131 TX = VSUB(TR, TO);
Chris@42 132 TS = VADD(TO, TR);
Chris@42 133 T1Z = VADD(TD, TE);
Chris@42 134 TF = VSUB(TD, TE);
Chris@42 135 T21 = VADD(Tv, Tw);
Chris@42 136 Tx = VSUB(Tv, Tw);
Chris@42 137 T2O = VFMA(LDK(KP414213562), T2F, T2G);
Chris@42 138 T2H = VFNMS(LDK(KP414213562), T2G, T2F);
Chris@42 139 T2b = VSUB(T27, T2a);
Chris@42 140 T2t = VADD(T27, T2a);
Chris@42 141 TY = VFMA(LDK(KP707106781), TX, TW);
Chris@42 142 T1w = VFNMS(LDK(KP707106781), TX, TW);
Chris@42 143 TT = VFMA(LDK(KP707106781), TS, TL);
Chris@42 144 T1v = VFNMS(LDK(KP707106781), TS, TL);
Chris@42 145 Tz = LD(&(xi[WS(is, 13)]), ivs, &(xi[WS(is, 1)]));
Chris@42 146 }
Chris@42 147 }
Chris@42 148 T20 = VADD(T1Y, T1Z);
Chris@42 149 T2C = VSUB(T1Y, T1Z);
Chris@42 150 {
Chris@42 151 V Tc, Td, Tm, Tn;
Chris@42 152 Tc = LD(&(xi[WS(is, 2)]), ivs, &(xi[0]));
Chris@42 153 Td = LD(&(xi[WS(is, 18)]), ivs, &(xi[0]));
Chris@42 154 Tm = LD(&(xi[WS(is, 22)]), ivs, &(xi[0]));
Chris@42 155 Tn = LD(&(xi[WS(is, 6)]), ivs, &(xi[0]));
Chris@42 156 {
Chris@42 157 V Tf, TA, T22, Tg;
Chris@42 158 Tf = LD(&(xi[WS(is, 10)]), ivs, &(xi[0]));
Chris@42 159 TA = VSUB(Ty, Tz);
Chris@42 160 T22 = VADD(Ty, Tz);
Chris@42 161 Tg = LD(&(xi[WS(is, 26)]), ivs, &(xi[0]));
Chris@42 162 Tj = LD(&(xi[WS(is, 30)]), ivs, &(xi[0]));
Chris@42 163 Te = VSUB(Tc, Td);
Chris@42 164 T2e = VADD(Tc, Td);
Chris@42 165 To = VSUB(Tm, Tn);
Chris@42 166 T2i = VADD(Tn, Tm);
Chris@42 167 T23 = VADD(T21, T22);
Chris@42 168 T2D = VSUB(T21, T22);
Chris@42 169 TB = VADD(Tx, TA);
Chris@42 170 TG = VSUB(Tx, TA);
Chris@42 171 Th = VSUB(Tf, Tg);
Chris@42 172 T2f = VADD(Tf, Tg);
Chris@42 173 Tk = LD(&(xi[WS(is, 14)]), ivs, &(xi[0]));
Chris@42 174 }
Chris@42 175 }
Chris@42 176 }
Chris@42 177 }
Chris@42 178 {
Chris@42 179 V T1t, TH, T1s, TC, T2P, T2U, T2n, T2d, T2w, T2u, T1q, T19, T1B, Tq, T2W;
Chris@42 180 V T2M, T2B, T2T, T2v, T2r, T2o, T2m, T2X, T2I;
Chris@42 181 {
Chris@42 182 V T1X, T2p, T2E, T2N, T2s, T2y, T2g, T17, Ti, T2h, Tl, T2c, T2l, T24;
Chris@42 183 T1X = VSUB(T1T, T1W);
Chris@42 184 T2p = VADD(T1T, T1W);
Chris@42 185 T2E = VFNMS(LDK(KP414213562), T2D, T2C);
Chris@42 186 T2N = VFMA(LDK(KP414213562), T2C, T2D);
Chris@42 187 T2s = VADD(T20, T23);
Chris@42 188 T24 = VSUB(T20, T23);
Chris@42 189 T1t = VFNMS(LDK(KP707106781), TG, TF);
Chris@42 190 TH = VFMA(LDK(KP707106781), TG, TF);
Chris@42 191 T1s = VFNMS(LDK(KP707106781), TB, Tu);
Chris@42 192 TC = VFMA(LDK(KP707106781), TB, Tu);
Chris@42 193 T2y = VSUB(T2e, T2f);
Chris@42 194 T2g = VADD(T2e, T2f);
Chris@42 195 T17 = VFMA(LDK(KP414213562), Te, Th);
Chris@42 196 Ti = VFNMS(LDK(KP414213562), Th, Te);
Chris@42 197 T2h = VADD(Tj, Tk);
Chris@42 198 Tl = VSUB(Tj, Tk);
Chris@42 199 T2c = VADD(T24, T2b);
Chris@42 200 T2l = VSUB(T24, T2b);
Chris@42 201 {
Chris@42 202 V T2L, T2A, T2q, T2k;
Chris@42 203 T2P = VSUB(T2N, T2O);
Chris@42 204 T2U = VADD(T2N, T2O);
Chris@42 205 {
Chris@42 206 V T2z, T2j, T18, Tp;
Chris@42 207 T2z = VSUB(T2h, T2i);
Chris@42 208 T2j = VADD(T2h, T2i);
Chris@42 209 T18 = VFMA(LDK(KP414213562), Tl, To);
Chris@42 210 Tp = VFNMS(LDK(KP414213562), To, Tl);
Chris@42 211 T2n = VFMA(LDK(KP707106781), T2c, T1X);
Chris@42 212 T2d = VFNMS(LDK(KP707106781), T2c, T1X);
Chris@42 213 T2w = VADD(T2s, T2t);
Chris@42 214 T2u = VSUB(T2s, T2t);
Chris@42 215 T2L = VSUB(T2y, T2z);
Chris@42 216 T2A = VADD(T2y, T2z);
Chris@42 217 T2q = VADD(T2g, T2j);
Chris@42 218 T2k = VSUB(T2g, T2j);
Chris@42 219 T1q = VADD(T17, T18);
Chris@42 220 T19 = VSUB(T17, T18);
Chris@42 221 T1B = VSUB(Ti, Tp);
Chris@42 222 Tq = VADD(Ti, Tp);
Chris@42 223 }
Chris@42 224 T2W = VFNMS(LDK(KP707106781), T2L, T2K);
Chris@42 225 T2M = VFMA(LDK(KP707106781), T2L, T2K);
Chris@42 226 T2B = VFMA(LDK(KP707106781), T2A, T2x);
Chris@42 227 T2T = VFNMS(LDK(KP707106781), T2A, T2x);
Chris@42 228 T2v = VADD(T2p, T2q);
Chris@42 229 T2r = VSUB(T2p, T2q);
Chris@42 230 T2o = VFMA(LDK(KP707106781), T2l, T2k);
Chris@42 231 T2m = VFNMS(LDK(KP707106781), T2l, T2k);
Chris@42 232 T2X = VSUB(T2E, T2H);
Chris@42 233 T2I = VADD(T2E, T2H);
Chris@42 234 }
Chris@42 235 }
Chris@42 236 {
Chris@42 237 V T2V, T2Z, T2Y, T30, T2R, T2J;
Chris@42 238 T2V = VFNMS(LDK(KP923879532), T2U, T2T);
Chris@42 239 T2Z = VFMA(LDK(KP923879532), T2U, T2T);
Chris@42 240 ST(&(xo[WS(os, 16)]), VSUB(T2v, T2w), ovs, &(xo[0]));
Chris@42 241 ST(&(xo[0]), VADD(T2v, T2w), ovs, &(xo[0]));
Chris@42 242 ST(&(xo[WS(os, 8)]), VFMAI(T2u, T2r), ovs, &(xo[0]));
Chris@42 243 ST(&(xo[WS(os, 24)]), VFNMSI(T2u, T2r), ovs, &(xo[0]));
Chris@42 244 ST(&(xo[WS(os, 4)]), VFMAI(T2o, T2n), ovs, &(xo[0]));
Chris@42 245 ST(&(xo[WS(os, 28)]), VFNMSI(T2o, T2n), ovs, &(xo[0]));
Chris@42 246 ST(&(xo[WS(os, 20)]), VFMAI(T2m, T2d), ovs, &(xo[0]));
Chris@42 247 ST(&(xo[WS(os, 12)]), VFNMSI(T2m, T2d), ovs, &(xo[0]));
Chris@42 248 T2Y = VFMA(LDK(KP923879532), T2X, T2W);
Chris@42 249 T30 = VFNMS(LDK(KP923879532), T2X, T2W);
Chris@42 250 T2R = VFMA(LDK(KP923879532), T2I, T2B);
Chris@42 251 T2J = VFNMS(LDK(KP923879532), T2I, T2B);
Chris@42 252 {
Chris@42 253 V T1J, T1r, T1C, T1M, T2S, T2Q, T1u, T1D, T1E, T1x;
Chris@42 254 T1J = VFNMS(LDK(KP923879532), T1q, T1p);
Chris@42 255 T1r = VFMA(LDK(KP923879532), T1q, T1p);
Chris@42 256 T1C = VFNMS(LDK(KP923879532), T1B, T1A);
Chris@42 257 T1M = VFMA(LDK(KP923879532), T1B, T1A);
Chris@42 258 ST(&(xo[WS(os, 6)]), VFNMSI(T30, T2Z), ovs, &(xo[0]));
Chris@42 259 ST(&(xo[WS(os, 26)]), VFMAI(T30, T2Z), ovs, &(xo[0]));
Chris@42 260 ST(&(xo[WS(os, 22)]), VFNMSI(T2Y, T2V), ovs, &(xo[0]));
Chris@42 261 ST(&(xo[WS(os, 10)]), VFMAI(T2Y, T2V), ovs, &(xo[0]));
Chris@42 262 T2S = VFMA(LDK(KP923879532), T2P, T2M);
Chris@42 263 T2Q = VFNMS(LDK(KP923879532), T2P, T2M);
Chris@42 264 T1u = VFMA(LDK(KP668178637), T1t, T1s);
Chris@42 265 T1D = VFNMS(LDK(KP668178637), T1s, T1t);
Chris@42 266 T1E = VFNMS(LDK(KP668178637), T1v, T1w);
Chris@42 267 T1x = VFMA(LDK(KP668178637), T1w, T1v);
Chris@42 268 {
Chris@42 269 V T1K, T1F, T1N, T1y;
Chris@42 270 T1h = VFNMS(LDK(KP923879532), Tq, Tb);
Chris@42 271 Tr = VFMA(LDK(KP923879532), Tq, Tb);
Chris@42 272 ST(&(xo[WS(os, 30)]), VFNMSI(T2S, T2R), ovs, &(xo[0]));
Chris@42 273 ST(&(xo[WS(os, 2)]), VFMAI(T2S, T2R), ovs, &(xo[0]));
Chris@42 274 ST(&(xo[WS(os, 18)]), VFMAI(T2Q, T2J), ovs, &(xo[0]));
Chris@42 275 ST(&(xo[WS(os, 14)]), VFNMSI(T2Q, T2J), ovs, &(xo[0]));
Chris@42 276 T1K = VADD(T1D, T1E);
Chris@42 277 T1F = VSUB(T1D, T1E);
Chris@42 278 T1N = VSUB(T1u, T1x);
Chris@42 279 T1y = VADD(T1u, T1x);
Chris@42 280 T1a = VFMA(LDK(KP923879532), T19, T16);
Chris@42 281 T1k = VFNMS(LDK(KP923879532), T19, T16);
Chris@42 282 TI = VFNMS(LDK(KP198912367), TH, TC);
Chris@42 283 T1b = VFMA(LDK(KP198912367), TC, TH);
Chris@42 284 T1L = VFMA(LDK(KP831469612), T1K, T1J);
Chris@42 285 T1P = VFNMS(LDK(KP831469612), T1K, T1J);
Chris@42 286 T1I = VFMA(LDK(KP831469612), T1F, T1C);
Chris@42 287 T1G = VFNMS(LDK(KP831469612), T1F, T1C);
Chris@42 288 T1O = VFNMS(LDK(KP831469612), T1N, T1M);
Chris@42 289 T1Q = VFMA(LDK(KP831469612), T1N, T1M);
Chris@42 290 T1H = VFMA(LDK(KP831469612), T1y, T1r);
Chris@42 291 T1z = VFNMS(LDK(KP831469612), T1y, T1r);
Chris@42 292 T1c = VFMA(LDK(KP198912367), TT, TY);
Chris@42 293 TZ = VFNMS(LDK(KP198912367), TY, TT);
Chris@42 294 }
Chris@42 295 }
Chris@42 296 }
Chris@42 297 }
Chris@42 298 }
Chris@42 299 {
Chris@42 300 V T1d, T1i, T10, T1l;
Chris@42 301 ST(&(xo[WS(os, 21)]), VFMAI(T1O, T1L), ovs, &(xo[WS(os, 1)]));
Chris@42 302 ST(&(xo[WS(os, 11)]), VFNMSI(T1O, T1L), ovs, &(xo[WS(os, 1)]));
Chris@42 303 ST(&(xo[WS(os, 27)]), VFNMSI(T1Q, T1P), ovs, &(xo[WS(os, 1)]));
Chris@42 304 ST(&(xo[WS(os, 5)]), VFMAI(T1Q, T1P), ovs, &(xo[WS(os, 1)]));
Chris@42 305 ST(&(xo[WS(os, 29)]), VFMAI(T1I, T1H), ovs, &(xo[WS(os, 1)]));
Chris@42 306 ST(&(xo[WS(os, 3)]), VFNMSI(T1I, T1H), ovs, &(xo[WS(os, 1)]));
Chris@42 307 ST(&(xo[WS(os, 13)]), VFMAI(T1G, T1z), ovs, &(xo[WS(os, 1)]));
Chris@42 308 ST(&(xo[WS(os, 19)]), VFNMSI(T1G, T1z), ovs, &(xo[WS(os, 1)]));
Chris@42 309 T1d = VSUB(T1b, T1c);
Chris@42 310 T1i = VADD(T1b, T1c);
Chris@42 311 T10 = VADD(TI, TZ);
Chris@42 312 T1l = VSUB(TI, TZ);
Chris@42 313 {
Chris@42 314 V T1n, T1j, T1e, T1g, T1o, T1m, T11, T1f;
Chris@42 315 T1n = VFMA(LDK(KP980785280), T1i, T1h);
Chris@42 316 T1j = VFNMS(LDK(KP980785280), T1i, T1h);
Chris@42 317 T1e = VFNMS(LDK(KP980785280), T1d, T1a);
Chris@42 318 T1g = VFMA(LDK(KP980785280), T1d, T1a);
Chris@42 319 T1o = VFNMS(LDK(KP980785280), T1l, T1k);
Chris@42 320 T1m = VFMA(LDK(KP980785280), T1l, T1k);
Chris@42 321 T11 = VFNMS(LDK(KP980785280), T10, Tr);
Chris@42 322 T1f = VFMA(LDK(KP980785280), T10, Tr);
Chris@42 323 ST(&(xo[WS(os, 23)]), VFNMSI(T1m, T1j), ovs, &(xo[WS(os, 1)]));
Chris@42 324 ST(&(xo[WS(os, 9)]), VFMAI(T1m, T1j), ovs, &(xo[WS(os, 1)]));
Chris@42 325 ST(&(xo[WS(os, 25)]), VFMAI(T1o, T1n), ovs, &(xo[WS(os, 1)]));
Chris@42 326 ST(&(xo[WS(os, 7)]), VFNMSI(T1o, T1n), ovs, &(xo[WS(os, 1)]));
Chris@42 327 ST(&(xo[WS(os, 1)]), VFMAI(T1g, T1f), ovs, &(xo[WS(os, 1)]));
Chris@42 328 ST(&(xo[WS(os, 31)]), VFNMSI(T1g, T1f), ovs, &(xo[WS(os, 1)]));
Chris@42 329 ST(&(xo[WS(os, 17)]), VFMAI(T1e, T11), ovs, &(xo[WS(os, 1)]));
Chris@42 330 ST(&(xo[WS(os, 15)]), VFNMSI(T1e, T11), ovs, &(xo[WS(os, 1)]));
Chris@42 331 }
Chris@42 332 }
Chris@42 333 }
Chris@42 334 }
Chris@42 335 VLEAVE();
Chris@42 336 }
Chris@42 337
/* Codelet descriptor: transform size n = 32; operation counts
 * {adds, muls, fmas, other} = {88, 0, 98, 0}, matching the cost summary
 * in the generated header comment above. */
Chris@42 338 static const kdft_desc desc = { 32, XSIMD_STRING("n1bv_32"), {88, 0, 98, 0}, &GENUS, 0, 0, 0, 0 };
Chris@42 339
/* Registration entry point: hands the kernel and its descriptor to the
 * FFTW planner via the internal X(kdft_register) hook. */
Chris@42 340 void XSIMD(codelet_n1bv_32) (planner *p) {
Chris@42 341 X(kdft_register) (p, n1bv_32, &desc);
Chris@42 342 }
Chris@42 343
Chris@42 344 #else /* HAVE_FMA */
Chris@42 345
Chris@42 346 /* Generated by: ../../../genfft/gen_notw_c.native -simd -compact -variables 4 -pipeline-latency 8 -sign 1 -n 32 -name n1bv_32 -include n1b.h */
Chris@42 347
Chris@42 348 /*
Chris@42 349 * This function contains 186 FP additions, 42 FP multiplications,
Chris@42 350 * (or, 170 additions, 26 multiplications, 16 fused multiply/add),
Chris@42 351 * 58 stack variables, 7 constants, and 64 memory accesses
Chris@42 352 */
Chris@42 353 #include "n1b.h"
Chris@42 354
/* NOTE(review): machine-generated codelet (see "DO NOT EDIT" above); the
 * comments added below document behaviour only -- any functional change
 * must be made in the genfft generator, not in this file.
 * Non-FMA variant of the same VL-wide 32-point sign-(+1) complex DFT,
 * expressed with separate VMUL/VADD/VSUB instead of fused multiply-adds. */
Chris@42 355 static void n1bv_32(const R *ri, const R *ii, R *ro, R *io, stride is, stride os, INT v, INT ivs, INT ovs)
Chris@42 356 {
/* Vector constants: KP707106781 = cos(pi/4); KP923879532/KP382683432 =
 * cos/sin(pi/8); KP980785280/KP195090322 = cos/sin(pi/16);
 * KP831469612/KP555570233 = cos/sin(3*pi/16). */
Chris@42 357 DVK(KP555570233, +0.555570233019602224742830813948532874374937191);
Chris@42 358 DVK(KP831469612, +0.831469612302545237078788377617905756738560812);
Chris@42 359 DVK(KP195090322, +0.195090322016128267848284868477022240927691618);
Chris@42 360 DVK(KP980785280, +0.980785280403230449126182236134239036973933731);
Chris@42 361 DVK(KP382683432, +0.382683432365089771728459984030398866761344562);
Chris@42 362 DVK(KP923879532, +0.923879532511286756128183189396788286822416626);
Chris@42 363 DVK(KP707106781, +0.707106781186547524400844362104849039284835938);
Chris@42 364 {
Chris@42 365 INT i;
Chris@42 366 const R *xi;
Chris@42 367 R *xo;
/* Sign = +1 transform: works through the imaginary-part pointers
 * (xi = ii, xo = io); presumably the re/im swap implements the
 * inverse-transform sign convention -- see n1b.h. TODO(review): confirm. */
Chris@42 368 xi = ii;
Chris@42 369 xo = io;
/* Process VL transforms per iteration over the batch of v transforms,
 * stepping input/output by the vector strides ivs/ovs;
 * MAKE_VOLATILE_STRIDE defeats over-aggressive stride optimization. */
Chris@42 370 for (i = v; i > 0; i = i - VL, xi = xi + (VL * ivs), xo = xo + (VL * ovs), MAKE_VOLATILE_STRIDE(64, is), MAKE_VOLATILE_STRIDE(64, os)) {
Chris@42 371 V T2f, T2k, T2N, T2M, T19, T1B, Tb, T1p, TT, T1v, TY, T1w, T2E, T2F, T2G;
Chris@42 372 V T24, T2o, TC, T1s, TH, T1t, T2B, T2C, T2D, T1X, T2n, T2I, T2J, Tq, T1A;
Chris@42 373 V T14, T1q, T2c, T2l;
Chris@42 374 {
Chris@42 375 V T3, T2i, T18, T2j, T6, T2d, T9, T2e, T15, Ta;
Chris@42 376 {
Chris@42 377 V T1, T2, T16, T17;
Chris@42 378 T1 = LD(&(xi[0]), ivs, &(xi[0]));
Chris@42 379 T2 = LD(&(xi[WS(is, 16)]), ivs, &(xi[0]));
Chris@42 380 T3 = VSUB(T1, T2);
Chris@42 381 T2i = VADD(T1, T2);
Chris@42 382 T16 = LD(&(xi[WS(is, 8)]), ivs, &(xi[0]));
Chris@42 383 T17 = LD(&(xi[WS(is, 24)]), ivs, &(xi[0]));
Chris@42 384 T18 = VSUB(T16, T17);
Chris@42 385 T2j = VADD(T16, T17);
Chris@42 386 }
Chris@42 387 {
Chris@42 388 V T4, T5, T7, T8;
Chris@42 389 T4 = LD(&(xi[WS(is, 4)]), ivs, &(xi[0]));
Chris@42 390 T5 = LD(&(xi[WS(is, 20)]), ivs, &(xi[0]));
Chris@42 391 T6 = VSUB(T4, T5);
Chris@42 392 T2d = VADD(T4, T5);
Chris@42 393 T7 = LD(&(xi[WS(is, 28)]), ivs, &(xi[0]));
Chris@42 394 T8 = LD(&(xi[WS(is, 12)]), ivs, &(xi[0]));
Chris@42 395 T9 = VSUB(T7, T8);
Chris@42 396 T2e = VADD(T7, T8);
Chris@42 397 }
Chris@42 398 T2f = VSUB(T2d, T2e);
Chris@42 399 T2k = VSUB(T2i, T2j);
Chris@42 400 T2N = VADD(T2d, T2e);
Chris@42 401 T2M = VADD(T2i, T2j);
Chris@42 402 T15 = VMUL(LDK(KP707106781), VSUB(T6, T9));
Chris@42 403 T19 = VSUB(T15, T18);
Chris@42 404 T1B = VADD(T18, T15);
Chris@42 405 Ta = VMUL(LDK(KP707106781), VADD(T6, T9));
Chris@42 406 Tb = VSUB(T3, Ta);
Chris@42 407 T1p = VADD(T3, Ta);
Chris@42 408 }
Chris@42 409 {
Chris@42 410 V TL, T21, TW, T1Y, TO, T22, TS, T1Z;
Chris@42 411 {
Chris@42 412 V TJ, TK, TU, TV;
Chris@42 413 TJ = LD(&(xi[WS(is, 3)]), ivs, &(xi[WS(is, 1)]));
Chris@42 414 TK = LD(&(xi[WS(is, 19)]), ivs, &(xi[WS(is, 1)]));
Chris@42 415 TL = VSUB(TJ, TK);
Chris@42 416 T21 = VADD(TJ, TK);
Chris@42 417 TU = LD(&(xi[WS(is, 31)]), ivs, &(xi[WS(is, 1)]));
Chris@42 418 TV = LD(&(xi[WS(is, 15)]), ivs, &(xi[WS(is, 1)]));
Chris@42 419 TW = VSUB(TU, TV);
Chris@42 420 T1Y = VADD(TU, TV);
Chris@42 421 }
Chris@42 422 {
Chris@42 423 V TM, TN, TQ, TR;
Chris@42 424 TM = LD(&(xi[WS(is, 27)]), ivs, &(xi[WS(is, 1)]));
Chris@42 425 TN = LD(&(xi[WS(is, 11)]), ivs, &(xi[WS(is, 1)]));
Chris@42 426 TO = VSUB(TM, TN);
Chris@42 427 T22 = VADD(TM, TN);
Chris@42 428 TQ = LD(&(xi[WS(is, 7)]), ivs, &(xi[WS(is, 1)]));
Chris@42 429 TR = LD(&(xi[WS(is, 23)]), ivs, &(xi[WS(is, 1)]));
Chris@42 430 TS = VSUB(TQ, TR);
Chris@42 431 T1Z = VADD(TQ, TR);
Chris@42 432 }
Chris@42 433 {
Chris@42 434 V TP, TX, T20, T23;
Chris@42 435 TP = VMUL(LDK(KP707106781), VSUB(TL, TO));
Chris@42 436 TT = VSUB(TP, TS);
Chris@42 437 T1v = VADD(TS, TP);
Chris@42 438 TX = VMUL(LDK(KP707106781), VADD(TL, TO));
Chris@42 439 TY = VSUB(TW, TX);
Chris@42 440 T1w = VADD(TW, TX);
Chris@42 441 T2E = VADD(T1Y, T1Z);
Chris@42 442 T2F = VADD(T21, T22);
Chris@42 443 T2G = VSUB(T2E, T2F);
Chris@42 444 T20 = VSUB(T1Y, T1Z);
Chris@42 445 T23 = VSUB(T21, T22);
Chris@42 446 T24 = VFMA(LDK(KP923879532), T20, VMUL(LDK(KP382683432), T23));
Chris@42 447 T2o = VFNMS(LDK(KP382683432), T20, VMUL(LDK(KP923879532), T23));
Chris@42 448 }
Chris@42 449 }
Chris@42 450 {
Chris@42 451 V Tu, T1U, TF, T1R, Tx, T1V, TB, T1S;
Chris@42 452 {
Chris@42 453 V Ts, Tt, TD, TE;
Chris@42 454 Ts = LD(&(xi[WS(is, 5)]), ivs, &(xi[WS(is, 1)]));
Chris@42 455 Tt = LD(&(xi[WS(is, 21)]), ivs, &(xi[WS(is, 1)]));
Chris@42 456 Tu = VSUB(Ts, Tt);
Chris@42 457 T1U = VADD(Ts, Tt);
Chris@42 458 TD = LD(&(xi[WS(is, 1)]), ivs, &(xi[WS(is, 1)]));
Chris@42 459 TE = LD(&(xi[WS(is, 17)]), ivs, &(xi[WS(is, 1)]));
Chris@42 460 TF = VSUB(TD, TE);
Chris@42 461 T1R = VADD(TD, TE);
Chris@42 462 }
Chris@42 463 {
Chris@42 464 V Tv, Tw, Tz, TA;
Chris@42 465 Tv = LD(&(xi[WS(is, 29)]), ivs, &(xi[WS(is, 1)]));
Chris@42 466 Tw = LD(&(xi[WS(is, 13)]), ivs, &(xi[WS(is, 1)]));
Chris@42 467 Tx = VSUB(Tv, Tw);
Chris@42 468 T1V = VADD(Tv, Tw);
Chris@42 469 Tz = LD(&(xi[WS(is, 9)]), ivs, &(xi[WS(is, 1)]));
Chris@42 470 TA = LD(&(xi[WS(is, 25)]), ivs, &(xi[WS(is, 1)]));
Chris@42 471 TB = VSUB(Tz, TA);
Chris@42 472 T1S = VADD(Tz, TA);
Chris@42 473 }
Chris@42 474 {
Chris@42 475 V Ty, TG, T1T, T1W;
Chris@42 476 Ty = VMUL(LDK(KP707106781), VSUB(Tu, Tx));
Chris@42 477 TC = VSUB(Ty, TB);
Chris@42 478 T1s = VADD(TB, Ty);
Chris@42 479 TG = VMUL(LDK(KP707106781), VADD(Tu, Tx));
Chris@42 480 TH = VSUB(TF, TG);
Chris@42 481 T1t = VADD(TF, TG);
Chris@42 482 T2B = VADD(T1R, T1S);
Chris@42 483 T2C = VADD(T1U, T1V);
Chris@42 484 T2D = VSUB(T2B, T2C);
Chris@42 485 T1T = VSUB(T1R, T1S);
Chris@42 486 T1W = VSUB(T1U, T1V);
Chris@42 487 T1X = VFNMS(LDK(KP382683432), T1W, VMUL(LDK(KP923879532), T1T));
Chris@42 488 T2n = VFMA(LDK(KP382683432), T1T, VMUL(LDK(KP923879532), T1W));
Chris@42 489 }
Chris@42 490 }
Chris@42 491 {
Chris@42 492 V Te, T26, To, T29, Th, T27, Tl, T2a, Ti, Tp;
Chris@42 493 {
Chris@42 494 V Tc, Td, Tm, Tn;
Chris@42 495 Tc = LD(&(xi[WS(is, 2)]), ivs, &(xi[0]));
Chris@42 496 Td = LD(&(xi[WS(is, 18)]), ivs, &(xi[0]));
Chris@42 497 Te = VSUB(Tc, Td);
Chris@42 498 T26 = VADD(Tc, Td);
Chris@42 499 Tm = LD(&(xi[WS(is, 30)]), ivs, &(xi[0]));
Chris@42 500 Tn = LD(&(xi[WS(is, 14)]), ivs, &(xi[0]));
Chris@42 501 To = VSUB(Tm, Tn);
Chris@42 502 T29 = VADD(Tm, Tn);
Chris@42 503 }
Chris@42 504 {
Chris@42 505 V Tf, Tg, Tj, Tk;
Chris@42 506 Tf = LD(&(xi[WS(is, 10)]), ivs, &(xi[0]));
Chris@42 507 Tg = LD(&(xi[WS(is, 26)]), ivs, &(xi[0]));
Chris@42 508 Th = VSUB(Tf, Tg);
Chris@42 509 T27 = VADD(Tf, Tg);
Chris@42 510 Tj = LD(&(xi[WS(is, 6)]), ivs, &(xi[0]));
Chris@42 511 Tk = LD(&(xi[WS(is, 22)]), ivs, &(xi[0]));
Chris@42 512 Tl = VSUB(Tj, Tk);
Chris@42 513 T2a = VADD(Tj, Tk);
Chris@42 514 }
Chris@42 515 T2I = VADD(T26, T27);
Chris@42 516 T2J = VADD(T29, T2a);
Chris@42 517 Ti = VFMA(LDK(KP382683432), Te, VMUL(LDK(KP923879532), Th));
Chris@42 518 Tp = VFNMS(LDK(KP382683432), To, VMUL(LDK(KP923879532), Tl));
Chris@42 519 Tq = VSUB(Ti, Tp);
Chris@42 520 T1A = VADD(Ti, Tp);
Chris@42 521 {
Chris@42 522 V T12, T13, T28, T2b;
Chris@42 523 T12 = VFNMS(LDK(KP382683432), Th, VMUL(LDK(KP923879532), Te));
Chris@42 524 T13 = VFMA(LDK(KP923879532), To, VMUL(LDK(KP382683432), Tl));
Chris@42 525 T14 = VSUB(T12, T13);
Chris@42 526 T1q = VADD(T12, T13);
Chris@42 527 T28 = VSUB(T26, T27);
Chris@42 528 T2b = VSUB(T29, T2a);
Chris@42 529 T2c = VMUL(LDK(KP707106781), VSUB(T28, T2b));
Chris@42 530 T2l = VMUL(LDK(KP707106781), VADD(T28, T2b));
Chris@42 531 }
Chris@42 532 }
Chris@42 533 {
Chris@42 534 V T2L, T2R, T2Q, T2S;
Chris@42 535 {
Chris@42 536 V T2H, T2K, T2O, T2P;
Chris@42 537 T2H = VMUL(LDK(KP707106781), VSUB(T2D, T2G));
Chris@42 538 T2K = VSUB(T2I, T2J);
Chris@42 539 T2L = VBYI(VSUB(T2H, T2K));
Chris@42 540 T2R = VBYI(VADD(T2K, T2H));
Chris@42 541 T2O = VSUB(T2M, T2N);
Chris@42 542 T2P = VMUL(LDK(KP707106781), VADD(T2D, T2G));
Chris@42 543 T2Q = VSUB(T2O, T2P);
Chris@42 544 T2S = VADD(T2O, T2P);
Chris@42 545 }
Chris@42 546 ST(&(xo[WS(os, 12)]), VADD(T2L, T2Q), ovs, &(xo[0]));
Chris@42 547 ST(&(xo[WS(os, 28)]), VSUB(T2S, T2R), ovs, &(xo[0]));
Chris@42 548 ST(&(xo[WS(os, 20)]), VSUB(T2Q, T2L), ovs, &(xo[0]));
Chris@42 549 ST(&(xo[WS(os, 4)]), VADD(T2R, T2S), ovs, &(xo[0]));
Chris@42 550 }
Chris@42 551 {
Chris@42 552 V T2h, T2r, T2q, T2s;
Chris@42 553 {
Chris@42 554 V T25, T2g, T2m, T2p;
Chris@42 555 T25 = VSUB(T1X, T24);
Chris@42 556 T2g = VSUB(T2c, T2f);
Chris@42 557 T2h = VBYI(VSUB(T25, T2g));
Chris@42 558 T2r = VBYI(VADD(T2g, T25));
Chris@42 559 T2m = VSUB(T2k, T2l);
Chris@42 560 T2p = VSUB(T2n, T2o);
Chris@42 561 T2q = VSUB(T2m, T2p);
Chris@42 562 T2s = VADD(T2m, T2p);
Chris@42 563 }
Chris@42 564 ST(&(xo[WS(os, 10)]), VADD(T2h, T2q), ovs, &(xo[0]));
Chris@42 565 ST(&(xo[WS(os, 26)]), VSUB(T2s, T2r), ovs, &(xo[0]));
Chris@42 566 ST(&(xo[WS(os, 22)]), VSUB(T2q, T2h), ovs, &(xo[0]));
Chris@42 567 ST(&(xo[WS(os, 6)]), VADD(T2r, T2s), ovs, &(xo[0]));
Chris@42 568 }
Chris@42 569 {
Chris@42 570 V T2V, T2Z, T2Y, T30;
Chris@42 571 {
Chris@42 572 V T2T, T2U, T2W, T2X;
Chris@42 573 T2T = VADD(T2M, T2N);
Chris@42 574 T2U = VADD(T2I, T2J);
Chris@42 575 T2V = VSUB(T2T, T2U);
Chris@42 576 T2Z = VADD(T2T, T2U);
Chris@42 577 T2W = VADD(T2B, T2C);
Chris@42 578 T2X = VADD(T2E, T2F);
Chris@42 579 T2Y = VBYI(VSUB(T2W, T2X));
Chris@42 580 T30 = VADD(T2W, T2X);
Chris@42 581 }
Chris@42 582 ST(&(xo[WS(os, 24)]), VSUB(T2V, T2Y), ovs, &(xo[0]));
Chris@42 583 ST(&(xo[0]), VADD(T2Z, T30), ovs, &(xo[0]));
Chris@42 584 ST(&(xo[WS(os, 8)]), VADD(T2V, T2Y), ovs, &(xo[0]));
Chris@42 585 ST(&(xo[WS(os, 16)]), VSUB(T2Z, T30), ovs, &(xo[0]));
Chris@42 586 }
Chris@42 587 {
Chris@42 588 V T2v, T2z, T2y, T2A;
Chris@42 589 {
Chris@42 590 V T2t, T2u, T2w, T2x;
Chris@42 591 T2t = VADD(T2k, T2l);
Chris@42 592 T2u = VADD(T1X, T24);
Chris@42 593 T2v = VADD(T2t, T2u);
Chris@42 594 T2z = VSUB(T2t, T2u);
Chris@42 595 T2w = VADD(T2f, T2c);
Chris@42 596 T2x = VADD(T2n, T2o);
Chris@42 597 T2y = VBYI(VADD(T2w, T2x));
Chris@42 598 T2A = VBYI(VSUB(T2x, T2w));
Chris@42 599 }
Chris@42 600 ST(&(xo[WS(os, 30)]), VSUB(T2v, T2y), ovs, &(xo[0]));
Chris@42 601 ST(&(xo[WS(os, 14)]), VADD(T2z, T2A), ovs, &(xo[0]));
Chris@42 602 ST(&(xo[WS(os, 2)]), VADD(T2v, T2y), ovs, &(xo[0]));
Chris@42 603 ST(&(xo[WS(os, 18)]), VSUB(T2z, T2A), ovs, &(xo[0]));
Chris@42 604 }
Chris@42 605 {
Chris@42 606 V T1r, T1C, T1M, T1K, T1F, T1N, T1y, T1J;
Chris@42 607 T1r = VSUB(T1p, T1q);
Chris@42 608 T1C = VSUB(T1A, T1B);
Chris@42 609 T1M = VADD(T1p, T1q);
Chris@42 610 T1K = VADD(T1B, T1A);
Chris@42 611 {
Chris@42 612 V T1D, T1E, T1u, T1x;
Chris@42 613 T1D = VFNMS(LDK(KP195090322), T1s, VMUL(LDK(KP980785280), T1t));
Chris@42 614 T1E = VFMA(LDK(KP195090322), T1v, VMUL(LDK(KP980785280), T1w));
Chris@42 615 T1F = VSUB(T1D, T1E);
Chris@42 616 T1N = VADD(T1D, T1E);
Chris@42 617 T1u = VFMA(LDK(KP980785280), T1s, VMUL(LDK(KP195090322), T1t));
Chris@42 618 T1x = VFNMS(LDK(KP195090322), T1w, VMUL(LDK(KP980785280), T1v));
Chris@42 619 T1y = VSUB(T1u, T1x);
Chris@42 620 T1J = VADD(T1u, T1x);
Chris@42 621 }
Chris@42 622 {
Chris@42 623 V T1z, T1G, T1P, T1Q;
Chris@42 624 T1z = VADD(T1r, T1y);
Chris@42 625 T1G = VBYI(VADD(T1C, T1F));
Chris@42 626 ST(&(xo[WS(os, 25)]), VSUB(T1z, T1G), ovs, &(xo[WS(os, 1)]));
Chris@42 627 ST(&(xo[WS(os, 7)]), VADD(T1z, T1G), ovs, &(xo[WS(os, 1)]));
Chris@42 628 T1P = VBYI(VADD(T1K, T1J));
Chris@42 629 T1Q = VADD(T1M, T1N);
Chris@42 630 ST(&(xo[WS(os, 1)]), VADD(T1P, T1Q), ovs, &(xo[WS(os, 1)]));
Chris@42 631 ST(&(xo[WS(os, 31)]), VSUB(T1Q, T1P), ovs, &(xo[WS(os, 1)]));
Chris@42 632 }
Chris@42 633 {
Chris@42 634 V T1H, T1I, T1L, T1O;
Chris@42 635 T1H = VSUB(T1r, T1y);
Chris@42 636 T1I = VBYI(VSUB(T1F, T1C));
Chris@42 637 ST(&(xo[WS(os, 23)]), VSUB(T1H, T1I), ovs, &(xo[WS(os, 1)]));
Chris@42 638 ST(&(xo[WS(os, 9)]), VADD(T1H, T1I), ovs, &(xo[WS(os, 1)]));
Chris@42 639 T1L = VBYI(VSUB(T1J, T1K));
Chris@42 640 T1O = VSUB(T1M, T1N);
Chris@42 641 ST(&(xo[WS(os, 15)]), VADD(T1L, T1O), ovs, &(xo[WS(os, 1)]));
Chris@42 642 ST(&(xo[WS(os, 17)]), VSUB(T1O, T1L), ovs, &(xo[WS(os, 1)]));
Chris@42 643 }
Chris@42 644 }
Chris@42 645 {
Chris@42 646 V Tr, T1a, T1k, T1i, T1d, T1l, T10, T1h;
Chris@42 647 Tr = VSUB(Tb, Tq);
Chris@42 648 T1a = VSUB(T14, T19);
Chris@42 649 T1k = VADD(Tb, Tq);
Chris@42 650 T1i = VADD(T19, T14);
Chris@42 651 {
Chris@42 652 V T1b, T1c, TI, TZ;
Chris@42 653 T1b = VFNMS(LDK(KP555570233), TC, VMUL(LDK(KP831469612), TH));
Chris@42 654 T1c = VFMA(LDK(KP555570233), TT, VMUL(LDK(KP831469612), TY));
Chris@42 655 T1d = VSUB(T1b, T1c);
Chris@42 656 T1l = VADD(T1b, T1c);
Chris@42 657 TI = VFMA(LDK(KP831469612), TC, VMUL(LDK(KP555570233), TH));
Chris@42 658 TZ = VFNMS(LDK(KP555570233), TY, VMUL(LDK(KP831469612), TT));
Chris@42 659 T10 = VSUB(TI, TZ);
Chris@42 660 T1h = VADD(TI, TZ);
Chris@42 661 }
Chris@42 662 {
Chris@42 663 V T11, T1e, T1n, T1o;
Chris@42 664 T11 = VADD(Tr, T10);
Chris@42 665 T1e = VBYI(VADD(T1a, T1d));
Chris@42 666 ST(&(xo[WS(os, 27)]), VSUB(T11, T1e), ovs, &(xo[WS(os, 1)]));
Chris@42 667 ST(&(xo[WS(os, 5)]), VADD(T11, T1e), ovs, &(xo[WS(os, 1)]));
Chris@42 668 T1n = VBYI(VADD(T1i, T1h));
Chris@42 669 T1o = VADD(T1k, T1l);
Chris@42 670 ST(&(xo[WS(os, 3)]), VADD(T1n, T1o), ovs, &(xo[WS(os, 1)]));
Chris@42 671 ST(&(xo[WS(os, 29)]), VSUB(T1o, T1n), ovs, &(xo[WS(os, 1)]));
Chris@42 672 }
Chris@42 673 {
Chris@42 674 V T1f, T1g, T1j, T1m;
Chris@42 675 T1f = VSUB(Tr, T10);
Chris@42 676 T1g = VBYI(VSUB(T1d, T1a));
Chris@42 677 ST(&(xo[WS(os, 21)]), VSUB(T1f, T1g), ovs, &(xo[WS(os, 1)]));
Chris@42 678 ST(&(xo[WS(os, 11)]), VADD(T1f, T1g), ovs, &(xo[WS(os, 1)]));
Chris@42 679 T1j = VBYI(VSUB(T1h, T1i));
Chris@42 680 T1m = VSUB(T1k, T1l);
Chris@42 681 ST(&(xo[WS(os, 13)]), VADD(T1j, T1m), ovs, &(xo[WS(os, 1)]));
Chris@42 682 ST(&(xo[WS(os, 19)]), VSUB(T1m, T1j), ovs, &(xo[WS(os, 1)]));
Chris@42 683 }
Chris@42 684 }
Chris@42 685 }
Chris@42 686 }
Chris@42 687 VLEAVE();
Chris@42 688 }
Chris@42 689
/* Codelet descriptor: transform size n = 32; operation counts
 * {adds, muls, fmas, other} = {170, 26, 16, 0}, matching the cost summary
 * in the generated header comment above. */
Chris@42 690 static const kdft_desc desc = { 32, XSIMD_STRING("n1bv_32"), {170, 26, 16, 0}, &GENUS, 0, 0, 0, 0 };
Chris@42 691
/* Registration entry point (non-FMA build): hands the kernel and its
 * descriptor to the FFTW planner via the internal X(kdft_register) hook. */
Chris@42 692 void XSIMD(codelet_n1bv_32) (planner *p) {
Chris@42 693 X(kdft_register) (p, n1bv_32, &desc);
Chris@42 694 }
Chris@42 695
Chris@42 696 #endif /* HAVE_FMA */