annotate src/fftw-3.3.3/dft/simd/common/n1fv_32.c @ 83:ae30d91d2ffe

Replace these with versions built using an older toolset (so as to avoid ABI incompatibilities when linking on Ubuntu 14.04 for packaging purposes)
author Chris Cannam
date Fri, 07 Feb 2020 11:51:13 +0000
parents 37bf6b4a2645
children
/*
 * Copyright (c) 2003, 2007-11 Matteo Frigo
 * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 *
 */

/* This file was automatically generated --- DO NOT EDIT */
/* Generated on Sun Nov 25 07:36:54 EST 2012 */

#include "codelet-dft.h"

#ifdef HAVE_FMA

/* Generated by: ../../../genfft/gen_notw_c.native -fma -reorder-insns -schedule-for-pipeline -simd -compact -variables 4 -pipeline-latency 8 -n 32 -name n1fv_32 -include n1f.h */
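
/*
 * A reader's key to the SIMD macros used below (they are defined in FFTW's
 * simd-support headers; the expansions sketched here are approximate, since
 * the real definitions are target-specific):
 *
 *   VADD(a, b), VSUB(a, b)      elementwise a + b, a - b
 *   VMUL(a, b)                  elementwise a * b
 *   VFMA(a, b, c)               roughly a * b + c (fused multiply-add)
 *   VFNMS(a, b, c)              roughly c - a * b
 *   VFMAI(b, c), VFNMSI(b, c)   roughly c + i*b, c - i*b on packed complex data
 *   LD, ST                      strided vector load, store
 *   DVK, LDK                    define, load a vector constant
 */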

/*
 * This function contains 186 FP additions, 98 FP multiplications,
 * (or, 88 additions, 0 multiplications, 98 fused multiply/add),
 * 104 stack variables, 7 constants, and 64 memory accesses
 */
#include "n1f.h"

static void n1fv_32(const R *ri, const R *ii, R *ro, R *io, stride is, stride os, INT v, INT ivs, INT ovs)
{
     DVK(KP980785280, +0.980785280403230449126182236134239036973933731);
     DVK(KP831469612, +0.831469612302545237078788377617905756738560812);
     DVK(KP198912367, +0.198912367379658006911597622644676228597850501);
     DVK(KP668178637, +0.668178637919298919997757686523080761552472251);
     DVK(KP923879532, +0.923879532511286756128183189396788286822416626);
     DVK(KP414213562, +0.414213562373095048801688724209698078569671875);
     DVK(KP707106781, +0.707106781186547524400844362104849039284835938);
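     /* The constants above are the size-32 twiddle values: KP707106781 =
        cos(pi/4), KP923879532 = cos(pi/8), KP980785280 = cos(pi/16), and
        KP831469612 = cos(3*pi/16); KP414213562, KP198912367, and KP668178637
        are tan(pi/8), tan(pi/16), and tan(3*pi/16), the form genfft's FMA
        scheduling favors over separate sine constants. */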
     {
          INT i;
          const R *xi;
          R *xo;
          xi = ri;
          xo = ro;
          for (i = v; i > 0; i = i - VL, xi = xi + (VL * ivs), xo = xo + (VL * ovs), MAKE_VOLATILE_STRIDE(64, is), MAKE_VOLATILE_STRIDE(64, os)) {
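               /* Each pass through the loop transforms VL vectors' worth of
                  size-32 DFTs, stepping the pointers by the vector strides;
                  MAKE_VOLATILE_STRIDE is a hint that discourages the compiler
                  from over-optimizing the stride arithmetic. */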
               V T1h, Tr, T1a, T1k, TI, T1b, T1L, T1P, T1I, T1G, T1O, T1Q, T1H, T1z, T1c;
               V TZ;
               {
                    V T2x, T1T, T2K, T1W, T1p, Tb, T1A, T16, Tu, TF, T2N, T2H, T2b, T2t, TY;
                    V T1w, TT, T1v, T20, T2C, Tj, Te, T2h, To, T2f, T23, T2D, TB, TG, Th;
                    V T2i, Tk;
                    {
                         V TL, TW, TP, TQ, T2F, T27, T28, TO;
                         {
                              V T1, T2, T12, T13, T4, T5, T7, T8;
                              T1 = LD(&(xi[0]), ivs, &(xi[0]));
                              T2 = LD(&(xi[WS(is, 16)]), ivs, &(xi[0]));
                              T12 = LD(&(xi[WS(is, 8)]), ivs, &(xi[0]));
                              T13 = LD(&(xi[WS(is, 24)]), ivs, &(xi[0]));
                              T4 = LD(&(xi[WS(is, 4)]), ivs, &(xi[0]));
                              T5 = LD(&(xi[WS(is, 20)]), ivs, &(xi[0]));
                              T7 = LD(&(xi[WS(is, 28)]), ivs, &(xi[0]));
                              T8 = LD(&(xi[WS(is, 12)]), ivs, &(xi[0]));
                              {
                                   V TM, T25, T26, TN;
                                   {
                                        V TJ, T3, T14, T1U, T6, T1V, T9, TK, TU, TV, T1R, T1S, Ta, T15;
                                        TJ = LD(&(xi[WS(is, 31)]), ivs, &(xi[WS(is, 1)]));
                                        T1R = VADD(T1, T2);
                                        T3 = VSUB(T1, T2);
                                        T1S = VADD(T12, T13);
                                        T14 = VSUB(T12, T13);
                                        T1U = VADD(T4, T5);
                                        T6 = VSUB(T4, T5);
                                        T1V = VADD(T7, T8);
                                        T9 = VSUB(T7, T8);
                                        TK = LD(&(xi[WS(is, 15)]), ivs, &(xi[WS(is, 1)]));
                                        TU = LD(&(xi[WS(is, 23)]), ivs, &(xi[WS(is, 1)]));
                                        T2x = VSUB(T1R, T1S);
                                        T1T = VADD(T1R, T1S);
                                        TV = LD(&(xi[WS(is, 7)]), ivs, &(xi[WS(is, 1)]));
                                        TM = LD(&(xi[WS(is, 3)]), ivs, &(xi[WS(is, 1)]));
                                        T2K = VSUB(T1V, T1U);
                                        T1W = VADD(T1U, T1V);
                                        Ta = VADD(T6, T9);
                                        T15 = VSUB(T9, T6);
                                        T25 = VADD(TJ, TK);
                                        TL = VSUB(TJ, TK);
                                        T26 = VADD(TV, TU);
                                        TW = VSUB(TU, TV);
                                        TN = LD(&(xi[WS(is, 19)]), ivs, &(xi[WS(is, 1)]));
                                        TP = LD(&(xi[WS(is, 27)]), ivs, &(xi[WS(is, 1)]));
                                        T1p = VFNMS(LDK(KP707106781), Ta, T3);
                                        Tb = VFMA(LDK(KP707106781), Ta, T3);
                                        T1A = VFMA(LDK(KP707106781), T15, T14);
                                        T16 = VFNMS(LDK(KP707106781), T15, T14);
                                        TQ = LD(&(xi[WS(is, 11)]), ivs, &(xi[WS(is, 1)]));
                                   }
                                   T2F = VSUB(T25, T26);
                                   T27 = VADD(T25, T26);
                                   T28 = VADD(TM, TN);
                                   TO = VSUB(TM, TN);
                              }
                         }
                         {
                              V Ty, T21, Tx, Tz, T1Y, T1Z;
                              {
                                   V Ts, Tt, TD, T29, TR, TE, Tv, Tw;
                                   Ts = LD(&(xi[WS(is, 1)]), ivs, &(xi[WS(is, 1)]));
                                   Tt = LD(&(xi[WS(is, 17)]), ivs, &(xi[WS(is, 1)]));
                                   TD = LD(&(xi[WS(is, 9)]), ivs, &(xi[WS(is, 1)]));
                                   T29 = VADD(TP, TQ);
                                   TR = VSUB(TP, TQ);
                                   TE = LD(&(xi[WS(is, 25)]), ivs, &(xi[WS(is, 1)]));
                                   Tv = LD(&(xi[WS(is, 5)]), ivs, &(xi[WS(is, 1)]));
                                   Tw = LD(&(xi[WS(is, 21)]), ivs, &(xi[WS(is, 1)]));
                                   Ty = LD(&(xi[WS(is, 29)]), ivs, &(xi[WS(is, 1)]));
                                   T1Y = VADD(Ts, Tt);
                                   Tu = VSUB(Ts, Tt);
                                   {
                                        V T2G, T2a, TX, TS;
                                        T2G = VSUB(T29, T28);
                                        T2a = VADD(T28, T29);
                                        TX = VSUB(TR, TO);
                                        TS = VADD(TO, TR);
                                        T1Z = VADD(TD, TE);
                                        TF = VSUB(TD, TE);
                                        T21 = VADD(Tv, Tw);
                                        Tx = VSUB(Tv, Tw);
                                        T2N = VFMA(LDK(KP414213562), T2F, T2G);
                                        T2H = VFNMS(LDK(KP414213562), T2G, T2F);
                                        T2b = VSUB(T27, T2a);
                                        T2t = VADD(T27, T2a);
                                        TY = VFMA(LDK(KP707106781), TX, TW);
                                        T1w = VFNMS(LDK(KP707106781), TX, TW);
                                        TT = VFMA(LDK(KP707106781), TS, TL);
                                        T1v = VFNMS(LDK(KP707106781), TS, TL);
                                        Tz = LD(&(xi[WS(is, 13)]), ivs, &(xi[WS(is, 1)]));
                                   }
                              }
                              T20 = VADD(T1Y, T1Z);
                              T2C = VSUB(T1Y, T1Z);
                              {
                                   V Tc, Td, Tm, Tn;
                                   Tc = LD(&(xi[WS(is, 2)]), ivs, &(xi[0]));
                                   Td = LD(&(xi[WS(is, 18)]), ivs, &(xi[0]));
                                   Tm = LD(&(xi[WS(is, 22)]), ivs, &(xi[0]));
                                   Tn = LD(&(xi[WS(is, 6)]), ivs, &(xi[0]));
                                   {
                                        V Tf, TA, T22, Tg;
                                        Tf = LD(&(xi[WS(is, 10)]), ivs, &(xi[0]));
                                        TA = VSUB(Ty, Tz);
                                        T22 = VADD(Ty, Tz);
                                        Tg = LD(&(xi[WS(is, 26)]), ivs, &(xi[0]));
                                        Tj = LD(&(xi[WS(is, 30)]), ivs, &(xi[0]));
                                        Te = VSUB(Tc, Td);
                                        T2h = VADD(Tc, Td);
                                        To = VSUB(Tm, Tn);
                                        T2f = VADD(Tn, Tm);
                                        T23 = VADD(T21, T22);
                                        T2D = VSUB(T21, T22);
                                        TB = VADD(Tx, TA);
                                        TG = VSUB(Tx, TA);
                                        Th = VSUB(Tf, Tg);
                                        T2i = VADD(Tf, Tg);
                                        Tk = LD(&(xi[WS(is, 14)]), ivs, &(xi[0]));
                                   }
                              }
                         }
                    }
                    {
                         V T1t, TH, T1s, TC, T2P, T2U, T2n, T2d, T2w, T2u, T1q, T19, T1B, Tq, T2W;
                         V T2M, T2B, T2T, T2v, T2r, T2o, T2m, T2X, T2I;
                         {
                              V T1X, T2p, T2E, T2O, T2s, T2y, T2j, T17, Ti, T2e, Tl, T2c, T2l, T24;
                              T1X = VSUB(T1T, T1W);
                              T2p = VADD(T1T, T1W);
                              T2E = VFNMS(LDK(KP414213562), T2D, T2C);
                              T2O = VFMA(LDK(KP414213562), T2C, T2D);
                              T2s = VADD(T20, T23);
                              T24 = VSUB(T20, T23);
                              T1t = VFNMS(LDK(KP707106781), TG, TF);
                              TH = VFMA(LDK(KP707106781), TG, TF);
                              T1s = VFNMS(LDK(KP707106781), TB, Tu);
                              TC = VFMA(LDK(KP707106781), TB, Tu);
                              T2y = VSUB(T2h, T2i);
                              T2j = VADD(T2h, T2i);
                              T17 = VFMA(LDK(KP414213562), Te, Th);
                              Ti = VFNMS(LDK(KP414213562), Th, Te);
                              T2e = VADD(Tj, Tk);
                              Tl = VSUB(Tj, Tk);
                              T2c = VADD(T24, T2b);
                              T2l = VSUB(T2b, T24);
                              {
                                   V T2L, T2A, T2q, T2k;
                                   T2P = VSUB(T2N, T2O);
                                   T2U = VADD(T2O, T2N);
                                   {
                                        V T2z, T2g, T18, Tp;
                                        T2z = VSUB(T2e, T2f);
                                        T2g = VADD(T2e, T2f);
                                        T18 = VFMA(LDK(KP414213562), Tl, To);
                                        Tp = VFNMS(LDK(KP414213562), To, Tl);
                                        T2n = VFMA(LDK(KP707106781), T2c, T1X);
                                        T2d = VFNMS(LDK(KP707106781), T2c, T1X);
                                        T2w = VSUB(T2t, T2s);
                                        T2u = VADD(T2s, T2t);
                                        T2L = VSUB(T2z, T2y);
                                        T2A = VADD(T2y, T2z);
                                        T2q = VADD(T2j, T2g);
                                        T2k = VSUB(T2g, T2j);
                                        T1q = VADD(T17, T18);
                                        T19 = VSUB(T17, T18);
                                        T1B = VSUB(Tp, Ti);
                                        Tq = VADD(Ti, Tp);
                                   }
                                   T2W = VFNMS(LDK(KP707106781), T2L, T2K);
                                   T2M = VFMA(LDK(KP707106781), T2L, T2K);
                                   T2B = VFMA(LDK(KP707106781), T2A, T2x);
                                   T2T = VFNMS(LDK(KP707106781), T2A, T2x);
                                   T2v = VSUB(T2p, T2q);
                                   T2r = VADD(T2p, T2q);
                                   T2o = VFMA(LDK(KP707106781), T2l, T2k);
                                   T2m = VFNMS(LDK(KP707106781), T2l, T2k);
                                   T2X = VSUB(T2H, T2E);
                                   T2I = VADD(T2E, T2H);
                              }
                         }
                         {
                              V T2V, T2Z, T2Y, T30, T2R, T2J;
                              T2V = VFNMS(LDK(KP923879532), T2U, T2T);
                              T2Z = VFMA(LDK(KP923879532), T2U, T2T);
                              ST(&(xo[WS(os, 24)]), VFNMSI(T2w, T2v), ovs, &(xo[0]));
                              ST(&(xo[WS(os, 8)]), VFMAI(T2w, T2v), ovs, &(xo[0]));
                              ST(&(xo[0]), VADD(T2r, T2u), ovs, &(xo[0]));
                              ST(&(xo[WS(os, 16)]), VSUB(T2r, T2u), ovs, &(xo[0]));
                              ST(&(xo[WS(os, 28)]), VFNMSI(T2o, T2n), ovs, &(xo[0]));
                              ST(&(xo[WS(os, 4)]), VFMAI(T2o, T2n), ovs, &(xo[0]));
                              ST(&(xo[WS(os, 20)]), VFMAI(T2m, T2d), ovs, &(xo[0]));
                              ST(&(xo[WS(os, 12)]), VFNMSI(T2m, T2d), ovs, &(xo[0]));
                              T2Y = VFMA(LDK(KP923879532), T2X, T2W);
                              T30 = VFNMS(LDK(KP923879532), T2X, T2W);
                              T2R = VFMA(LDK(KP923879532), T2I, T2B);
                              T2J = VFNMS(LDK(KP923879532), T2I, T2B);
                              {
                                   V T1J, T1r, T1C, T1M, T2S, T2Q, T1u, T1D, T1E, T1x;
                                   T1J = VFNMS(LDK(KP923879532), T1q, T1p);
                                   T1r = VFMA(LDK(KP923879532), T1q, T1p);
                                   T1C = VFMA(LDK(KP923879532), T1B, T1A);
                                   T1M = VFNMS(LDK(KP923879532), T1B, T1A);
                                   ST(&(xo[WS(os, 6)]), VFNMSI(T30, T2Z), ovs, &(xo[0]));
                                   ST(&(xo[WS(os, 26)]), VFMAI(T30, T2Z), ovs, &(xo[0]));
                                   ST(&(xo[WS(os, 22)]), VFNMSI(T2Y, T2V), ovs, &(xo[0]));
                                   ST(&(xo[WS(os, 10)]), VFMAI(T2Y, T2V), ovs, &(xo[0]));
                                   T2S = VFMA(LDK(KP923879532), T2P, T2M);
                                   T2Q = VFNMS(LDK(KP923879532), T2P, T2M);
                                   T1u = VFMA(LDK(KP668178637), T1t, T1s);
                                   T1D = VFNMS(LDK(KP668178637), T1s, T1t);
                                   T1E = VFNMS(LDK(KP668178637), T1v, T1w);
                                   T1x = VFMA(LDK(KP668178637), T1w, T1v);
                                   {
                                        V T1K, T1F, T1N, T1y;
                                        T1h = VFNMS(LDK(KP923879532), Tq, Tb);
                                        Tr = VFMA(LDK(KP923879532), Tq, Tb);
                                        ST(&(xo[WS(os, 30)]), VFNMSI(T2S, T2R), ovs, &(xo[0]));
                                        ST(&(xo[WS(os, 2)]), VFMAI(T2S, T2R), ovs, &(xo[0]));
                                        ST(&(xo[WS(os, 18)]), VFMAI(T2Q, T2J), ovs, &(xo[0]));
                                        ST(&(xo[WS(os, 14)]), VFNMSI(T2Q, T2J), ovs, &(xo[0]));
                                        T1K = VADD(T1D, T1E);
                                        T1F = VSUB(T1D, T1E);
                                        T1N = VSUB(T1x, T1u);
                                        T1y = VADD(T1u, T1x);
                                        T1a = VFMA(LDK(KP923879532), T19, T16);
                                        T1k = VFNMS(LDK(KP923879532), T19, T16);
                                        TI = VFNMS(LDK(KP198912367), TH, TC);
                                        T1b = VFMA(LDK(KP198912367), TC, TH);
                                        T1L = VFMA(LDK(KP831469612), T1K, T1J);
                                        T1P = VFNMS(LDK(KP831469612), T1K, T1J);
                                        T1I = VFMA(LDK(KP831469612), T1F, T1C);
                                        T1G = VFNMS(LDK(KP831469612), T1F, T1C);
                                        T1O = VFMA(LDK(KP831469612), T1N, T1M);
                                        T1Q = VFNMS(LDK(KP831469612), T1N, T1M);
                                        T1H = VFMA(LDK(KP831469612), T1y, T1r);
                                        T1z = VFNMS(LDK(KP831469612), T1y, T1r);
                                        T1c = VFMA(LDK(KP198912367), TT, TY);
                                        TZ = VFNMS(LDK(KP198912367), TY, TT);
                                   }
                              }
                         }
                    }
               }
               {
                    V T1d, T1i, T10, T1l;
                    ST(&(xo[WS(os, 21)]), VFNMSI(T1O, T1L), ovs, &(xo[WS(os, 1)]));
                    ST(&(xo[WS(os, 11)]), VFMAI(T1O, T1L), ovs, &(xo[WS(os, 1)]));
                    ST(&(xo[WS(os, 27)]), VFMAI(T1Q, T1P), ovs, &(xo[WS(os, 1)]));
                    ST(&(xo[WS(os, 5)]), VFNMSI(T1Q, T1P), ovs, &(xo[WS(os, 1)]));
                    ST(&(xo[WS(os, 3)]), VFMAI(T1I, T1H), ovs, &(xo[WS(os, 1)]));
                    ST(&(xo[WS(os, 29)]), VFNMSI(T1I, T1H), ovs, &(xo[WS(os, 1)]));
                    ST(&(xo[WS(os, 19)]), VFMAI(T1G, T1z), ovs, &(xo[WS(os, 1)]));
                    ST(&(xo[WS(os, 13)]), VFNMSI(T1G, T1z), ovs, &(xo[WS(os, 1)]));
                    T1d = VSUB(T1b, T1c);
                    T1i = VADD(T1b, T1c);
                    T10 = VADD(TI, TZ);
                    T1l = VSUB(TZ, TI);
                    {
                         V T1n, T1j, T1e, T1g, T1o, T1m, T11, T1f;
                         T1n = VFMA(LDK(KP980785280), T1i, T1h);
                         T1j = VFNMS(LDK(KP980785280), T1i, T1h);
                         T1e = VFNMS(LDK(KP980785280), T1d, T1a);
                         T1g = VFMA(LDK(KP980785280), T1d, T1a);
                         T1o = VFMA(LDK(KP980785280), T1l, T1k);
                         T1m = VFNMS(LDK(KP980785280), T1l, T1k);
                         T11 = VFNMS(LDK(KP980785280), T10, Tr);
                         T1f = VFMA(LDK(KP980785280), T10, Tr);
                         ST(&(xo[WS(os, 23)]), VFMAI(T1m, T1j), ovs, &(xo[WS(os, 1)]));
                         ST(&(xo[WS(os, 9)]), VFNMSI(T1m, T1j), ovs, &(xo[WS(os, 1)]));
                         ST(&(xo[WS(os, 25)]), VFNMSI(T1o, T1n), ovs, &(xo[WS(os, 1)]));
                         ST(&(xo[WS(os, 7)]), VFMAI(T1o, T1n), ovs, &(xo[WS(os, 1)]));
                         ST(&(xo[WS(os, 31)]), VFMAI(T1g, T1f), ovs, &(xo[WS(os, 1)]));
                         ST(&(xo[WS(os, 1)]), VFNMSI(T1g, T1f), ovs, &(xo[WS(os, 1)]));
                         ST(&(xo[WS(os, 15)]), VFMAI(T1e, T11), ovs, &(xo[WS(os, 1)]));
                         ST(&(xo[WS(os, 17)]), VFNMSI(T1e, T11), ovs, &(xo[WS(os, 1)]));
                    }
               }
          }
     }
     VLEAVE();
}

static const kdft_desc desc = { 32, XSIMD_STRING("n1fv_32"), {88, 0, 98, 0}, &GENUS, 0, 0, 0, 0 };
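/* The ops field {88, 0, 98, 0} restates the counts from the header comment:
   additions, multiplications, fused multiply-adds, other. */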

void XSIMD(codelet_n1fv_32) (planner *p) {
     X(kdft_register) (p, n1fv_32, &desc);
}
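
/*
 * Applications never call n1fv_32 directly: the registration above hands the
 * codelet to the planner, which may select it for contiguous size-32 forward
 * complex DFTs (whether this particular codelet wins depends on the
 * configured SIMD support, the precision, and the planner flags).  A minimal
 * sketch of what exercises it through the public API:
 *
 *     fftw_complex *in = fftw_malloc(32 * sizeof(fftw_complex));
 *     fftw_complex *out = fftw_malloc(32 * sizeof(fftw_complex));
 *     fftw_plan p = fftw_plan_dft_1d(32, in, out, FFTW_FORWARD, FFTW_ESTIMATE);
 *     fftw_execute(p);
 *     fftw_destroy_plan(p);
 *     fftw_free(in);
 *     fftw_free(out);
 */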

#else /* HAVE_FMA */

/* Generated by: ../../../genfft/gen_notw_c.native -simd -compact -variables 4 -pipeline-latency 8 -n 32 -name n1fv_32 -include n1f.h */
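
/* Without -fma the generator emits explicit VMUL/VADD pairs with separate
   sine/cosine constants; VBYI(x) below multiplies a vector by i and carries
   the imaginary cross terms. */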

/*
 * This function contains 186 FP additions, 42 FP multiplications,
 * (or, 170 additions, 26 multiplications, 16 fused multiply/add),
 * 58 stack variables, 7 constants, and 64 memory accesses
 */
#include "n1f.h"

static void n1fv_32(const R *ri, const R *ii, R *ro, R *io, stride is, stride os, INT v, INT ivs, INT ovs)
{
     DVK(KP195090322, +0.195090322016128267848284868477022240927691618);
     DVK(KP980785280, +0.980785280403230449126182236134239036973933731);
     DVK(KP555570233, +0.555570233019602224742830813948532874374937191);
     DVK(KP831469612, +0.831469612302545237078788377617905756738560812);
     DVK(KP382683432, +0.382683432365089771728459984030398866761344562);
     DVK(KP923879532, +0.923879532511286756128183189396788286822416626);
     DVK(KP707106781, +0.707106781186547524400844362104849039284835938);
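     /* Here the twiddles appear directly as cosine/sine pairs:
        KP980785280/KP195090322 = cos/sin(pi/16), KP923879532/KP382683432 =
        cos/sin(pi/8), KP831469612/KP555570233 = cos/sin(3*pi/16), and
        KP707106781 = cos(pi/4). */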
     {
          INT i;
          const R *xi;
          R *xo;
          xi = ri;
          xo = ro;
          for (i = v; i > 0; i = i - VL, xi = xi + (VL * ivs), xo = xo + (VL * ovs), MAKE_VOLATILE_STRIDE(64, is), MAKE_VOLATILE_STRIDE(64, os)) {
               V T1T, T1W, T2K, T2x, T16, T1A, Tb, T1p, TT, T1v, TY, T1w, T27, T2a, T2b;
               V T2H, T2O, TC, T1s, TH, T1t, T20, T23, T24, T2E, T2N, T2g, T2j, Tq, T1B;
               V T19, T1q, T2A, T2L;
               {
                    V T3, T1R, T15, T1S, T6, T1U, T9, T1V, T12, Ta;
                    {
                         V T1, T2, T13, T14;
                         T1 = LD(&(xi[0]), ivs, &(xi[0]));
                         T2 = LD(&(xi[WS(is, 16)]), ivs, &(xi[0]));
                         T3 = VSUB(T1, T2);
                         T1R = VADD(T1, T2);
                         T13 = LD(&(xi[WS(is, 8)]), ivs, &(xi[0]));
                         T14 = LD(&(xi[WS(is, 24)]), ivs, &(xi[0]));
                         T15 = VSUB(T13, T14);
                         T1S = VADD(T13, T14);
                    }
                    {
                         V T4, T5, T7, T8;
                         T4 = LD(&(xi[WS(is, 4)]), ivs, &(xi[0]));
                         T5 = LD(&(xi[WS(is, 20)]), ivs, &(xi[0]));
                         T6 = VSUB(T4, T5);
                         T1U = VADD(T4, T5);
                         T7 = LD(&(xi[WS(is, 28)]), ivs, &(xi[0]));
                         T8 = LD(&(xi[WS(is, 12)]), ivs, &(xi[0]));
                         T9 = VSUB(T7, T8);
                         T1V = VADD(T7, T8);
                    }
                    T1T = VADD(T1R, T1S);
                    T1W = VADD(T1U, T1V);
                    T2K = VSUB(T1V, T1U);
                    T2x = VSUB(T1R, T1S);
                    T12 = VMUL(LDK(KP707106781), VSUB(T9, T6));
                    T16 = VSUB(T12, T15);
                    T1A = VADD(T15, T12);
                    Ta = VMUL(LDK(KP707106781), VADD(T6, T9));
                    Tb = VADD(T3, Ta);
                    T1p = VSUB(T3, Ta);
               }
               {
                    V TL, T25, TX, T26, TO, T28, TR, T29;
                    {
                         V TJ, TK, TV, TW;
                         TJ = LD(&(xi[WS(is, 31)]), ivs, &(xi[WS(is, 1)]));
                         TK = LD(&(xi[WS(is, 15)]), ivs, &(xi[WS(is, 1)]));
                         TL = VSUB(TJ, TK);
                         T25 = VADD(TJ, TK);
                         TV = LD(&(xi[WS(is, 7)]), ivs, &(xi[WS(is, 1)]));
                         TW = LD(&(xi[WS(is, 23)]), ivs, &(xi[WS(is, 1)]));
                         TX = VSUB(TV, TW);
                         T26 = VADD(TV, TW);
                    }
                    {
                         V TM, TN, TP, TQ;
                         TM = LD(&(xi[WS(is, 3)]), ivs, &(xi[WS(is, 1)]));
                         TN = LD(&(xi[WS(is, 19)]), ivs, &(xi[WS(is, 1)]));
                         TO = VSUB(TM, TN);
                         T28 = VADD(TM, TN);
                         TP = LD(&(xi[WS(is, 27)]), ivs, &(xi[WS(is, 1)]));
                         TQ = LD(&(xi[WS(is, 11)]), ivs, &(xi[WS(is, 1)]));
                         TR = VSUB(TP, TQ);
                         T29 = VADD(TP, TQ);
                    }
                    {
                         V TS, TU, T2F, T2G;
                         TS = VMUL(LDK(KP707106781), VADD(TO, TR));
                         TT = VADD(TL, TS);
                         T1v = VSUB(TL, TS);
                         TU = VMUL(LDK(KP707106781), VSUB(TR, TO));
                         TY = VSUB(TU, TX);
                         T1w = VADD(TX, TU);
                         T27 = VADD(T25, T26);
                         T2a = VADD(T28, T29);
                         T2b = VSUB(T27, T2a);
                         T2F = VSUB(T25, T26);
                         T2G = VSUB(T29, T28);
                         T2H = VFNMS(LDK(KP382683432), T2G, VMUL(LDK(KP923879532), T2F));
                         T2O = VFMA(LDK(KP382683432), T2F, VMUL(LDK(KP923879532), T2G));
                    }
               }
               {
                    V Tu, T1Y, TG, T1Z, Tx, T21, TA, T22;
                    {
                         V Ts, Tt, TE, TF;
                         Ts = LD(&(xi[WS(is, 1)]), ivs, &(xi[WS(is, 1)]));
                         Tt = LD(&(xi[WS(is, 17)]), ivs, &(xi[WS(is, 1)]));
                         Tu = VSUB(Ts, Tt);
                         T1Y = VADD(Ts, Tt);
                         TE = LD(&(xi[WS(is, 9)]), ivs, &(xi[WS(is, 1)]));
                         TF = LD(&(xi[WS(is, 25)]), ivs, &(xi[WS(is, 1)]));
                         TG = VSUB(TE, TF);
                         T1Z = VADD(TE, TF);
                    }
                    {
                         V Tv, Tw, Ty, Tz;
                         Tv = LD(&(xi[WS(is, 5)]), ivs, &(xi[WS(is, 1)]));
                         Tw = LD(&(xi[WS(is, 21)]), ivs, &(xi[WS(is, 1)]));
                         Tx = VSUB(Tv, Tw);
                         T21 = VADD(Tv, Tw);
                         Ty = LD(&(xi[WS(is, 29)]), ivs, &(xi[WS(is, 1)]));
                         Tz = LD(&(xi[WS(is, 13)]), ivs, &(xi[WS(is, 1)]));
                         TA = VSUB(Ty, Tz);
                         T22 = VADD(Ty, Tz);
                    }
                    {
                         V TB, TD, T2C, T2D;
                         TB = VMUL(LDK(KP707106781), VADD(Tx, TA));
                         TC = VADD(Tu, TB);
                         T1s = VSUB(Tu, TB);
                         TD = VMUL(LDK(KP707106781), VSUB(TA, Tx));
                         TH = VSUB(TD, TG);
                         T1t = VADD(TG, TD);
                         T20 = VADD(T1Y, T1Z);
                         T23 = VADD(T21, T22);
                         T24 = VSUB(T20, T23);
                         T2C = VSUB(T1Y, T1Z);
                         T2D = VSUB(T22, T21);
                         T2E = VFMA(LDK(KP923879532), T2C, VMUL(LDK(KP382683432), T2D));
                         T2N = VFNMS(LDK(KP382683432), T2C, VMUL(LDK(KP923879532), T2D));
                    }
               }
               {
                    V Te, T2h, To, T2f, Th, T2i, Tl, T2e, Ti, Tp;
                    {
                         V Tc, Td, Tm, Tn;
                         Tc = LD(&(xi[WS(is, 2)]), ivs, &(xi[0]));
                         Td = LD(&(xi[WS(is, 18)]), ivs, &(xi[0]));
                         Te = VSUB(Tc, Td);
                         T2h = VADD(Tc, Td);
                         Tm = LD(&(xi[WS(is, 6)]), ivs, &(xi[0]));
                         Tn = LD(&(xi[WS(is, 22)]), ivs, &(xi[0]));
                         To = VSUB(Tm, Tn);
                         T2f = VADD(Tm, Tn);
                    }
                    {
                         V Tf, Tg, Tj, Tk;
                         Tf = LD(&(xi[WS(is, 10)]), ivs, &(xi[0]));
                         Tg = LD(&(xi[WS(is, 26)]), ivs, &(xi[0]));
                         Th = VSUB(Tf, Tg);
                         T2i = VADD(Tf, Tg);
                         Tj = LD(&(xi[WS(is, 30)]), ivs, &(xi[0]));
                         Tk = LD(&(xi[WS(is, 14)]), ivs, &(xi[0]));
                         Tl = VSUB(Tj, Tk);
                         T2e = VADD(Tj, Tk);
                    }
                    T2g = VADD(T2e, T2f);
                    T2j = VADD(T2h, T2i);
                    Ti = VFNMS(LDK(KP382683432), Th, VMUL(LDK(KP923879532), Te));
                    Tp = VFMA(LDK(KP923879532), Tl, VMUL(LDK(KP382683432), To));
                    Tq = VADD(Ti, Tp);
                    T1B = VSUB(Tp, Ti);
                    {
                         V T17, T18, T2y, T2z;
                         T17 = VFNMS(LDK(KP923879532), To, VMUL(LDK(KP382683432), Tl));
                         T18 = VFMA(LDK(KP382683432), Te, VMUL(LDK(KP923879532), Th));
                         T19 = VSUB(T17, T18);
                         T1q = VADD(T18, T17);
                         T2y = VSUB(T2h, T2i);
                         T2z = VSUB(T2e, T2f);
                         T2A = VMUL(LDK(KP707106781), VADD(T2y, T2z));
                         T2L = VMUL(LDK(KP707106781), VSUB(T2z, T2y));
                    }
               }
               {
                    V T2d, T2n, T2m, T2o;
                    {
                         V T1X, T2c, T2k, T2l;
                         T1X = VSUB(T1T, T1W);
                         T2c = VMUL(LDK(KP707106781), VADD(T24, T2b));
                         T2d = VADD(T1X, T2c);
                         T2n = VSUB(T1X, T2c);
                         T2k = VSUB(T2g, T2j);
                         T2l = VMUL(LDK(KP707106781), VSUB(T2b, T24));
                         T2m = VBYI(VADD(T2k, T2l));
                         T2o = VBYI(VSUB(T2l, T2k));
                    }
                    ST(&(xo[WS(os, 28)]), VSUB(T2d, T2m), ovs, &(xo[0]));
                    ST(&(xo[WS(os, 12)]), VADD(T2n, T2o), ovs, &(xo[0]));
                    ST(&(xo[WS(os, 4)]), VADD(T2d, T2m), ovs, &(xo[0]));
                    ST(&(xo[WS(os, 20)]), VSUB(T2n, T2o), ovs, &(xo[0]));
               }
               {
                    V T2r, T2v, T2u, T2w;
                    {
                         V T2p, T2q, T2s, T2t;
                         T2p = VADD(T1T, T1W);
                         T2q = VADD(T2j, T2g);
                         T2r = VADD(T2p, T2q);
                         T2v = VSUB(T2p, T2q);
                         T2s = VADD(T20, T23);
                         T2t = VADD(T27, T2a);
                         T2u = VADD(T2s, T2t);
                         T2w = VBYI(VSUB(T2t, T2s));
                    }
                    ST(&(xo[WS(os, 16)]), VSUB(T2r, T2u), ovs, &(xo[0]));
                    ST(&(xo[WS(os, 8)]), VADD(T2v, T2w), ovs, &(xo[0]));
                    ST(&(xo[0]), VADD(T2r, T2u), ovs, &(xo[0]));
                    ST(&(xo[WS(os, 24)]), VSUB(T2v, T2w), ovs, &(xo[0]));
               }
               {
                    V T2V, T2Z, T2Y, T30;
                    {
                         V T2T, T2U, T2W, T2X;
                         T2T = VSUB(T2H, T2E);
                         T2U = VSUB(T2L, T2K);
                         T2V = VBYI(VSUB(T2T, T2U));
                         T2Z = VBYI(VADD(T2U, T2T));
                         T2W = VSUB(T2x, T2A);
                         T2X = VSUB(T2O, T2N);
                         T2Y = VSUB(T2W, T2X);
                         T30 = VADD(T2W, T2X);
                    }
                    ST(&(xo[WS(os, 10)]), VADD(T2V, T2Y), ovs, &(xo[0]));
                    ST(&(xo[WS(os, 26)]), VSUB(T30, T2Z), ovs, &(xo[0]));
                    ST(&(xo[WS(os, 22)]), VSUB(T2Y, T2V), ovs, &(xo[0]));
                    ST(&(xo[WS(os, 6)]), VADD(T2Z, T30), ovs, &(xo[0]));
               }
               {
                    V T2J, T2R, T2Q, T2S;
                    {
                         V T2B, T2I, T2M, T2P;
                         T2B = VADD(T2x, T2A);
                         T2I = VADD(T2E, T2H);
                         T2J = VADD(T2B, T2I);
                         T2R = VSUB(T2B, T2I);
                         T2M = VADD(T2K, T2L);
                         T2P = VADD(T2N, T2O);
                         T2Q = VBYI(VADD(T2M, T2P));
                         T2S = VBYI(VSUB(T2P, T2M));
                    }
                    ST(&(xo[WS(os, 30)]), VSUB(T2J, T2Q), ovs, &(xo[0]));
                    ST(&(xo[WS(os, 14)]), VADD(T2R, T2S), ovs, &(xo[0]));
                    ST(&(xo[WS(os, 2)]), VADD(T2J, T2Q), ovs, &(xo[0]));
                    ST(&(xo[WS(os, 18)]), VSUB(T2R, T2S), ovs, &(xo[0]));
               }
               {
                    V T1r, T1C, T1M, T1K, T1F, T1N, T1y, T1J;
                    T1r = VADD(T1p, T1q);
                    T1C = VADD(T1A, T1B);
                    T1M = VSUB(T1p, T1q);
                    T1K = VSUB(T1B, T1A);
                    {
                         V T1D, T1E, T1u, T1x;
                         T1D = VFNMS(LDK(KP555570233), T1s, VMUL(LDK(KP831469612), T1t));
                         T1E = VFMA(LDK(KP555570233), T1v, VMUL(LDK(KP831469612), T1w));
                         T1F = VADD(T1D, T1E);
                         T1N = VSUB(T1E, T1D);
                         T1u = VFMA(LDK(KP831469612), T1s, VMUL(LDK(KP555570233), T1t));
                         T1x = VFNMS(LDK(KP555570233), T1w, VMUL(LDK(KP831469612), T1v));
                         T1y = VADD(T1u, T1x);
                         T1J = VSUB(T1x, T1u);
                    }
                    {
                         V T1z, T1G, T1P, T1Q;
                         T1z = VADD(T1r, T1y);
                         T1G = VBYI(VADD(T1C, T1F));
                         ST(&(xo[WS(os, 29)]), VSUB(T1z, T1G), ovs, &(xo[WS(os, 1)]));
                         ST(&(xo[WS(os, 3)]), VADD(T1z, T1G), ovs, &(xo[WS(os, 1)]));
                         T1P = VBYI(VADD(T1K, T1J));
                         T1Q = VADD(T1M, T1N);
                         ST(&(xo[WS(os, 5)]), VADD(T1P, T1Q), ovs, &(xo[WS(os, 1)]));
                         ST(&(xo[WS(os, 27)]), VSUB(T1Q, T1P), ovs, &(xo[WS(os, 1)]));
                    }
                    {
                         V T1H, T1I, T1L, T1O;
                         T1H = VSUB(T1r, T1y);
                         T1I = VBYI(VSUB(T1F, T1C));
                         ST(&(xo[WS(os, 19)]), VSUB(T1H, T1I), ovs, &(xo[WS(os, 1)]));
                         ST(&(xo[WS(os, 13)]), VADD(T1H, T1I), ovs, &(xo[WS(os, 1)]));
                         T1L = VBYI(VSUB(T1J, T1K));
                         T1O = VSUB(T1M, T1N);
                         ST(&(xo[WS(os, 11)]), VADD(T1L, T1O), ovs, &(xo[WS(os, 1)]));
                         ST(&(xo[WS(os, 21)]), VSUB(T1O, T1L), ovs, &(xo[WS(os, 1)]));
                    }
               }
               {
                    V Tr, T1a, T1k, T1i, T1d, T1l, T10, T1h;
                    Tr = VADD(Tb, Tq);
                    T1a = VADD(T16, T19);
                    T1k = VSUB(Tb, Tq);
                    T1i = VSUB(T19, T16);
                    {
                         V T1b, T1c, TI, TZ;
                         T1b = VFNMS(LDK(KP195090322), TC, VMUL(LDK(KP980785280), TH));
                         T1c = VFMA(LDK(KP195090322), TT, VMUL(LDK(KP980785280), TY));
                         T1d = VADD(T1b, T1c);
                         T1l = VSUB(T1c, T1b);
                         TI = VFMA(LDK(KP980785280), TC, VMUL(LDK(KP195090322), TH));
                         TZ = VFNMS(LDK(KP195090322), TY, VMUL(LDK(KP980785280), TT));
                         T10 = VADD(TI, TZ);
                         T1h = VSUB(TZ, TI);
                    }
                    {
                         V T11, T1e, T1n, T1o;
                         T11 = VADD(Tr, T10);
                         T1e = VBYI(VADD(T1a, T1d));
                         ST(&(xo[WS(os, 31)]), VSUB(T11, T1e), ovs, &(xo[WS(os, 1)]));
                         ST(&(xo[WS(os, 1)]), VADD(T11, T1e), ovs, &(xo[WS(os, 1)]));
                         T1n = VBYI(VADD(T1i, T1h));
                         T1o = VADD(T1k, T1l);
                         ST(&(xo[WS(os, 7)]), VADD(T1n, T1o), ovs, &(xo[WS(os, 1)]));
                         ST(&(xo[WS(os, 25)]), VSUB(T1o, T1n), ovs, &(xo[WS(os, 1)]));
                    }
                    {
                         V T1f, T1g, T1j, T1m;
                         T1f = VSUB(Tr, T10);
                         T1g = VBYI(VSUB(T1d, T1a));
                         ST(&(xo[WS(os, 17)]), VSUB(T1f, T1g), ovs, &(xo[WS(os, 1)]));
                         ST(&(xo[WS(os, 15)]), VADD(T1f, T1g), ovs, &(xo[WS(os, 1)]));
                         T1j = VBYI(VSUB(T1h, T1i));
                         T1m = VSUB(T1k, T1l);
                         ST(&(xo[WS(os, 9)]), VADD(T1j, T1m), ovs, &(xo[WS(os, 1)]));
                         ST(&(xo[WS(os, 23)]), VSUB(T1m, T1j), ovs, &(xo[WS(os, 1)]));
                    }
               }
          }
     }
     VLEAVE();
}

static const kdft_desc desc = { 32, XSIMD_STRING("n1fv_32"), {170, 26, 16, 0}, &GENUS, 0, 0, 0, 0 };

void XSIMD(codelet_n1fv_32) (planner *p) {
     X(kdft_register) (p, n1fv_32, &desc);
}

#endif /* HAVE_FMA */