annotate src/fftw-3.3.5/dft/simd/common/n2bv_32.c @ 83:ae30d91d2ffe

Replace these with versions built using an older toolset (so as to avoid ABI incompatibilities when linking on Ubuntu 14.04 for packaging purposes)
author Chris Cannam
date Fri, 07 Feb 2020 11:51:13 +0000
parents 2cd0e3b3e1fd
children
rev   line source
Chris@42 1 /*
Chris@42 2 * Copyright (c) 2003, 2007-14 Matteo Frigo
Chris@42 3 * Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
Chris@42 4 *
Chris@42 5 * This program is free software; you can redistribute it and/or modify
Chris@42 6 * it under the terms of the GNU General Public License as published by
Chris@42 7 * the Free Software Foundation; either version 2 of the License, or
Chris@42 8 * (at your option) any later version.
Chris@42 9 *
Chris@42 10 * This program is distributed in the hope that it will be useful,
Chris@42 11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
Chris@42 12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
Chris@42 13 * GNU General Public License for more details.
Chris@42 14 *
Chris@42 15 * You should have received a copy of the GNU General Public License
Chris@42 16 * along with this program; if not, write to the Free Software
Chris@42 17 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
Chris@42 18 *
Chris@42 19 */
Chris@42 20
Chris@42 21 /* This file was automatically generated --- DO NOT EDIT */
Chris@42 22 /* Generated on Sat Jul 30 16:40:42 EDT 2016 */
Chris@42 23
Chris@42 24 #include "codelet-dft.h"
Chris@42 25
Chris@42 26 #ifdef HAVE_FMA
Chris@42 27
Chris@42 28 /* Generated by: ../../../genfft/gen_notw_c.native -fma -reorder-insns -schedule-for-pipeline -simd -compact -variables 4 -pipeline-latency 8 -sign 1 -n 32 -name n2bv_32 -with-ostride 2 -include n2b.h -store-multiple 2 */
Chris@42 29
Chris@42 30 /*
Chris@42 31 * This function contains 186 FP additions, 98 FP multiplications,
Chris@42 32 * (or, 88 additions, 0 multiplications, 98 fused multiply/add),
Chris@42 33 * 120 stack variables, 7 constants, and 80 memory accesses
Chris@42 34 */
Chris@42 35 #include "n2b.h"
Chris@42 36
/*
 * n2bv_32, HAVE_FMA variant (machine-generated; see the "Generated by"
 * command line above: -n 32 -sign 1 -with-ostride 2 -store-multiple 2).
 *
 * What the visible code does:
 *  - copies ii into xi and io into xo; ri and ro are not referenced by name
 *    anywhere in this body;
 *  - on every loop pass loads the 32 inputs xi[WS(is, k)], k = 0..31, with LD;
 *  - combines them with VADD/VSUB/VFMA/VFNMS/VFMAI/VFNMSI into 32 results;
 *  - hands the results to STM2 at the literal even offsets xo[0]..xo[62],
 *    and to STN2 in pairs (offsets 4k and 4k+2).
 *
 * `os` is only passed to MAKE_VOLATILE_STRIDE; output offsets are literals.
 * The semantics of LD, STM2, STN2, WS, VL and VLEAVE come from headers that
 * are not part of this file -- NOTE(review): confirm there before relying
 * on any memory-layout assumption.
 */
Chris@42 37 static void n2bv_32(const R *ri, const R *ii, R *ro, R *io, stride is, stride os, INT v, INT ivs, INT ovs)
Chris@42 38 {
/* Numeric constants declared through DVK; they are consumed via LDK(...) in the VFMA/VFNMS calls below. */
Chris@42 39 DVK(KP980785280, +0.980785280403230449126182236134239036973933731);
Chris@42 40 DVK(KP831469612, +0.831469612302545237078788377617905756738560812);
Chris@42 41 DVK(KP198912367, +0.198912367379658006911597622644676228597850501);
Chris@42 42 DVK(KP668178637, +0.668178637919298919997757686523080761552472251);
Chris@42 43 DVK(KP923879532, +0.923879532511286756128183189396788286822416626);
Chris@42 44 DVK(KP414213562, +0.414213562373095048801688724209698078569671875);
Chris@42 45 DVK(KP707106781, +0.707106781186547524400844362104849039284835938);
Chris@42 46 {
Chris@42 47 INT i;
Chris@42 48 const R *xi;
Chris@42 49 R *xo;
/* Only the ii/io pointers are walked by the loop below. */
Chris@42 50 xi = ii;
Chris@42 51 xo = io;
/* i counts down from v in steps of VL; xi and xo advance by VL * ivs and VL * ovs per pass. */
Chris@42 52 for (i = v; i > 0; i = i - VL, xi = xi + (VL * ivs), xo = xo + (VL * ovs), MAKE_VOLATILE_STRIDE(64, is), MAKE_VOLATILE_STRIDE(64, os)) {
/* T31..T3g live at loop scope: each is assigned and passed to STM2 in one nested block, then passed again to STN2 in a later, separate block. */
Chris@42 53 V T31, T32, T33, T34, T35, T36, T37, T38, T39, T3a, T3b, T3c, T1h, Tr, T3d;
Chris@42 54 V T3e, T3f, T3g, T1a, T1k, TI, T1b, T1L, T1P, T1I, T1G, T1O, T1Q, T1H, T1z;
Chris@42 55 V T1c, TZ;
Chris@42 56 {
Chris@42 57 V T2x, T1T, T2K, T1W, T1p, Tb, T1A, T16, Tu, TF, T2O, T2H, T2b, T2t, TY;
Chris@42 58 V T1w, TT, T1v, T20, T2C, Tj, Te, T2e, To, T2i, T23, T2D, TB, TG, Th;
Chris@42 59 V T2f, Tk;
Chris@42 60 {
Chris@42 61 V TL, TW, TP, TQ, T2F, T27, T28, TO;
Chris@42 62 {
Chris@42 63 V T1, T2, T12, T13, T4, T5, T7, T8;
/* Input loads: even-numbered inputs pass &(xi[0]) as LD's third argument, odd-numbered ones pass &(xi[WS(is, 1)]). */
Chris@42 64 T1 = LD(&(xi[0]), ivs, &(xi[0]));
Chris@42 65 T2 = LD(&(xi[WS(is, 16)]), ivs, &(xi[0]));
Chris@42 66 T12 = LD(&(xi[WS(is, 8)]), ivs, &(xi[0]));
Chris@42 67 T13 = LD(&(xi[WS(is, 24)]), ivs, &(xi[0]));
Chris@42 68 T4 = LD(&(xi[WS(is, 4)]), ivs, &(xi[0]));
Chris@42 69 T5 = LD(&(xi[WS(is, 20)]), ivs, &(xi[0]));
Chris@42 70 T7 = LD(&(xi[WS(is, 28)]), ivs, &(xi[0]));
Chris@42 71 T8 = LD(&(xi[WS(is, 12)]), ivs, &(xi[0]));
Chris@42 72 {
Chris@42 73 V TM, T25, T26, TN;
Chris@42 74 {
Chris@42 75 V TJ, T3, T14, T1U, T6, T1V, T9, TK, TU, TV, T1R, T1S, Ta, T15;
Chris@42 76 TJ = LD(&(xi[WS(is, 31)]), ivs, &(xi[WS(is, 1)]));
Chris@42 77 T1R = VADD(T1, T2);
Chris@42 78 T3 = VSUB(T1, T2);
Chris@42 79 T1S = VADD(T12, T13);
Chris@42 80 T14 = VSUB(T12, T13);
Chris@42 81 T1U = VADD(T4, T5);
Chris@42 82 T6 = VSUB(T4, T5);
Chris@42 83 T1V = VADD(T7, T8);
Chris@42 84 T9 = VSUB(T7, T8);
Chris@42 85 TK = LD(&(xi[WS(is, 15)]), ivs, &(xi[WS(is, 1)]));
Chris@42 86 TU = LD(&(xi[WS(is, 23)]), ivs, &(xi[WS(is, 1)]));
Chris@42 87 T2x = VSUB(T1R, T1S);
Chris@42 88 T1T = VADD(T1R, T1S);
Chris@42 89 TV = LD(&(xi[WS(is, 7)]), ivs, &(xi[WS(is, 1)]));
Chris@42 90 TM = LD(&(xi[WS(is, 3)]), ivs, &(xi[WS(is, 1)]));
Chris@42 91 T2K = VSUB(T1U, T1V);
Chris@42 92 T1W = VADD(T1U, T1V);
Chris@42 93 Ta = VADD(T6, T9);
Chris@42 94 T15 = VSUB(T6, T9);
Chris@42 95 T25 = VADD(TJ, TK);
Chris@42 96 TL = VSUB(TJ, TK);
Chris@42 97 T26 = VADD(TV, TU);
Chris@42 98 TW = VSUB(TU, TV);
Chris@42 99 TN = LD(&(xi[WS(is, 19)]), ivs, &(xi[WS(is, 1)]));
Chris@42 100 TP = LD(&(xi[WS(is, 27)]), ivs, &(xi[WS(is, 1)]));
Chris@42 101 T1p = VFNMS(LDK(KP707106781), Ta, T3);
Chris@42 102 Tb = VFMA(LDK(KP707106781), Ta, T3);
Chris@42 103 T1A = VFNMS(LDK(KP707106781), T15, T14);
Chris@42 104 T16 = VFMA(LDK(KP707106781), T15, T14);
Chris@42 105 TQ = LD(&(xi[WS(is, 11)]), ivs, &(xi[WS(is, 1)]));
Chris@42 106 }
Chris@42 107 T2F = VSUB(T25, T26);
Chris@42 108 T27 = VADD(T25, T26);
Chris@42 109 T28 = VADD(TM, TN);
Chris@42 110 TO = VSUB(TM, TN);
Chris@42 111 }
Chris@42 112 }
Chris@42 113 {
Chris@42 114 V Ty, T21, Tx, Tz, T1Y, T1Z;
Chris@42 115 {
Chris@42 116 V Ts, Tt, TD, T29, TR, TE, Tv, Tw;
Chris@42 117 Ts = LD(&(xi[WS(is, 1)]), ivs, &(xi[WS(is, 1)]));
Chris@42 118 Tt = LD(&(xi[WS(is, 17)]), ivs, &(xi[WS(is, 1)]));
Chris@42 119 TD = LD(&(xi[WS(is, 9)]), ivs, &(xi[WS(is, 1)]));
Chris@42 120 T29 = VADD(TP, TQ);
Chris@42 121 TR = VSUB(TP, TQ);
Chris@42 122 TE = LD(&(xi[WS(is, 25)]), ivs, &(xi[WS(is, 1)]));
Chris@42 123 Tv = LD(&(xi[WS(is, 5)]), ivs, &(xi[WS(is, 1)]));
Chris@42 124 Tw = LD(&(xi[WS(is, 21)]), ivs, &(xi[WS(is, 1)]));
Chris@42 125 Ty = LD(&(xi[WS(is, 29)]), ivs, &(xi[WS(is, 1)]));
Chris@42 126 T1Y = VADD(Ts, Tt);
Chris@42 127 Tu = VSUB(Ts, Tt);
Chris@42 128 {
Chris@42 129 V T2G, T2a, TX, TS;
Chris@42 130 T2G = VSUB(T29, T28);
Chris@42 131 T2a = VADD(T28, T29);
Chris@42 132 TX = VSUB(TR, TO);
Chris@42 133 TS = VADD(TO, TR);
Chris@42 134 T1Z = VADD(TD, TE);
Chris@42 135 TF = VSUB(TD, TE);
Chris@42 136 T21 = VADD(Tv, Tw);
Chris@42 137 Tx = VSUB(Tv, Tw);
Chris@42 138 T2O = VFMA(LDK(KP414213562), T2F, T2G);
Chris@42 139 T2H = VFNMS(LDK(KP414213562), T2G, T2F);
Chris@42 140 T2b = VSUB(T27, T2a);
Chris@42 141 T2t = VADD(T27, T2a);
Chris@42 142 TY = VFMA(LDK(KP707106781), TX, TW);
Chris@42 143 T1w = VFNMS(LDK(KP707106781), TX, TW);
Chris@42 144 TT = VFMA(LDK(KP707106781), TS, TL);
Chris@42 145 T1v = VFNMS(LDK(KP707106781), TS, TL);
Chris@42 146 Tz = LD(&(xi[WS(is, 13)]), ivs, &(xi[WS(is, 1)]));
Chris@42 147 }
Chris@42 148 }
Chris@42 149 T20 = VADD(T1Y, T1Z);
Chris@42 150 T2C = VSUB(T1Y, T1Z);
Chris@42 151 {
Chris@42 152 V Tc, Td, Tm, Tn;
Chris@42 153 Tc = LD(&(xi[WS(is, 2)]), ivs, &(xi[0]));
Chris@42 154 Td = LD(&(xi[WS(is, 18)]), ivs, &(xi[0]));
Chris@42 155 Tm = LD(&(xi[WS(is, 22)]), ivs, &(xi[0]));
Chris@42 156 Tn = LD(&(xi[WS(is, 6)]), ivs, &(xi[0]));
Chris@42 157 {
Chris@42 158 V Tf, TA, T22, Tg;
Chris@42 159 Tf = LD(&(xi[WS(is, 10)]), ivs, &(xi[0]));
Chris@42 160 TA = VSUB(Ty, Tz);
Chris@42 161 T22 = VADD(Ty, Tz);
Chris@42 162 Tg = LD(&(xi[WS(is, 26)]), ivs, &(xi[0]));
Chris@42 163 Tj = LD(&(xi[WS(is, 30)]), ivs, &(xi[0]));
Chris@42 164 Te = VSUB(Tc, Td);
Chris@42 165 T2e = VADD(Tc, Td);
Chris@42 166 To = VSUB(Tm, Tn);
Chris@42 167 T2i = VADD(Tn, Tm);
Chris@42 168 T23 = VADD(T21, T22);
Chris@42 169 T2D = VSUB(T21, T22);
Chris@42 170 TB = VADD(Tx, TA);
Chris@42 171 TG = VSUB(Tx, TA);
Chris@42 172 Th = VSUB(Tf, Tg);
Chris@42 173 T2f = VADD(Tf, Tg);
Chris@42 174 Tk = LD(&(xi[WS(is, 14)]), ivs, &(xi[0]));
Chris@42 175 }
Chris@42 176 }
Chris@42 177 }
Chris@42 178 }
Chris@42 179 {
Chris@42 180 V T1t, TH, T1s, TC, T2P, T2U, T2n, T2d, T2w, T2u, T1q, T19, T1B, Tq, T2W;
Chris@42 181 V T2M, T2B, T2T, T2v, T2r, T2o, T2m, T2X, T2I;
Chris@42 182 {
Chris@42 183 V T1X, T2p, T2E, T2N, T2s, T2y, T2g, T17, Ti, T2h, Tl, T2c, T2l, T24;
Chris@42 184 T1X = VSUB(T1T, T1W);
Chris@42 185 T2p = VADD(T1T, T1W);
Chris@42 186 T2E = VFNMS(LDK(KP414213562), T2D, T2C);
Chris@42 187 T2N = VFMA(LDK(KP414213562), T2C, T2D);
Chris@42 188 T2s = VADD(T20, T23);
Chris@42 189 T24 = VSUB(T20, T23);
Chris@42 190 T1t = VFNMS(LDK(KP707106781), TG, TF);
Chris@42 191 TH = VFMA(LDK(KP707106781), TG, TF);
Chris@42 192 T1s = VFNMS(LDK(KP707106781), TB, Tu);
Chris@42 193 TC = VFMA(LDK(KP707106781), TB, Tu);
Chris@42 194 T2y = VSUB(T2e, T2f);
Chris@42 195 T2g = VADD(T2e, T2f);
Chris@42 196 T17 = VFMA(LDK(KP414213562), Te, Th);
Chris@42 197 Ti = VFNMS(LDK(KP414213562), Th, Te);
Chris@42 198 T2h = VADD(Tj, Tk);
Chris@42 199 Tl = VSUB(Tj, Tk);
Chris@42 200 T2c = VADD(T24, T2b);
Chris@42 201 T2l = VSUB(T24, T2b);
Chris@42 202 {
Chris@42 203 V T2L, T2A, T2q, T2k;
Chris@42 204 T2P = VSUB(T2N, T2O);
Chris@42 205 T2U = VADD(T2N, T2O);
Chris@42 206 {
Chris@42 207 V T2z, T2j, T18, Tp;
Chris@42 208 T2z = VSUB(T2h, T2i);
Chris@42 209 T2j = VADD(T2h, T2i);
Chris@42 210 T18 = VFMA(LDK(KP414213562), Tl, To);
Chris@42 211 Tp = VFNMS(LDK(KP414213562), To, Tl);
Chris@42 212 T2n = VFMA(LDK(KP707106781), T2c, T1X);
Chris@42 213 T2d = VFNMS(LDK(KP707106781), T2c, T1X);
Chris@42 214 T2w = VADD(T2s, T2t);
Chris@42 215 T2u = VSUB(T2s, T2t);
Chris@42 216 T2L = VSUB(T2y, T2z);
Chris@42 217 T2A = VADD(T2y, T2z);
Chris@42 218 T2q = VADD(T2g, T2j);
Chris@42 219 T2k = VSUB(T2g, T2j);
Chris@42 220 T1q = VADD(T17, T18);
Chris@42 221 T19 = VSUB(T17, T18);
Chris@42 222 T1B = VSUB(Ti, Tp);
Chris@42 223 Tq = VADD(Ti, Tp);
Chris@42 224 }
Chris@42 225 T2W = VFNMS(LDK(KP707106781), T2L, T2K);
Chris@42 226 T2M = VFMA(LDK(KP707106781), T2L, T2K);
Chris@42 227 T2B = VFMA(LDK(KP707106781), T2A, T2x);
Chris@42 228 T2T = VFNMS(LDK(KP707106781), T2A, T2x);
Chris@42 229 T2v = VADD(T2p, T2q);
Chris@42 230 T2r = VSUB(T2p, T2q);
Chris@42 231 T2o = VFMA(LDK(KP707106781), T2l, T2k);
Chris@42 232 T2m = VFNMS(LDK(KP707106781), T2l, T2k);
Chris@42 233 T2X = VSUB(T2E, T2H);
Chris@42 234 T2I = VADD(T2E, T2H);
Chris@42 235 }
Chris@42 236 }
Chris@42 237 {
Chris@42 238 V T2V, T2Z, T2Y, T30, T2R, T2J;
Chris@42 239 T2V = VFNMS(LDK(KP923879532), T2U, T2T);
Chris@42 240 T2Z = VFMA(LDK(KP923879532), T2U, T2T);
/* First group of outputs: every STM2 from here to the end of this block targets an xo offset that is a multiple of 4. */
Chris@42 241 T31 = VSUB(T2v, T2w);
Chris@42 242 STM2(&(xo[32]), T31, ovs, &(xo[0]));
Chris@42 243 T32 = VADD(T2v, T2w);
Chris@42 244 STM2(&(xo[0]), T32, ovs, &(xo[0]));
Chris@42 245 T33 = VFMAI(T2u, T2r);
Chris@42 246 STM2(&(xo[16]), T33, ovs, &(xo[0]));
Chris@42 247 T34 = VFNMSI(T2u, T2r);
Chris@42 248 STM2(&(xo[48]), T34, ovs, &(xo[0]));
Chris@42 249 T35 = VFMAI(T2o, T2n);
Chris@42 250 STM2(&(xo[8]), T35, ovs, &(xo[0]));
Chris@42 251 T36 = VFNMSI(T2o, T2n);
Chris@42 252 STM2(&(xo[56]), T36, ovs, &(xo[0]));
Chris@42 253 T37 = VFMAI(T2m, T2d);
Chris@42 254 STM2(&(xo[40]), T37, ovs, &(xo[0]));
Chris@42 255 T38 = VFNMSI(T2m, T2d);
Chris@42 256 STM2(&(xo[24]), T38, ovs, &(xo[0]));
Chris@42 257 T2Y = VFMA(LDK(KP923879532), T2X, T2W);
Chris@42 258 T30 = VFNMS(LDK(KP923879532), T2X, T2W);
Chris@42 259 T2R = VFMA(LDK(KP923879532), T2I, T2B);
Chris@42 260 T2J = VFNMS(LDK(KP923879532), T2I, T2B);
Chris@42 261 {
Chris@42 262 V T1J, T1r, T1C, T1M, T2S, T2Q, T1u, T1D, T1E, T1x;
Chris@42 263 T1J = VFNMS(LDK(KP923879532), T1q, T1p);
Chris@42 264 T1r = VFMA(LDK(KP923879532), T1q, T1p);
Chris@42 265 T1C = VFNMS(LDK(KP923879532), T1B, T1A);
Chris@42 266 T1M = VFMA(LDK(KP923879532), T1B, T1A);
Chris@42 267 T39 = VFNMSI(T30, T2Z);
Chris@42 268 STM2(&(xo[12]), T39, ovs, &(xo[0]));
Chris@42 269 T3a = VFMAI(T30, T2Z);
Chris@42 270 STM2(&(xo[52]), T3a, ovs, &(xo[0]));
Chris@42 271 T3b = VFNMSI(T2Y, T2V);
Chris@42 272 STM2(&(xo[44]), T3b, ovs, &(xo[0]));
Chris@42 273 T3c = VFMAI(T2Y, T2V);
Chris@42 274 STM2(&(xo[20]), T3c, ovs, &(xo[0]));
Chris@42 275 T2S = VFMA(LDK(KP923879532), T2P, T2M);
Chris@42 276 T2Q = VFNMS(LDK(KP923879532), T2P, T2M);
Chris@42 277 T1u = VFMA(LDK(KP668178637), T1t, T1s);
Chris@42 278 T1D = VFNMS(LDK(KP668178637), T1s, T1t);
Chris@42 279 T1E = VFNMS(LDK(KP668178637), T1v, T1w);
Chris@42 280 T1x = VFMA(LDK(KP668178637), T1w, T1v);
Chris@42 281 {
Chris@42 282 V T1K, T1F, T1N, T1y;
Chris@42 283 T1h = VFNMS(LDK(KP923879532), Tq, Tb);
Chris@42 284 Tr = VFMA(LDK(KP923879532), Tq, Tb);
Chris@42 285 T3d = VFNMSI(T2S, T2R);
Chris@42 286 STM2(&(xo[60]), T3d, ovs, &(xo[0]));
Chris@42 287 T3e = VFMAI(T2S, T2R);
Chris@42 288 STM2(&(xo[4]), T3e, ovs, &(xo[0]));
Chris@42 289 T3f = VFMAI(T2Q, T2J);
Chris@42 290 STM2(&(xo[36]), T3f, ovs, &(xo[0]));
Chris@42 291 T3g = VFNMSI(T2Q, T2J);
Chris@42 292 STM2(&(xo[28]), T3g, ovs, &(xo[0]));
Chris@42 293 T1K = VADD(T1D, T1E);
Chris@42 294 T1F = VSUB(T1D, T1E);
Chris@42 295 T1N = VSUB(T1u, T1x);
Chris@42 296 T1y = VADD(T1u, T1x);
Chris@42 297 T1a = VFMA(LDK(KP923879532), T19, T16);
Chris@42 298 T1k = VFNMS(LDK(KP923879532), T19, T16);
Chris@42 299 TI = VFNMS(LDK(KP198912367), TH, TC);
Chris@42 300 T1b = VFMA(LDK(KP198912367), TC, TH);
Chris@42 301 T1L = VFMA(LDK(KP831469612), T1K, T1J);
Chris@42 302 T1P = VFNMS(LDK(KP831469612), T1K, T1J);
Chris@42 303 T1I = VFMA(LDK(KP831469612), T1F, T1C);
Chris@42 304 T1G = VFNMS(LDK(KP831469612), T1F, T1C);
Chris@42 305 T1O = VFNMS(LDK(KP831469612), T1N, T1M);
Chris@42 306 T1Q = VFMA(LDK(KP831469612), T1N, T1M);
Chris@42 307 T1H = VFMA(LDK(KP831469612), T1y, T1r);
Chris@42 308 T1z = VFNMS(LDK(KP831469612), T1y, T1r);
Chris@42 309 T1c = VFMA(LDK(KP198912367), TT, TY);
Chris@42 310 TZ = VFNMS(LDK(KP198912367), TY, TT);
Chris@42 311 }
Chris@42 312 }
Chris@42 313 }
Chris@42 314 }
Chris@42 315 }
Chris@42 316 {
Chris@42 317 V T1d, T1i, T10, T1l;
Chris@42 318 {
Chris@42 319 V T3h, T3i, T3j, T3k;
/* Remaining outputs: each new value goes to STM2 at offset 4k+2 and is then passed to STN2 together with the value stored earlier at offset 4k. */
Chris@42 320 T3h = VFMAI(T1O, T1L);
Chris@42 321 STM2(&(xo[42]), T3h, ovs, &(xo[2]));
Chris@42 322 STN2(&(xo[40]), T37, T3h, ovs);
Chris@42 323 T3i = VFNMSI(T1O, T1L);
Chris@42 324 STM2(&(xo[22]), T3i, ovs, &(xo[2]));
Chris@42 325 STN2(&(xo[20]), T3c, T3i, ovs);
Chris@42 326 T3j = VFNMSI(T1Q, T1P);
Chris@42 327 STM2(&(xo[54]), T3j, ovs, &(xo[2]));
Chris@42 328 STN2(&(xo[52]), T3a, T3j, ovs);
Chris@42 329 T3k = VFMAI(T1Q, T1P);
Chris@42 330 STM2(&(xo[10]), T3k, ovs, &(xo[2]));
Chris@42 331 STN2(&(xo[8]), T35, T3k, ovs);
Chris@42 332 {
Chris@42 333 V T3l, T3m, T3n, T3o;
Chris@42 334 T3l = VFMAI(T1I, T1H);
Chris@42 335 STM2(&(xo[58]), T3l, ovs, &(xo[2]));
Chris@42 336 STN2(&(xo[56]), T36, T3l, ovs);
Chris@42 337 T3m = VFNMSI(T1I, T1H);
Chris@42 338 STM2(&(xo[6]), T3m, ovs, &(xo[2]));
Chris@42 339 STN2(&(xo[4]), T3e, T3m, ovs);
Chris@42 340 T3n = VFMAI(T1G, T1z);
Chris@42 341 STM2(&(xo[26]), T3n, ovs, &(xo[2]));
Chris@42 342 STN2(&(xo[24]), T38, T3n, ovs);
Chris@42 343 T3o = VFNMSI(T1G, T1z);
Chris@42 344 STM2(&(xo[38]), T3o, ovs, &(xo[2]));
Chris@42 345 STN2(&(xo[36]), T3f, T3o, ovs);
Chris@42 346 T1d = VSUB(T1b, T1c);
Chris@42 347 T1i = VADD(T1b, T1c);
Chris@42 348 T10 = VADD(TI, TZ);
Chris@42 349 T1l = VSUB(TI, TZ);
Chris@42 350 }
Chris@42 351 }
Chris@42 352 {
Chris@42 353 V T1n, T1j, T1e, T1g, T1o, T1m, T11, T1f;
Chris@42 354 T1n = VFMA(LDK(KP980785280), T1i, T1h);
Chris@42 355 T1j = VFNMS(LDK(KP980785280), T1i, T1h);
Chris@42 356 T1e = VFNMS(LDK(KP980785280), T1d, T1a);
Chris@42 357 T1g = VFMA(LDK(KP980785280), T1d, T1a);
Chris@42 358 T1o = VFNMS(LDK(KP980785280), T1l, T1k);
Chris@42 359 T1m = VFMA(LDK(KP980785280), T1l, T1k);
Chris@42 360 T11 = VFNMS(LDK(KP980785280), T10, Tr);
Chris@42 361 T1f = VFMA(LDK(KP980785280), T10, Tr);
Chris@42 362 {
Chris@42 363 V T3p, T3q, T3r, T3s;
Chris@42 364 T3p = VFNMSI(T1m, T1j);
Chris@42 365 STM2(&(xo[46]), T3p, ovs, &(xo[2]));
Chris@42 366 STN2(&(xo[44]), T3b, T3p, ovs);
Chris@42 367 T3q = VFMAI(T1m, T1j);
Chris@42 368 STM2(&(xo[18]), T3q, ovs, &(xo[2]));
Chris@42 369 STN2(&(xo[16]), T33, T3q, ovs);
Chris@42 370 T3r = VFMAI(T1o, T1n);
Chris@42 371 STM2(&(xo[50]), T3r, ovs, &(xo[2]));
Chris@42 372 STN2(&(xo[48]), T34, T3r, ovs);
Chris@42 373 T3s = VFNMSI(T1o, T1n);
Chris@42 374 STM2(&(xo[14]), T3s, ovs, &(xo[2]));
Chris@42 375 STN2(&(xo[12]), T39, T3s, ovs);
Chris@42 376 {
Chris@42 377 V T3t, T3u, T3v, T3w;
Chris@42 378 T3t = VFMAI(T1g, T1f);
Chris@42 379 STM2(&(xo[2]), T3t, ovs, &(xo[2]));
Chris@42 380 STN2(&(xo[0]), T32, T3t, ovs);
Chris@42 381 T3u = VFNMSI(T1g, T1f);
Chris@42 382 STM2(&(xo[62]), T3u, ovs, &(xo[2]));
Chris@42 383 STN2(&(xo[60]), T3d, T3u, ovs);
Chris@42 384 T3v = VFMAI(T1e, T11);
Chris@42 385 STM2(&(xo[34]), T3v, ovs, &(xo[2]));
Chris@42 386 STN2(&(xo[32]), T31, T3v, ovs);
Chris@42 387 T3w = VFNMSI(T1e, T11);
Chris@42 388 STM2(&(xo[30]), T3w, ovs, &(xo[2]));
Chris@42 389 STN2(&(xo[28]), T3g, T3w, ovs);
Chris@42 390 }
Chris@42 391 }
Chris@42 392 }
Chris@42 393 }
Chris@42 394 }
Chris@42 395 }
Chris@42 396 VLEAVE();
Chris@42 397 }
Chris@42 398
/* Descriptor handed to X(kdft_register) below (HAVE_FMA variant). The leading 32 and the name string match this codelet, and {88, 0, 98, 0} matches the "88 additions, 0 multiplications, 98 fused multiply/add" counts quoted in the generator comment above. Field meanings come from kdft_desc, declared outside this file -- TODO confirm there. */
Chris@42 399 static const kdft_desc desc = { 32, XSIMD_STRING("n2bv_32"), {88, 0, 98, 0}, &GENUS, 0, 2, 0, 0 };
Chris@42 400
/* Registration entry point (HAVE_FMA variant): passes the n2bv_32 function and its descriptor to planner p through X(kdft_register). */
Chris@42 401 void XSIMD(codelet_n2bv_32) (planner *p) {
Chris@42 402 X(kdft_register) (p, n2bv_32, &desc);
Chris@42 403 }
Chris@42 404
Chris@42 405 #else /* HAVE_FMA */
Chris@42 406
Chris@42 407 /* Generated by: ../../../genfft/gen_notw_c.native -simd -compact -variables 4 -pipeline-latency 8 -sign 1 -n 32 -name n2bv_32 -with-ostride 2 -include n2b.h -store-multiple 2 */
Chris@42 408
Chris@42 409 /*
Chris@42 410 * This function contains 186 FP additions, 42 FP multiplications,
Chris@42 411 * (or, 170 additions, 26 multiplications, 16 fused multiply/add),
Chris@42 412 * 72 stack variables, 7 constants, and 80 memory accesses
Chris@42 413 */
Chris@42 414 #include "n2b.h"
Chris@42 415
/*
 * n2bv_32, non-FMA variant (machine-generated; see the "Generated by"
 * command line above: -n 32 -sign 1 -with-ostride 2 -store-multiple 2).
 *
 * What the visible code does:
 *  - copies ii into xi and io into xo; ri and ro are not referenced by name
 *    anywhere in this body;
 *  - on every loop pass loads the 32 inputs xi[WS(is, k)], k = 0..31, with LD;
 *  - combines them with VADD/VSUB/VMUL/VFMA/VFNMS/VBYI into 32 results;
 *  - hands the results to STM2 at the literal even offsets xo[0]..xo[62],
 *    and to STN2 in pairs (offsets 4k and 4k+2).
 *
 * `os` is only passed to MAKE_VOLATILE_STRIDE; output offsets are literals.
 * The semantics of LD, STM2, STN2, WS, VL and VLEAVE come from headers that
 * are not part of this file -- NOTE(review): confirm there before relying
 * on any memory-layout assumption.
 */
Chris@42 416 static void n2bv_32(const R *ri, const R *ii, R *ro, R *io, stride is, stride os, INT v, INT ivs, INT ovs)
Chris@42 417 {
/* Numeric constants declared through DVK; they are consumed via LDK(...) in the VMUL/VFMA/VFNMS calls below. */
Chris@42 418 DVK(KP555570233, +0.555570233019602224742830813948532874374937191);
Chris@42 419 DVK(KP831469612, +0.831469612302545237078788377617905756738560812);
Chris@42 420 DVK(KP195090322, +0.195090322016128267848284868477022240927691618);
Chris@42 421 DVK(KP980785280, +0.980785280403230449126182236134239036973933731);
Chris@42 422 DVK(KP382683432, +0.382683432365089771728459984030398866761344562);
Chris@42 423 DVK(KP923879532, +0.923879532511286756128183189396788286822416626);
Chris@42 424 DVK(KP707106781, +0.707106781186547524400844362104849039284835938);
Chris@42 425 {
Chris@42 426 INT i;
Chris@42 427 const R *xi;
Chris@42 428 R *xo;
/* Only the ii/io pointers are walked by the loop below. */
Chris@42 429 xi = ii;
Chris@42 430 xo = io;
/* i counts down from v in steps of VL; xi and xo advance by VL * ivs and VL * ovs per pass. */
Chris@42 431 for (i = v; i > 0; i = i - VL, xi = xi + (VL * ivs), xo = xo + (VL * ovs), MAKE_VOLATILE_STRIDE(64, is), MAKE_VOLATILE_STRIDE(64, os)) {
Chris@42 432 V T2f, T2k, T2N, T2M, T19, T1B, Tb, T1p, TT, T1v, TY, T1w, T2E, T2F, T2G;
Chris@42 433 V T24, T2o, TC, T1s, TH, T1t, T2B, T2C, T2D, T1X, T2n, T2I, T2J, Tq, T1A;
Chris@42 434 V T14, T1q, T2c, T2l;
/* Inputs 0, 16, 8, 24, 4, 20, 28, 12: loaded with &(xi[0]) as LD's third argument. */
Chris@42 435 {
Chris@42 436 V T3, T2i, T18, T2j, T6, T2d, T9, T2e, T15, Ta;
Chris@42 437 {
Chris@42 438 V T1, T2, T16, T17;
Chris@42 439 T1 = LD(&(xi[0]), ivs, &(xi[0]));
Chris@42 440 T2 = LD(&(xi[WS(is, 16)]), ivs, &(xi[0]));
Chris@42 441 T3 = VSUB(T1, T2);
Chris@42 442 T2i = VADD(T1, T2);
Chris@42 443 T16 = LD(&(xi[WS(is, 8)]), ivs, &(xi[0]));
Chris@42 444 T17 = LD(&(xi[WS(is, 24)]), ivs, &(xi[0]));
Chris@42 445 T18 = VSUB(T16, T17);
Chris@42 446 T2j = VADD(T16, T17);
Chris@42 447 }
Chris@42 448 {
Chris@42 449 V T4, T5, T7, T8;
Chris@42 450 T4 = LD(&(xi[WS(is, 4)]), ivs, &(xi[0]));
Chris@42 451 T5 = LD(&(xi[WS(is, 20)]), ivs, &(xi[0]));
Chris@42 452 T6 = VSUB(T4, T5);
Chris@42 453 T2d = VADD(T4, T5);
Chris@42 454 T7 = LD(&(xi[WS(is, 28)]), ivs, &(xi[0]));
Chris@42 455 T8 = LD(&(xi[WS(is, 12)]), ivs, &(xi[0]));
Chris@42 456 T9 = VSUB(T7, T8);
Chris@42 457 T2e = VADD(T7, T8);
Chris@42 458 }
Chris@42 459 T2f = VSUB(T2d, T2e);
Chris@42 460 T2k = VSUB(T2i, T2j);
Chris@42 461 T2N = VADD(T2d, T2e);
Chris@42 462 T2M = VADD(T2i, T2j);
Chris@42 463 T15 = VMUL(LDK(KP707106781), VSUB(T6, T9));
Chris@42 464 T19 = VSUB(T15, T18);
Chris@42 465 T1B = VADD(T18, T15);
Chris@42 466 Ta = VMUL(LDK(KP707106781), VADD(T6, T9));
Chris@42 467 Tb = VSUB(T3, Ta);
Chris@42 468 T1p = VADD(T3, Ta);
Chris@42 469 }
/* Inputs 3, 19, 31, 15, 27, 11, 7, 23: loaded with &(xi[WS(is, 1)]) as LD's third argument. */
Chris@42 470 {
Chris@42 471 V TL, T21, TW, T1Y, TO, T22, TS, T1Z;
Chris@42 472 {
Chris@42 473 V TJ, TK, TU, TV;
Chris@42 474 TJ = LD(&(xi[WS(is, 3)]), ivs, &(xi[WS(is, 1)]));
Chris@42 475 TK = LD(&(xi[WS(is, 19)]), ivs, &(xi[WS(is, 1)]));
Chris@42 476 TL = VSUB(TJ, TK);
Chris@42 477 T21 = VADD(TJ, TK);
Chris@42 478 TU = LD(&(xi[WS(is, 31)]), ivs, &(xi[WS(is, 1)]));
Chris@42 479 TV = LD(&(xi[WS(is, 15)]), ivs, &(xi[WS(is, 1)]));
Chris@42 480 TW = VSUB(TU, TV);
Chris@42 481 T1Y = VADD(TU, TV);
Chris@42 482 }
Chris@42 483 {
Chris@42 484 V TM, TN, TQ, TR;
Chris@42 485 TM = LD(&(xi[WS(is, 27)]), ivs, &(xi[WS(is, 1)]));
Chris@42 486 TN = LD(&(xi[WS(is, 11)]), ivs, &(xi[WS(is, 1)]));
Chris@42 487 TO = VSUB(TM, TN);
Chris@42 488 T22 = VADD(TM, TN);
Chris@42 489 TQ = LD(&(xi[WS(is, 7)]), ivs, &(xi[WS(is, 1)]));
Chris@42 490 TR = LD(&(xi[WS(is, 23)]), ivs, &(xi[WS(is, 1)]));
Chris@42 491 TS = VSUB(TQ, TR);
Chris@42 492 T1Z = VADD(TQ, TR);
Chris@42 493 }
Chris@42 494 {
Chris@42 495 V TP, TX, T20, T23;
Chris@42 496 TP = VMUL(LDK(KP707106781), VSUB(TL, TO));
Chris@42 497 TT = VSUB(TP, TS);
Chris@42 498 T1v = VADD(TS, TP);
Chris@42 499 TX = VMUL(LDK(KP707106781), VADD(TL, TO));
Chris@42 500 TY = VSUB(TW, TX);
Chris@42 501 T1w = VADD(TW, TX);
Chris@42 502 T2E = VADD(T1Y, T1Z);
Chris@42 503 T2F = VADD(T21, T22);
Chris@42 504 T2G = VSUB(T2E, T2F);
Chris@42 505 T20 = VSUB(T1Y, T1Z);
Chris@42 506 T23 = VSUB(T21, T22);
Chris@42 507 T24 = VFMA(LDK(KP923879532), T20, VMUL(LDK(KP382683432), T23));
Chris@42 508 T2o = VFNMS(LDK(KP382683432), T20, VMUL(LDK(KP923879532), T23));
Chris@42 509 }
Chris@42 510 }
/* Inputs 5, 21, 1, 17, 29, 13, 9, 25: loaded with &(xi[WS(is, 1)]) as LD's third argument. */
Chris@42 511 {
Chris@42 512 V Tu, T1U, TF, T1R, Tx, T1V, TB, T1S;
Chris@42 513 {
Chris@42 514 V Ts, Tt, TD, TE;
Chris@42 515 Ts = LD(&(xi[WS(is, 5)]), ivs, &(xi[WS(is, 1)]));
Chris@42 516 Tt = LD(&(xi[WS(is, 21)]), ivs, &(xi[WS(is, 1)]));
Chris@42 517 Tu = VSUB(Ts, Tt);
Chris@42 518 T1U = VADD(Ts, Tt);
Chris@42 519 TD = LD(&(xi[WS(is, 1)]), ivs, &(xi[WS(is, 1)]));
Chris@42 520 TE = LD(&(xi[WS(is, 17)]), ivs, &(xi[WS(is, 1)]));
Chris@42 521 TF = VSUB(TD, TE);
Chris@42 522 T1R = VADD(TD, TE);
Chris@42 523 }
Chris@42 524 {
Chris@42 525 V Tv, Tw, Tz, TA;
Chris@42 526 Tv = LD(&(xi[WS(is, 29)]), ivs, &(xi[WS(is, 1)]));
Chris@42 527 Tw = LD(&(xi[WS(is, 13)]), ivs, &(xi[WS(is, 1)]));
Chris@42 528 Tx = VSUB(Tv, Tw);
Chris@42 529 T1V = VADD(Tv, Tw);
Chris@42 530 Tz = LD(&(xi[WS(is, 9)]), ivs, &(xi[WS(is, 1)]));
Chris@42 531 TA = LD(&(xi[WS(is, 25)]), ivs, &(xi[WS(is, 1)]));
Chris@42 532 TB = VSUB(Tz, TA);
Chris@42 533 T1S = VADD(Tz, TA);
Chris@42 534 }
Chris@42 535 {
Chris@42 536 V Ty, TG, T1T, T1W;
Chris@42 537 Ty = VMUL(LDK(KP707106781), VSUB(Tu, Tx));
Chris@42 538 TC = VSUB(Ty, TB);
Chris@42 539 T1s = VADD(TB, Ty);
Chris@42 540 TG = VMUL(LDK(KP707106781), VADD(Tu, Tx));
Chris@42 541 TH = VSUB(TF, TG);
Chris@42 542 T1t = VADD(TF, TG);
Chris@42 543 T2B = VADD(T1R, T1S);
Chris@42 544 T2C = VADD(T1U, T1V);
Chris@42 545 T2D = VSUB(T2B, T2C);
Chris@42 546 T1T = VSUB(T1R, T1S);
Chris@42 547 T1W = VSUB(T1U, T1V);
Chris@42 548 T1X = VFNMS(LDK(KP382683432), T1W, VMUL(LDK(KP923879532), T1T));
Chris@42 549 T2n = VFMA(LDK(KP382683432), T1T, VMUL(LDK(KP923879532), T1W));
Chris@42 550 }
Chris@42 551 }
/* Inputs 2, 18, 30, 14, 10, 26, 6, 22: loaded with &(xi[0]) as LD's third argument. */
Chris@42 552 {
Chris@42 553 V Te, T26, To, T29, Th, T27, Tl, T2a, Ti, Tp;
Chris@42 554 {
Chris@42 555 V Tc, Td, Tm, Tn;
Chris@42 556 Tc = LD(&(xi[WS(is, 2)]), ivs, &(xi[0]));
Chris@42 557 Td = LD(&(xi[WS(is, 18)]), ivs, &(xi[0]));
Chris@42 558 Te = VSUB(Tc, Td);
Chris@42 559 T26 = VADD(Tc, Td);
Chris@42 560 Tm = LD(&(xi[WS(is, 30)]), ivs, &(xi[0]));
Chris@42 561 Tn = LD(&(xi[WS(is, 14)]), ivs, &(xi[0]));
Chris@42 562 To = VSUB(Tm, Tn);
Chris@42 563 T29 = VADD(Tm, Tn);
Chris@42 564 }
Chris@42 565 {
Chris@42 566 V Tf, Tg, Tj, Tk;
Chris@42 567 Tf = LD(&(xi[WS(is, 10)]), ivs, &(xi[0]));
Chris@42 568 Tg = LD(&(xi[WS(is, 26)]), ivs, &(xi[0]));
Chris@42 569 Th = VSUB(Tf, Tg);
Chris@42 570 T27 = VADD(Tf, Tg);
Chris@42 571 Tj = LD(&(xi[WS(is, 6)]), ivs, &(xi[0]));
Chris@42 572 Tk = LD(&(xi[WS(is, 22)]), ivs, &(xi[0]));
Chris@42 573 Tl = VSUB(Tj, Tk);
Chris@42 574 T2a = VADD(Tj, Tk);
Chris@42 575 }
Chris@42 576 T2I = VADD(T26, T27);
Chris@42 577 T2J = VADD(T29, T2a);
Chris@42 578 Ti = VFMA(LDK(KP382683432), Te, VMUL(LDK(KP923879532), Th));
Chris@42 579 Tp = VFNMS(LDK(KP382683432), To, VMUL(LDK(KP923879532), Tl));
Chris@42 580 Tq = VSUB(Ti, Tp);
Chris@42 581 T1A = VADD(Ti, Tp);
Chris@42 582 {
Chris@42 583 V T12, T13, T28, T2b;
Chris@42 584 T12 = VFNMS(LDK(KP382683432), Th, VMUL(LDK(KP923879532), Te));
Chris@42 585 T13 = VFMA(LDK(KP923879532), To, VMUL(LDK(KP382683432), Tl));
Chris@42 586 T14 = VSUB(T12, T13);
Chris@42 587 T1q = VADD(T12, T13);
Chris@42 588 T28 = VSUB(T26, T27);
Chris@42 589 T2b = VSUB(T29, T2a);
Chris@42 590 T2c = VMUL(LDK(KP707106781), VSUB(T28, T2b));
Chris@42 591 T2l = VMUL(LDK(KP707106781), VADD(T28, T2b));
Chris@42 592 }
Chris@42 593 }
/* Output stage. T31..T3c (and T3d..T3g further down) are declared in enclosing blocks because each is passed to STM2 at an offset that is a multiple of 4 and later passed again to STN2 from a different nested block. */
Chris@42 594 {
Chris@42 595 V T31, T32, T33, T34, T35, T36, T37, T38, T39, T3a, T3b, T3c;
Chris@42 596 {
Chris@42 597 V T2L, T2R, T2Q, T2S;
Chris@42 598 {
Chris@42 599 V T2H, T2K, T2O, T2P;
Chris@42 600 T2H = VMUL(LDK(KP707106781), VSUB(T2D, T2G));
Chris@42 601 T2K = VSUB(T2I, T2J);
Chris@42 602 T2L = VBYI(VSUB(T2H, T2K));
Chris@42 603 T2R = VBYI(VADD(T2K, T2H));
Chris@42 604 T2O = VSUB(T2M, T2N);
Chris@42 605 T2P = VMUL(LDK(KP707106781), VADD(T2D, T2G));
Chris@42 606 T2Q = VSUB(T2O, T2P);
Chris@42 607 T2S = VADD(T2O, T2P);
Chris@42 608 }
Chris@42 609 T31 = VADD(T2L, T2Q);
Chris@42 610 STM2(&(xo[24]), T31, ovs, &(xo[0]));
Chris@42 611 T32 = VSUB(T2S, T2R);
Chris@42 612 STM2(&(xo[56]), T32, ovs, &(xo[0]));
Chris@42 613 T33 = VSUB(T2Q, T2L);
Chris@42 614 STM2(&(xo[40]), T33, ovs, &(xo[0]));
Chris@42 615 T34 = VADD(T2R, T2S);
Chris@42 616 STM2(&(xo[8]), T34, ovs, &(xo[0]));
Chris@42 617 }
Chris@42 618 {
Chris@42 619 V T2h, T2r, T2q, T2s;
Chris@42 620 {
Chris@42 621 V T25, T2g, T2m, T2p;
Chris@42 622 T25 = VSUB(T1X, T24);
Chris@42 623 T2g = VSUB(T2c, T2f);
Chris@42 624 T2h = VBYI(VSUB(T25, T2g));
Chris@42 625 T2r = VBYI(VADD(T2g, T25));
Chris@42 626 T2m = VSUB(T2k, T2l);
Chris@42 627 T2p = VSUB(T2n, T2o);
Chris@42 628 T2q = VSUB(T2m, T2p);
Chris@42 629 T2s = VADD(T2m, T2p);
Chris@42 630 }
Chris@42 631 T35 = VADD(T2h, T2q);
Chris@42 632 STM2(&(xo[20]), T35, ovs, &(xo[0]));
Chris@42 633 T36 = VSUB(T2s, T2r);
Chris@42 634 STM2(&(xo[52]), T36, ovs, &(xo[0]));
Chris@42 635 T37 = VSUB(T2q, T2h);
Chris@42 636 STM2(&(xo[44]), T37, ovs, &(xo[0]));
Chris@42 637 T38 = VADD(T2r, T2s);
Chris@42 638 STM2(&(xo[12]), T38, ovs, &(xo[0]));
Chris@42 639 }
Chris@42 640 {
Chris@42 641 V T2V, T2Z, T2Y, T30;
Chris@42 642 {
Chris@42 643 V T2T, T2U, T2W, T2X;
Chris@42 644 T2T = VADD(T2M, T2N);
Chris@42 645 T2U = VADD(T2I, T2J);
Chris@42 646 T2V = VSUB(T2T, T2U);
Chris@42 647 T2Z = VADD(T2T, T2U);
Chris@42 648 T2W = VADD(T2B, T2C);
Chris@42 649 T2X = VADD(T2E, T2F);
Chris@42 650 T2Y = VBYI(VSUB(T2W, T2X));
Chris@42 651 T30 = VADD(T2W, T2X);
Chris@42 652 }
Chris@42 653 T39 = VSUB(T2V, T2Y);
Chris@42 654 STM2(&(xo[48]), T39, ovs, &(xo[0]));
Chris@42 655 T3a = VADD(T2Z, T30);
Chris@42 656 STM2(&(xo[0]), T3a, ovs, &(xo[0]));
Chris@42 657 T3b = VADD(T2V, T2Y);
Chris@42 658 STM2(&(xo[16]), T3b, ovs, &(xo[0]));
Chris@42 659 T3c = VSUB(T2Z, T30);
Chris@42 660 STM2(&(xo[32]), T3c, ovs, &(xo[0]));
Chris@42 661 }
Chris@42 662 {
Chris@42 663 V T3d, T3e, T3f, T3g;
Chris@42 664 {
Chris@42 665 V T2v, T2z, T2y, T2A;
Chris@42 666 {
Chris@42 667 V T2t, T2u, T2w, T2x;
Chris@42 668 T2t = VADD(T2k, T2l);
Chris@42 669 T2u = VADD(T1X, T24);
Chris@42 670 T2v = VADD(T2t, T2u);
Chris@42 671 T2z = VSUB(T2t, T2u);
Chris@42 672 T2w = VADD(T2f, T2c);
Chris@42 673 T2x = VADD(T2n, T2o);
Chris@42 674 T2y = VBYI(VADD(T2w, T2x));
Chris@42 675 T2A = VBYI(VSUB(T2x, T2w));
Chris@42 676 }
Chris@42 677 T3d = VSUB(T2v, T2y);
Chris@42 678 STM2(&(xo[60]), T3d, ovs, &(xo[0]));
Chris@42 679 T3e = VADD(T2z, T2A);
Chris@42 680 STM2(&(xo[28]), T3e, ovs, &(xo[0]));
Chris@42 681 T3f = VADD(T2v, T2y);
Chris@42 682 STM2(&(xo[4]), T3f, ovs, &(xo[0]));
Chris@42 683 T3g = VSUB(T2z, T2A);
Chris@42 684 STM2(&(xo[36]), T3g, ovs, &(xo[0]));
Chris@42 685 }
/* From here on, each new value goes to STM2 at offset 4k+2 and is then passed to STN2 together with the value stored earlier at offset 4k. */
Chris@42 686 {
Chris@42 687 V T1r, T1C, T1M, T1K, T1F, T1N, T1y, T1J;
Chris@42 688 T1r = VSUB(T1p, T1q);
Chris@42 689 T1C = VSUB(T1A, T1B);
Chris@42 690 T1M = VADD(T1p, T1q);
Chris@42 691 T1K = VADD(T1B, T1A);
Chris@42 692 {
Chris@42 693 V T1D, T1E, T1u, T1x;
Chris@42 694 T1D = VFNMS(LDK(KP195090322), T1s, VMUL(LDK(KP980785280), T1t));
Chris@42 695 T1E = VFMA(LDK(KP195090322), T1v, VMUL(LDK(KP980785280), T1w));
Chris@42 696 T1F = VSUB(T1D, T1E);
Chris@42 697 T1N = VADD(T1D, T1E);
Chris@42 698 T1u = VFMA(LDK(KP980785280), T1s, VMUL(LDK(KP195090322), T1t));
Chris@42 699 T1x = VFNMS(LDK(KP195090322), T1w, VMUL(LDK(KP980785280), T1v));
Chris@42 700 T1y = VSUB(T1u, T1x);
Chris@42 701 T1J = VADD(T1u, T1x);
Chris@42 702 }
Chris@42 703 {
Chris@42 704 V T1z, T1G, T3h, T3i;
Chris@42 705 T1z = VADD(T1r, T1y);
Chris@42 706 T1G = VBYI(VADD(T1C, T1F));
Chris@42 707 T3h = VSUB(T1z, T1G);
Chris@42 708 STM2(&(xo[50]), T3h, ovs, &(xo[2]));
Chris@42 709 STN2(&(xo[48]), T39, T3h, ovs);
Chris@42 710 T3i = VADD(T1z, T1G);
Chris@42 711 STM2(&(xo[14]), T3i, ovs, &(xo[2]));
Chris@42 712 STN2(&(xo[12]), T38, T3i, ovs);
Chris@42 713 }
Chris@42 714 {
Chris@42 715 V T1P, T1Q, T3j, T3k;
Chris@42 716 T1P = VBYI(VADD(T1K, T1J));
Chris@42 717 T1Q = VADD(T1M, T1N);
Chris@42 718 T3j = VADD(T1P, T1Q);
Chris@42 719 STM2(&(xo[2]), T3j, ovs, &(xo[2]));
Chris@42 720 STN2(&(xo[0]), T3a, T3j, ovs);
Chris@42 721 T3k = VSUB(T1Q, T1P);
Chris@42 722 STM2(&(xo[62]), T3k, ovs, &(xo[2]));
Chris@42 723 STN2(&(xo[60]), T3d, T3k, ovs);
Chris@42 724 }
Chris@42 725 {
Chris@42 726 V T1H, T1I, T3l, T3m;
Chris@42 727 T1H = VSUB(T1r, T1y);
Chris@42 728 T1I = VBYI(VSUB(T1F, T1C));
Chris@42 729 T3l = VSUB(T1H, T1I);
Chris@42 730 STM2(&(xo[46]), T3l, ovs, &(xo[2]));
Chris@42 731 STN2(&(xo[44]), T37, T3l, ovs);
Chris@42 732 T3m = VADD(T1H, T1I);
Chris@42 733 STM2(&(xo[18]), T3m, ovs, &(xo[2]));
Chris@42 734 STN2(&(xo[16]), T3b, T3m, ovs);
Chris@42 735 }
Chris@42 736 {
Chris@42 737 V T1L, T1O, T3n, T3o;
Chris@42 738 T1L = VBYI(VSUB(T1J, T1K));
Chris@42 739 T1O = VSUB(T1M, T1N);
Chris@42 740 T3n = VADD(T1L, T1O);
Chris@42 741 STM2(&(xo[30]), T3n, ovs, &(xo[2]));
Chris@42 742 STN2(&(xo[28]), T3e, T3n, ovs);
Chris@42 743 T3o = VSUB(T1O, T1L);
Chris@42 744 STM2(&(xo[34]), T3o, ovs, &(xo[2]));
Chris@42 745 STN2(&(xo[32]), T3c, T3o, ovs);
Chris@42 746 }
Chris@42 747 }
Chris@42 748 {
Chris@42 749 V Tr, T1a, T1k, T1i, T1d, T1l, T10, T1h;
Chris@42 750 Tr = VSUB(Tb, Tq);
Chris@42 751 T1a = VSUB(T14, T19);
Chris@42 752 T1k = VADD(Tb, Tq);
Chris@42 753 T1i = VADD(T19, T14);
Chris@42 754 {
Chris@42 755 V T1b, T1c, TI, TZ;
Chris@42 756 T1b = VFNMS(LDK(KP555570233), TC, VMUL(LDK(KP831469612), TH));
Chris@42 757 T1c = VFMA(LDK(KP555570233), TT, VMUL(LDK(KP831469612), TY));
Chris@42 758 T1d = VSUB(T1b, T1c);
Chris@42 759 T1l = VADD(T1b, T1c);
Chris@42 760 TI = VFMA(LDK(KP831469612), TC, VMUL(LDK(KP555570233), TH));
Chris@42 761 TZ = VFNMS(LDK(KP555570233), TY, VMUL(LDK(KP831469612), TT));
Chris@42 762 T10 = VSUB(TI, TZ);
Chris@42 763 T1h = VADD(TI, TZ);
Chris@42 764 }
Chris@42 765 {
Chris@42 766 V T11, T1e, T3p, T3q;
Chris@42 767 T11 = VADD(Tr, T10);
Chris@42 768 T1e = VBYI(VADD(T1a, T1d));
Chris@42 769 T3p = VSUB(T11, T1e);
Chris@42 770 STM2(&(xo[54]), T3p, ovs, &(xo[2]));
Chris@42 771 STN2(&(xo[52]), T36, T3p, ovs);
Chris@42 772 T3q = VADD(T11, T1e);
Chris@42 773 STM2(&(xo[10]), T3q, ovs, &(xo[2]));
Chris@42 774 STN2(&(xo[8]), T34, T3q, ovs);
Chris@42 775 }
Chris@42 776 {
Chris@42 777 V T1n, T1o, T3r, T3s;
Chris@42 778 T1n = VBYI(VADD(T1i, T1h));
Chris@42 779 T1o = VADD(T1k, T1l);
Chris@42 780 T3r = VADD(T1n, T1o);
Chris@42 781 STM2(&(xo[6]), T3r, ovs, &(xo[2]));
Chris@42 782 STN2(&(xo[4]), T3f, T3r, ovs);
Chris@42 783 T3s = VSUB(T1o, T1n);
Chris@42 784 STM2(&(xo[58]), T3s, ovs, &(xo[2]));
Chris@42 785 STN2(&(xo[56]), T32, T3s, ovs);
Chris@42 786 }
Chris@42 787 {
Chris@42 788 V T1f, T1g, T3t, T3u;
Chris@42 789 T1f = VSUB(Tr, T10);
Chris@42 790 T1g = VBYI(VSUB(T1d, T1a));
Chris@42 791 T3t = VSUB(T1f, T1g);
Chris@42 792 STM2(&(xo[42]), T3t, ovs, &(xo[2]));
Chris@42 793 STN2(&(xo[40]), T33, T3t, ovs);
Chris@42 794 T3u = VADD(T1f, T1g);
Chris@42 795 STM2(&(xo[22]), T3u, ovs, &(xo[2]));
Chris@42 796 STN2(&(xo[20]), T35, T3u, ovs);
Chris@42 797 }
Chris@42 798 {
Chris@42 799 V T1j, T1m, T3v, T3w;
Chris@42 800 T1j = VBYI(VSUB(T1h, T1i));
Chris@42 801 T1m = VSUB(T1k, T1l);
Chris@42 802 T3v = VADD(T1j, T1m);
Chris@42 803 STM2(&(xo[26]), T3v, ovs, &(xo[2]));
Chris@42 804 STN2(&(xo[24]), T31, T3v, ovs);
Chris@42 805 T3w = VSUB(T1m, T1j);
Chris@42 806 STM2(&(xo[38]), T3w, ovs, &(xo[2]));
Chris@42 807 STN2(&(xo[36]), T3g, T3w, ovs);
Chris@42 808 }
Chris@42 809 }
Chris@42 810 }
Chris@42 811 }
Chris@42 812 }
Chris@42 813 }
Chris@42 814 VLEAVE();
Chris@42 815 }
Chris@42 816
/* Descriptor handed to X(kdft_register) below (non-FMA variant). The leading 32 and the name string match this codelet, and {170, 26, 16, 0} matches the "170 additions, 26 multiplications, 16 fused multiply/add" counts quoted in the generator comment above. Field meanings come from kdft_desc, declared outside this file -- TODO confirm there. */
Chris@42 817 static const kdft_desc desc = { 32, XSIMD_STRING("n2bv_32"), {170, 26, 16, 0}, &GENUS, 0, 2, 0, 0 };
Chris@42 818
/* Registration entry point (non-FMA variant): passes the n2bv_32 function and its descriptor to planner p through X(kdft_register). */
Chris@42 819 void XSIMD(codelet_n2bv_32) (planner *p) {
Chris@42 820 X(kdft_register) (p, n2bv_32, &desc);
Chris@42 821 }
Chris@42 822
Chris@42 823 #endif /* HAVE_FMA */