annotate src/fftw-3.3.8/dft/simd/common/n2fv_32.c @ 83:ae30d91d2ffe

Replace these with versions built using an older toolset (so as to avoid ABI incompatibilities when linking on Ubuntu 14.04 for packaging purposes)
author Chris Cannam
date Fri, 07 Feb 2020 11:51:13 +0000
parents d0c2a83c1364
children
rev   line source
Chris@82 1 /*
Chris@82 2 * Copyright (c) 2003, 2007-14 Matteo Frigo
Chris@82 3 * Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
Chris@82 4 *
Chris@82 5 * This program is free software; you can redistribute it and/or modify
Chris@82 6 * it under the terms of the GNU General Public License as published by
Chris@82 7 * the Free Software Foundation; either version 2 of the License, or
Chris@82 8 * (at your option) any later version.
Chris@82 9 *
Chris@82 10 * This program is distributed in the hope that it will be useful,
Chris@82 11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
Chris@82 12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
Chris@82 13 * GNU General Public License for more details.
Chris@82 14 *
Chris@82 15 * You should have received a copy of the GNU General Public License
Chris@82 16 * along with this program; if not, write to the Free Software
Chris@82 17 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
Chris@82 18 *
Chris@82 19 */
Chris@82 20
Chris@82 21 /* This file was automatically generated --- DO NOT EDIT */
Chris@82 22 /* Generated on Thu May 24 08:05:08 EDT 2018 */
Chris@82 23
Chris@82 24 #include "dft/codelet-dft.h"
Chris@82 25
Chris@82 26 #if defined(ARCH_PREFERS_FMA) || defined(ISA_EXTENSION_PREFERS_FMA)
Chris@82 27
Chris@82 28 /* Generated by: ../../../genfft/gen_notw_c.native -fma -simd -compact -variables 4 -pipeline-latency 8 -n 32 -name n2fv_32 -with-ostride 2 -include dft/simd/n2f.h -store-multiple 2 */
Chris@82 29
Chris@82 30 /*
Chris@82 31 * This function contains 186 FP additions, 98 FP multiplications,
Chris@82 32 * (or, 88 additions, 0 multiplications, 98 fused multiply/add),
Chris@82 33 * 72 stack variables, 7 constants, and 80 memory accesses
Chris@82 34 */
Chris@82 35 #include "dft/simd/n2f.h"
Chris@82 36
/*
 * n2fv_32 (FMA variant): machine-generated SIMD codelet.  The generator
 * command line recorded above is "gen_notw_c -fma -simd ... -n 32
 * -with-ostride 2 -store-multiple 2".
 * NOTE(review): presumably a size-32 complex DFT kernel without twiddle
 * factors -- confirm against the genfft documentation.
 *
 * What the body visibly does:
 *  - xi starts at ri and xo starts at ro; ii and io are never referenced.
 *  - The loop runs while i > 0, with i starting at v and decreasing by VL;
 *    each step advances xi by VL * ivs and xo by VL * ovs.
 *  - 32 vectors are loaded with LD from xi[WS(is, k)], k = 0..31.
 *  - Results go to the fixed even offsets xo[0], xo[2], ..., xo[62]; os is
 *    only passed to MAKE_VOLATILE_STRIDE and is not used to index xo.
 *  - VLEAVE() is invoked once after the loop.
 *
 * Constants (numerically, from the literals below):
 *  KP707106781 ~ sqrt(1/2),    KP414213562 ~ tan(pi/8),
 *  KP923879532 ~ cos(pi/8),    KP668178637 ~ tan(3 pi/16),
 *  KP831469612 ~ cos(3 pi/16), KP198912367 ~ tan(pi/16),
 *  KP980785280 ~ cos(pi/16).
 *
 * NOTE(review): the semantics of LD, STM2, STN2, VFMA, VFNMS, VFMAI and
 * VFNMSI come from the SIMD headers, which are not visible here.
 */
Chris@82 37 static void n2fv_32(const R *ri, const R *ii, R *ro, R *io, stride is, stride os, INT v, INT ivs, INT ovs)
Chris@82 38 {
Chris@82 39 DVK(KP980785280, +0.980785280403230449126182236134239036973933731);
Chris@82 40 DVK(KP198912367, +0.198912367379658006911597622644676228597850501);
Chris@82 41 DVK(KP831469612, +0.831469612302545237078788377617905756738560812);
Chris@82 42 DVK(KP668178637, +0.668178637919298919997757686523080761552472251);
Chris@82 43 DVK(KP923879532, +0.923879532511286756128183189396788286822416626);
Chris@82 44 DVK(KP414213562, +0.414213562373095048801688724209698078569671875);
Chris@82 45 DVK(KP707106781, +0.707106781186547524400844362104849039284835938);
Chris@82 46 {
Chris@82 47 INT i;
Chris@82 48 const R *xi;
Chris@82 49 R *xo;
Chris@82 50 xi = ri;
Chris@82 51 xo = ro;
Chris@82 52 for (i = v; i > 0; i = i - VL, xi = xi + (VL * ivs), xo = xo + (VL * ovs), MAKE_VOLATILE_STRIDE(64, is), MAKE_VOLATILE_STRIDE(64, os)) {
Chris@82 53 V T1T, T1W, T2K, T2x, T16, T1A, Tb, T1p, TT, T1v, TY, T1w, T27, T2a, T2b;
Chris@82 54 V T2H, T2N, TC, T1s, TH, T1t, T20, T23, T24, T2E, T2O, T2g, T2j, Tq, T1B;
Chris@82 55 V T19, T1q, T2A, T2L;
/* Inputs 0, 16, 8, 24, 4, 20, 28, 12: pairwise sums (VADD) and differences (VSUB), then combinations scaled by KP707106781. */
Chris@82 56 {
Chris@82 57 V T3, T1R, T14, T1S, T6, T1U, T9, T1V, T15, Ta;
Chris@82 58 {
Chris@82 59 V T1, T2, T12, T13;
Chris@82 60 T1 = LD(&(xi[0]), ivs, &(xi[0]));
Chris@82 61 T2 = LD(&(xi[WS(is, 16)]), ivs, &(xi[0]));
Chris@82 62 T3 = VSUB(T1, T2);
Chris@82 63 T1R = VADD(T1, T2);
Chris@82 64 T12 = LD(&(xi[WS(is, 8)]), ivs, &(xi[0]));
Chris@82 65 T13 = LD(&(xi[WS(is, 24)]), ivs, &(xi[0]));
Chris@82 66 T14 = VSUB(T12, T13);
Chris@82 67 T1S = VADD(T12, T13);
Chris@82 68 }
Chris@82 69 {
Chris@82 70 V T4, T5, T7, T8;
Chris@82 71 T4 = LD(&(xi[WS(is, 4)]), ivs, &(xi[0]));
Chris@82 72 T5 = LD(&(xi[WS(is, 20)]), ivs, &(xi[0]));
Chris@82 73 T6 = VSUB(T4, T5);
Chris@82 74 T1U = VADD(T4, T5);
Chris@82 75 T7 = LD(&(xi[WS(is, 28)]), ivs, &(xi[0]));
Chris@82 76 T8 = LD(&(xi[WS(is, 12)]), ivs, &(xi[0]));
Chris@82 77 T9 = VSUB(T7, T8);
Chris@82 78 T1V = VADD(T7, T8);
Chris@82 79 }
Chris@82 80 T1T = VADD(T1R, T1S);
Chris@82 81 T1W = VADD(T1U, T1V);
Chris@82 82 T2K = VSUB(T1V, T1U);
Chris@82 83 T2x = VSUB(T1R, T1S);
Chris@82 84 T15 = VSUB(T9, T6);
Chris@82 85 T16 = VFNMS(LDK(KP707106781), T15, T14);
Chris@82 86 T1A = VFMA(LDK(KP707106781), T15, T14);
Chris@82 87 Ta = VADD(T6, T9);
Chris@82 88 Tb = VFMA(LDK(KP707106781), Ta, T3);
Chris@82 89 T1p = VFNMS(LDK(KP707106781), Ta, T3);
Chris@82 90 }
/* Inputs 31, 15, 23, 7, 3, 19, 27, 11. */
Chris@82 91 {
Chris@82 92 V TL, T25, TW, T26, TO, T28, TR, T29;
Chris@82 93 {
Chris@82 94 V TJ, TK, TU, TV;
Chris@82 95 TJ = LD(&(xi[WS(is, 31)]), ivs, &(xi[WS(is, 1)]));
Chris@82 96 TK = LD(&(xi[WS(is, 15)]), ivs, &(xi[WS(is, 1)]));
Chris@82 97 TL = VSUB(TJ, TK);
Chris@82 98 T25 = VADD(TJ, TK);
Chris@82 99 TU = LD(&(xi[WS(is, 23)]), ivs, &(xi[WS(is, 1)]));
Chris@82 100 TV = LD(&(xi[WS(is, 7)]), ivs, &(xi[WS(is, 1)]));
Chris@82 101 TW = VSUB(TU, TV);
Chris@82 102 T26 = VADD(TV, TU);
Chris@82 103 }
Chris@82 104 {
Chris@82 105 V TM, TN, TP, TQ;
Chris@82 106 TM = LD(&(xi[WS(is, 3)]), ivs, &(xi[WS(is, 1)]));
Chris@82 107 TN = LD(&(xi[WS(is, 19)]), ivs, &(xi[WS(is, 1)]));
Chris@82 108 TO = VSUB(TM, TN);
Chris@82 109 T28 = VADD(TM, TN);
Chris@82 110 TP = LD(&(xi[WS(is, 27)]), ivs, &(xi[WS(is, 1)]));
Chris@82 111 TQ = LD(&(xi[WS(is, 11)]), ivs, &(xi[WS(is, 1)]));
Chris@82 112 TR = VSUB(TP, TQ);
Chris@82 113 T29 = VADD(TP, TQ);
Chris@82 114 }
Chris@82 115 {
Chris@82 116 V TS, TX, T2F, T2G;
Chris@82 117 TS = VADD(TO, TR);
Chris@82 118 TT = VFMA(LDK(KP707106781), TS, TL);
Chris@82 119 T1v = VFNMS(LDK(KP707106781), TS, TL);
Chris@82 120 TX = VSUB(TR, TO);
Chris@82 121 TY = VFMA(LDK(KP707106781), TX, TW);
Chris@82 122 T1w = VFNMS(LDK(KP707106781), TX, TW);
Chris@82 123 T27 = VADD(T25, T26);
Chris@82 124 T2a = VADD(T28, T29);
Chris@82 125 T2b = VSUB(T27, T2a);
Chris@82 126 T2F = VSUB(T25, T26);
Chris@82 127 T2G = VSUB(T29, T28);
Chris@82 128 T2H = VFNMS(LDK(KP414213562), T2G, T2F);
Chris@82 129 T2N = VFMA(LDK(KP414213562), T2F, T2G);
Chris@82 130 }
Chris@82 131 }
/* Inputs 1, 17, 9, 25, 5, 21, 29, 13. */
Chris@82 132 {
Chris@82 133 V Tu, T1Y, TF, T1Z, Tx, T21, TA, T22;
Chris@82 134 {
Chris@82 135 V Ts, Tt, TD, TE;
Chris@82 136 Ts = LD(&(xi[WS(is, 1)]), ivs, &(xi[WS(is, 1)]));
Chris@82 137 Tt = LD(&(xi[WS(is, 17)]), ivs, &(xi[WS(is, 1)]));
Chris@82 138 Tu = VSUB(Ts, Tt);
Chris@82 139 T1Y = VADD(Ts, Tt);
Chris@82 140 TD = LD(&(xi[WS(is, 9)]), ivs, &(xi[WS(is, 1)]));
Chris@82 141 TE = LD(&(xi[WS(is, 25)]), ivs, &(xi[WS(is, 1)]));
Chris@82 142 TF = VSUB(TD, TE);
Chris@82 143 T1Z = VADD(TD, TE);
Chris@82 144 }
Chris@82 145 {
Chris@82 146 V Tv, Tw, Ty, Tz;
Chris@82 147 Tv = LD(&(xi[WS(is, 5)]), ivs, &(xi[WS(is, 1)]));
Chris@82 148 Tw = LD(&(xi[WS(is, 21)]), ivs, &(xi[WS(is, 1)]));
Chris@82 149 Tx = VSUB(Tv, Tw);
Chris@82 150 T21 = VADD(Tv, Tw);
Chris@82 151 Ty = LD(&(xi[WS(is, 29)]), ivs, &(xi[WS(is, 1)]));
Chris@82 152 Tz = LD(&(xi[WS(is, 13)]), ivs, &(xi[WS(is, 1)]));
Chris@82 153 TA = VSUB(Ty, Tz);
Chris@82 154 T22 = VADD(Ty, Tz);
Chris@82 155 }
Chris@82 156 {
Chris@82 157 V TB, TG, T2C, T2D;
Chris@82 158 TB = VADD(Tx, TA);
Chris@82 159 TC = VFMA(LDK(KP707106781), TB, Tu);
Chris@82 160 T1s = VFNMS(LDK(KP707106781), TB, Tu);
Chris@82 161 TG = VSUB(Tx, TA);
Chris@82 162 TH = VFMA(LDK(KP707106781), TG, TF);
Chris@82 163 T1t = VFNMS(LDK(KP707106781), TG, TF);
Chris@82 164 T20 = VADD(T1Y, T1Z);
Chris@82 165 T23 = VADD(T21, T22);
Chris@82 166 T24 = VSUB(T20, T23);
Chris@82 167 T2C = VSUB(T1Y, T1Z);
Chris@82 168 T2D = VSUB(T21, T22);
Chris@82 169 T2E = VFNMS(LDK(KP414213562), T2D, T2C);
Chris@82 170 T2O = VFMA(LDK(KP414213562), T2C, T2D);
Chris@82 171 }
Chris@82 172 }
/* Inputs 2, 18, 22, 6, 10, 26, 30, 14. */
Chris@82 173 {
Chris@82 174 V Te, T2h, To, T2f, Th, T2i, Tl, T2e, Ti, Tp;
Chris@82 175 {
Chris@82 176 V Tc, Td, Tm, Tn;
Chris@82 177 Tc = LD(&(xi[WS(is, 2)]), ivs, &(xi[0]));
Chris@82 178 Td = LD(&(xi[WS(is, 18)]), ivs, &(xi[0]));
Chris@82 179 Te = VSUB(Tc, Td);
Chris@82 180 T2h = VADD(Tc, Td);
Chris@82 181 Tm = LD(&(xi[WS(is, 22)]), ivs, &(xi[0]));
Chris@82 182 Tn = LD(&(xi[WS(is, 6)]), ivs, &(xi[0]));
Chris@82 183 To = VSUB(Tm, Tn);
Chris@82 184 T2f = VADD(Tn, Tm);
Chris@82 185 }
Chris@82 186 {
Chris@82 187 V Tf, Tg, Tj, Tk;
Chris@82 188 Tf = LD(&(xi[WS(is, 10)]), ivs, &(xi[0]));
Chris@82 189 Tg = LD(&(xi[WS(is, 26)]), ivs, &(xi[0]));
Chris@82 190 Th = VSUB(Tf, Tg);
Chris@82 191 T2i = VADD(Tf, Tg);
Chris@82 192 Tj = LD(&(xi[WS(is, 30)]), ivs, &(xi[0]));
Chris@82 193 Tk = LD(&(xi[WS(is, 14)]), ivs, &(xi[0]));
Chris@82 194 Tl = VSUB(Tj, Tk);
Chris@82 195 T2e = VADD(Tj, Tk);
Chris@82 196 }
Chris@82 197 T2g = VADD(T2e, T2f);
Chris@82 198 T2j = VADD(T2h, T2i);
Chris@82 199 Ti = VFNMS(LDK(KP414213562), Th, Te);
Chris@82 200 Tp = VFNMS(LDK(KP414213562), To, Tl);
Chris@82 201 Tq = VADD(Ti, Tp);
Chris@82 202 T1B = VSUB(Tp, Ti);
Chris@82 203 {
Chris@82 204 V T17, T18, T2y, T2z;
Chris@82 205 T17 = VFMA(LDK(KP414213562), Te, Th);
Chris@82 206 T18 = VFMA(LDK(KP414213562), Tl, To);
Chris@82 207 T19 = VSUB(T17, T18);
Chris@82 208 T1q = VADD(T17, T18);
Chris@82 209 T2y = VSUB(T2h, T2i);
Chris@82 210 T2z = VSUB(T2e, T2f);
Chris@82 211 T2A = VADD(T2y, T2z);
Chris@82 212 T2L = VSUB(T2z, T2y);
Chris@82 213 }
Chris@82 214 }
/*
 * Output stage.  No further loads happen from here on.  T31..T3c (and
 * T3d..T3g below) are each stored with STM2 at an offset xo[4j]; the value
 * for xo[4j + 2] is computed later, stored with STM2, and then both values
 * are passed together to STN2(&(xo[4j]), ...).
 */
Chris@82 215 {
Chris@82 216 V T31, T32, T33, T34, T35, T36, T37, T38, T39, T3a, T3b, T3c;
Chris@82 217 {
Chris@82 218 V T2d, T2n, T2m, T2o;
Chris@82 219 {
Chris@82 220 V T1X, T2c, T2k, T2l;
Chris@82 221 T1X = VSUB(T1T, T1W);
Chris@82 222 T2c = VADD(T24, T2b);
Chris@82 223 T2d = VFNMS(LDK(KP707106781), T2c, T1X);
Chris@82 224 T2n = VFMA(LDK(KP707106781), T2c, T1X);
Chris@82 225 T2k = VSUB(T2g, T2j);
Chris@82 226 T2l = VSUB(T2b, T24);
Chris@82 227 T2m = VFNMS(LDK(KP707106781), T2l, T2k);
Chris@82 228 T2o = VFMA(LDK(KP707106781), T2l, T2k);
Chris@82 229 }
Chris@82 230 T31 = VFNMSI(T2m, T2d);
Chris@82 231 STM2(&(xo[24]), T31, ovs, &(xo[0]));
Chris@82 232 T32 = VFMAI(T2o, T2n);
Chris@82 233 STM2(&(xo[8]), T32, ovs, &(xo[0]));
Chris@82 234 T33 = VFMAI(T2m, T2d);
Chris@82 235 STM2(&(xo[40]), T33, ovs, &(xo[0]));
Chris@82 236 T34 = VFNMSI(T2o, T2n);
Chris@82 237 STM2(&(xo[56]), T34, ovs, &(xo[0]));
Chris@82 238 }
Chris@82 239 {
Chris@82 240 V T2r, T2v, T2u, T2w;
Chris@82 241 {
Chris@82 242 V T2p, T2q, T2s, T2t;
Chris@82 243 T2p = VADD(T1T, T1W);
Chris@82 244 T2q = VADD(T2j, T2g);
Chris@82 245 T2r = VADD(T2p, T2q);
Chris@82 246 T2v = VSUB(T2p, T2q);
Chris@82 247 T2s = VADD(T20, T23);
Chris@82 248 T2t = VADD(T27, T2a);
Chris@82 249 T2u = VADD(T2s, T2t);
Chris@82 250 T2w = VSUB(T2t, T2s);
Chris@82 251 }
Chris@82 252 T35 = VSUB(T2r, T2u);
Chris@82 253 STM2(&(xo[32]), T35, ovs, &(xo[0]));
Chris@82 254 T36 = VFMAI(T2w, T2v);
Chris@82 255 STM2(&(xo[16]), T36, ovs, &(xo[0]));
Chris@82 256 T37 = VADD(T2r, T2u);
Chris@82 257 STM2(&(xo[0]), T37, ovs, &(xo[0]));
Chris@82 258 T38 = VFNMSI(T2w, T2v);
Chris@82 259 STM2(&(xo[48]), T38, ovs, &(xo[0]));
Chris@82 260 }
Chris@82 261 {
Chris@82 262 V T2V, T2Z, T2Y, T30;
Chris@82 263 {
Chris@82 264 V T2T, T2U, T2W, T2X;
Chris@82 265 T2T = VFNMS(LDK(KP707106781), T2A, T2x);
Chris@82 266 T2U = VADD(T2O, T2N);
Chris@82 267 T2V = VFNMS(LDK(KP923879532), T2U, T2T);
Chris@82 268 T2Z = VFMA(LDK(KP923879532), T2U, T2T);
Chris@82 269 T2W = VFNMS(LDK(KP707106781), T2L, T2K);
Chris@82 270 T2X = VSUB(T2H, T2E);
Chris@82 271 T2Y = VFMA(LDK(KP923879532), T2X, T2W);
Chris@82 272 T30 = VFNMS(LDK(KP923879532), T2X, T2W);
Chris@82 273 }
Chris@82 274 T39 = VFMAI(T2Y, T2V);
Chris@82 275 STM2(&(xo[20]), T39, ovs, &(xo[0]));
Chris@82 276 T3a = VFMAI(T30, T2Z);
Chris@82 277 STM2(&(xo[52]), T3a, ovs, &(xo[0]));
Chris@82 278 T3b = VFNMSI(T2Y, T2V);
Chris@82 279 STM2(&(xo[44]), T3b, ovs, &(xo[0]));
Chris@82 280 T3c = VFNMSI(T30, T2Z);
Chris@82 281 STM2(&(xo[12]), T3c, ovs, &(xo[0]));
Chris@82 282 }
Chris@82 283 {
Chris@82 284 V T3d, T3e, T3f, T3g;
Chris@82 285 {
Chris@82 286 V T2J, T2R, T2Q, T2S;
Chris@82 287 {
Chris@82 288 V T2B, T2I, T2M, T2P;
Chris@82 289 T2B = VFMA(LDK(KP707106781), T2A, T2x);
Chris@82 290 T2I = VADD(T2E, T2H);
Chris@82 291 T2J = VFNMS(LDK(KP923879532), T2I, T2B);
Chris@82 292 T2R = VFMA(LDK(KP923879532), T2I, T2B);
Chris@82 293 T2M = VFMA(LDK(KP707106781), T2L, T2K);
Chris@82 294 T2P = VSUB(T2N, T2O);
Chris@82 295 T2Q = VFNMS(LDK(KP923879532), T2P, T2M);
Chris@82 296 T2S = VFMA(LDK(KP923879532), T2P, T2M);
Chris@82 297 }
Chris@82 298 T3d = VFNMSI(T2Q, T2J);
Chris@82 299 STM2(&(xo[28]), T3d, ovs, &(xo[0]));
Chris@82 300 T3e = VFMAI(T2S, T2R);
Chris@82 301 STM2(&(xo[4]), T3e, ovs, &(xo[0]));
Chris@82 302 T3f = VFMAI(T2Q, T2J);
Chris@82 303 STM2(&(xo[36]), T3f, ovs, &(xo[0]));
Chris@82 304 T3g = VFNMSI(T2S, T2R);
Chris@82 305 STM2(&(xo[60]), T3g, ovs, &(xo[0]));
Chris@82 306 }
/* Outputs built from T1p..T1w with KP923879532, KP668178637 and KP831469612; each pair of stores completes one STN2 group. */
Chris@82 307 {
Chris@82 308 V T1r, T1C, T1M, T1J, T1F, T1K, T1y, T1N;
Chris@82 309 T1r = VFMA(LDK(KP923879532), T1q, T1p);
Chris@82 310 T1C = VFMA(LDK(KP923879532), T1B, T1A);
Chris@82 311 T1M = VFNMS(LDK(KP923879532), T1B, T1A);
Chris@82 312 T1J = VFNMS(LDK(KP923879532), T1q, T1p);
Chris@82 313 {
Chris@82 314 V T1D, T1E, T1u, T1x;
Chris@82 315 T1D = VFNMS(LDK(KP668178637), T1s, T1t);
Chris@82 316 T1E = VFNMS(LDK(KP668178637), T1v, T1w);
Chris@82 317 T1F = VSUB(T1D, T1E);
Chris@82 318 T1K = VADD(T1D, T1E);
Chris@82 319 T1u = VFMA(LDK(KP668178637), T1t, T1s);
Chris@82 320 T1x = VFMA(LDK(KP668178637), T1w, T1v);
Chris@82 321 T1y = VADD(T1u, T1x);
Chris@82 322 T1N = VSUB(T1x, T1u);
Chris@82 323 }
Chris@82 324 {
Chris@82 325 V T1z, T1G, T3h, T3i;
Chris@82 326 T1z = VFNMS(LDK(KP831469612), T1y, T1r);
Chris@82 327 T1G = VFNMS(LDK(KP831469612), T1F, T1C);
Chris@82 328 T3h = VFNMSI(T1G, T1z);
Chris@82 329 STM2(&(xo[26]), T3h, ovs, &(xo[2]));
Chris@82 330 STN2(&(xo[24]), T31, T3h, ovs);
Chris@82 331 T3i = VFMAI(T1G, T1z);
Chris@82 332 STM2(&(xo[38]), T3i, ovs, &(xo[2]));
Chris@82 333 STN2(&(xo[36]), T3f, T3i, ovs);
Chris@82 334 }
Chris@82 335 {
Chris@82 336 V T1P, T1Q, T3j, T3k;
Chris@82 337 T1P = VFNMS(LDK(KP831469612), T1K, T1J);
Chris@82 338 T1Q = VFNMS(LDK(KP831469612), T1N, T1M);
Chris@82 339 T3j = VFNMSI(T1Q, T1P);
Chris@82 340 STM2(&(xo[10]), T3j, ovs, &(xo[2]));
Chris@82 341 STN2(&(xo[8]), T32, T3j, ovs);
Chris@82 342 T3k = VFMAI(T1Q, T1P);
Chris@82 343 STM2(&(xo[54]), T3k, ovs, &(xo[2]));
Chris@82 344 STN2(&(xo[52]), T3a, T3k, ovs);
Chris@82 345 }
Chris@82 346 {
Chris@82 347 V T1H, T1I, T3l, T3m;
Chris@82 348 T1H = VFMA(LDK(KP831469612), T1y, T1r);
Chris@82 349 T1I = VFMA(LDK(KP831469612), T1F, T1C);
Chris@82 350 T3l = VFNMSI(T1I, T1H);
Chris@82 351 STM2(&(xo[58]), T3l, ovs, &(xo[2]));
Chris@82 352 STN2(&(xo[56]), T34, T3l, ovs);
Chris@82 353 T3m = VFMAI(T1I, T1H);
Chris@82 354 STM2(&(xo[6]), T3m, ovs, &(xo[2]));
Chris@82 355 STN2(&(xo[4]), T3e, T3m, ovs);
Chris@82 356 }
Chris@82 357 {
Chris@82 358 V T1L, T1O, T3n, T3o;
Chris@82 359 T1L = VFMA(LDK(KP831469612), T1K, T1J);
Chris@82 360 T1O = VFMA(LDK(KP831469612), T1N, T1M);
Chris@82 361 T3n = VFMAI(T1O, T1L);
Chris@82 362 STM2(&(xo[22]), T3n, ovs, &(xo[2]));
Chris@82 363 STN2(&(xo[20]), T39, T3n, ovs);
Chris@82 364 T3o = VFNMSI(T1O, T1L);
Chris@82 365 STM2(&(xo[42]), T3o, ovs, &(xo[2]));
Chris@82 366 STN2(&(xo[40]), T33, T3o, ovs);
Chris@82 367 }
Chris@82 368 }
/* Outputs built from Tb, Tq, T16, T19, TC, TH, TT, TY with KP923879532, KP198912367 and KP980785280. */
Chris@82 369 {
Chris@82 370 V Tr, T1a, T1k, T1h, T1d, T1i, T10, T1l;
Chris@82 371 Tr = VFMA(LDK(KP923879532), Tq, Tb);
Chris@82 372 T1a = VFMA(LDK(KP923879532), T19, T16);
Chris@82 373 T1k = VFNMS(LDK(KP923879532), T19, T16);
Chris@82 374 T1h = VFNMS(LDK(KP923879532), Tq, Tb);
Chris@82 375 {
Chris@82 376 V T1b, T1c, TI, TZ;
Chris@82 377 T1b = VFMA(LDK(KP198912367), TC, TH);
Chris@82 378 T1c = VFMA(LDK(KP198912367), TT, TY);
Chris@82 379 T1d = VSUB(T1b, T1c);
Chris@82 380 T1i = VADD(T1b, T1c);
Chris@82 381 TI = VFNMS(LDK(KP198912367), TH, TC);
Chris@82 382 TZ = VFNMS(LDK(KP198912367), TY, TT);
Chris@82 383 T10 = VADD(TI, TZ);
Chris@82 384 T1l = VSUB(TZ, TI);
Chris@82 385 }
Chris@82 386 {
Chris@82 387 V T11, T1e, T3p, T3q;
Chris@82 388 T11 = VFNMS(LDK(KP980785280), T10, Tr);
Chris@82 389 T1e = VFNMS(LDK(KP980785280), T1d, T1a);
Chris@82 390 T3p = VFNMSI(T1e, T11);
Chris@82 391 STM2(&(xo[34]), T3p, ovs, &(xo[2]));
Chris@82 392 STN2(&(xo[32]), T35, T3p, ovs);
Chris@82 393 T3q = VFMAI(T1e, T11);
Chris@82 394 STM2(&(xo[30]), T3q, ovs, &(xo[2]));
Chris@82 395 STN2(&(xo[28]), T3d, T3q, ovs);
Chris@82 396 }
Chris@82 397 {
Chris@82 398 V T1n, T1o, T3r, T3s;
Chris@82 399 T1n = VFMA(LDK(KP980785280), T1i, T1h);
Chris@82 400 T1o = VFMA(LDK(KP980785280), T1l, T1k);
Chris@82 401 T3r = VFMAI(T1o, T1n);
Chris@82 402 STM2(&(xo[14]), T3r, ovs, &(xo[2]));
Chris@82 403 STN2(&(xo[12]), T3c, T3r, ovs);
Chris@82 404 T3s = VFNMSI(T1o, T1n);
Chris@82 405 STM2(&(xo[50]), T3s, ovs, &(xo[2]));
Chris@82 406 STN2(&(xo[48]), T38, T3s, ovs);
Chris@82 407 }
Chris@82 408 {
Chris@82 409 V T1f, T1g, T3t, T3u;
Chris@82 410 T1f = VFMA(LDK(KP980785280), T10, Tr);
Chris@82 411 T1g = VFMA(LDK(KP980785280), T1d, T1a);
Chris@82 412 T3t = VFNMSI(T1g, T1f);
Chris@82 413 STM2(&(xo[2]), T3t, ovs, &(xo[2]));
Chris@82 414 STN2(&(xo[0]), T37, T3t, ovs);
Chris@82 415 T3u = VFMAI(T1g, T1f);
Chris@82 416 STM2(&(xo[62]), T3u, ovs, &(xo[2]));
Chris@82 417 STN2(&(xo[60]), T3g, T3u, ovs);
Chris@82 418 }
Chris@82 419 {
Chris@82 420 V T1j, T1m, T3v, T3w;
Chris@82 421 T1j = VFNMS(LDK(KP980785280), T1i, T1h);
Chris@82 422 T1m = VFNMS(LDK(KP980785280), T1l, T1k);
Chris@82 423 T3v = VFNMSI(T1m, T1j);
Chris@82 424 STM2(&(xo[18]), T3v, ovs, &(xo[2]));
Chris@82 425 STN2(&(xo[16]), T36, T3v, ovs);
Chris@82 426 T3w = VFMAI(T1m, T1j);
Chris@82 427 STM2(&(xo[46]), T3w, ovs, &(xo[2]));
Chris@82 428 STN2(&(xo[44]), T3b, T3w, ovs);
Chris@82 429 }
Chris@82 430 }
Chris@82 431 }
Chris@82 432 }
Chris@82 433 }
Chris@82 434 }
Chris@82 435 VLEAVE();
Chris@82 436 }
Chris@82 437
/*
 * Descriptor passed to X(kdft_register) together with n2fv_32.  Its visible
 * fields are 32, the name string "n2fv_32", the counts {88, 0, 98, 0},
 * &GENUS, and 0, 2, 0, 0.  The counts match the "88 additions,
 * 0 multiplications, 98 fused multiply/add" tally in the generated header
 * comment of this variant.
 * NOTE(review): the 32 presumably is the transform size and the 2 the fixed
 * output stride (-with-ostride 2); the kdft_desc layout is not visible
 * here -- confirm in the codelet header.
 */
Chris@82 438 static const kdft_desc desc = { 32, XSIMD_STRING("n2fv_32"), {88, 0, 98, 0}, &GENUS, 0, 2, 0, 0 };
Chris@82 439
/*
 * Registration entry point for the FMA variant: passes planner p, the
 * n2fv_32 kernel and its descriptor to X(kdft_register).
 */
Chris@82 440 void XSIMD(codelet_n2fv_32) (planner *p) {
Chris@82 441 X(kdft_register) (p, n2fv_32, &desc);
Chris@82 442 }
Chris@82 443
Chris@82 444 #else
Chris@82 445
Chris@82 446 /* Generated by: ../../../genfft/gen_notw_c.native -simd -compact -variables 4 -pipeline-latency 8 -n 32 -name n2fv_32 -with-ostride 2 -include dft/simd/n2f.h -store-multiple 2 */
Chris@82 447
Chris@82 448 /*
Chris@82 449 * This function contains 186 FP additions, 42 FP multiplications,
Chris@82 450 * (or, 170 additions, 26 multiplications, 16 fused multiply/add),
Chris@82 451 * 72 stack variables, 7 constants, and 80 memory accesses
Chris@82 452 */
Chris@82 453 #include "dft/simd/n2f.h"
Chris@82 454
/*
 * n2fv_32 (non-FMA variant): machine-generated SIMD codelet.  The generator
 * command line recorded above is "gen_notw_c -simd ... -n 32
 * -with-ostride 2 -store-multiple 2" (no -fma flag).
 * NOTE(review): presumably a size-32 complex DFT kernel without twiddle
 * factors -- confirm against the genfft documentation.
 *
 * What the body visibly does:
 *  - xi starts at ri and xo starts at ro; ii and io are never referenced.
 *  - The loop runs while i > 0, with i starting at v and decreasing by VL;
 *    each step advances xi by VL * ivs and xo by VL * ovs.
 *  - 32 vectors are loaded with LD from xi[WS(is, k)], k = 0..31.
 *  - Results go to the fixed even offsets xo[0], xo[2], ..., xo[62]; os is
 *    only passed to MAKE_VOLATILE_STRIDE and is not used to index xo.
 *  - VLEAVE() is invoked once after the loop.
 *
 * Constants (numerically, from the literals below):
 *  KP195090322 ~ sin(pi/16),   KP980785280 ~ cos(pi/16),
 *  KP555570233 ~ sin(3 pi/16), KP831469612 ~ cos(3 pi/16),
 *  KP382683432 ~ sin(pi/8),    KP923879532 ~ cos(pi/8),
 *  KP707106781 ~ sqrt(1/2).
 *
 * NOTE(review): the semantics of LD, STM2, STN2, VFMA, VFNMS, VMUL and VBYI
 * come from the SIMD headers, which are not visible here.
 */
Chris@82 455 static void n2fv_32(const R *ri, const R *ii, R *ro, R *io, stride is, stride os, INT v, INT ivs, INT ovs)
Chris@82 456 {
Chris@82 457 DVK(KP195090322, +0.195090322016128267848284868477022240927691618);
Chris@82 458 DVK(KP980785280, +0.980785280403230449126182236134239036973933731);
Chris@82 459 DVK(KP555570233, +0.555570233019602224742830813948532874374937191);
Chris@82 460 DVK(KP831469612, +0.831469612302545237078788377617905756738560812);
Chris@82 461 DVK(KP382683432, +0.382683432365089771728459984030398866761344562);
Chris@82 462 DVK(KP923879532, +0.923879532511286756128183189396788286822416626);
Chris@82 463 DVK(KP707106781, +0.707106781186547524400844362104849039284835938);
Chris@82 464 {
Chris@82 465 INT i;
Chris@82 466 const R *xi;
Chris@82 467 R *xo;
Chris@82 468 xi = ri;
Chris@82 469 xo = ro;
Chris@82 470 for (i = v; i > 0; i = i - VL, xi = xi + (VL * ivs), xo = xo + (VL * ovs), MAKE_VOLATILE_STRIDE(64, is), MAKE_VOLATILE_STRIDE(64, os)) {
Chris@82 471 V T1T, T1W, T2K, T2x, T16, T1A, Tb, T1p, TT, T1v, TY, T1w, T27, T2a, T2b;
Chris@82 472 V T2H, T2O, TC, T1s, TH, T1t, T20, T23, T24, T2E, T2N, T2g, T2j, Tq, T1B;
Chris@82 473 V T19, T1q, T2A, T2L;
/* Inputs 0, 16, 8, 24, 4, 20, 28, 12: pairwise sums (VADD) and differences (VSUB), then combinations scaled by KP707106781. */
Chris@82 474 {
Chris@82 475 V T3, T1R, T15, T1S, T6, T1U, T9, T1V, T12, Ta;
Chris@82 476 {
Chris@82 477 V T1, T2, T13, T14;
Chris@82 478 T1 = LD(&(xi[0]), ivs, &(xi[0]));
Chris@82 479 T2 = LD(&(xi[WS(is, 16)]), ivs, &(xi[0]));
Chris@82 480 T3 = VSUB(T1, T2);
Chris@82 481 T1R = VADD(T1, T2);
Chris@82 482 T13 = LD(&(xi[WS(is, 8)]), ivs, &(xi[0]));
Chris@82 483 T14 = LD(&(xi[WS(is, 24)]), ivs, &(xi[0]));
Chris@82 484 T15 = VSUB(T13, T14);
Chris@82 485 T1S = VADD(T13, T14);
Chris@82 486 }
Chris@82 487 {
Chris@82 488 V T4, T5, T7, T8;
Chris@82 489 T4 = LD(&(xi[WS(is, 4)]), ivs, &(xi[0]));
Chris@82 490 T5 = LD(&(xi[WS(is, 20)]), ivs, &(xi[0]));
Chris@82 491 T6 = VSUB(T4, T5);
Chris@82 492 T1U = VADD(T4, T5);
Chris@82 493 T7 = LD(&(xi[WS(is, 28)]), ivs, &(xi[0]));
Chris@82 494 T8 = LD(&(xi[WS(is, 12)]), ivs, &(xi[0]));
Chris@82 495 T9 = VSUB(T7, T8);
Chris@82 496 T1V = VADD(T7, T8);
Chris@82 497 }
Chris@82 498 T1T = VADD(T1R, T1S);
Chris@82 499 T1W = VADD(T1U, T1V);
Chris@82 500 T2K = VSUB(T1V, T1U);
Chris@82 501 T2x = VSUB(T1R, T1S);
Chris@82 502 T12 = VMUL(LDK(KP707106781), VSUB(T9, T6));
Chris@82 503 T16 = VSUB(T12, T15);
Chris@82 504 T1A = VADD(T15, T12);
Chris@82 505 Ta = VMUL(LDK(KP707106781), VADD(T6, T9));
Chris@82 506 Tb = VADD(T3, Ta);
Chris@82 507 T1p = VSUB(T3, Ta);
Chris@82 508 }
/* Inputs 31, 15, 7, 23, 3, 19, 27, 11. */
Chris@82 509 {
Chris@82 510 V TL, T25, TX, T26, TO, T28, TR, T29;
Chris@82 511 {
Chris@82 512 V TJ, TK, TV, TW;
Chris@82 513 TJ = LD(&(xi[WS(is, 31)]), ivs, &(xi[WS(is, 1)]));
Chris@82 514 TK = LD(&(xi[WS(is, 15)]), ivs, &(xi[WS(is, 1)]));
Chris@82 515 TL = VSUB(TJ, TK);
Chris@82 516 T25 = VADD(TJ, TK);
Chris@82 517 TV = LD(&(xi[WS(is, 7)]), ivs, &(xi[WS(is, 1)]));
Chris@82 518 TW = LD(&(xi[WS(is, 23)]), ivs, &(xi[WS(is, 1)]));
Chris@82 519 TX = VSUB(TV, TW);
Chris@82 520 T26 = VADD(TV, TW);
Chris@82 521 }
Chris@82 522 {
Chris@82 523 V TM, TN, TP, TQ;
Chris@82 524 TM = LD(&(xi[WS(is, 3)]), ivs, &(xi[WS(is, 1)]));
Chris@82 525 TN = LD(&(xi[WS(is, 19)]), ivs, &(xi[WS(is, 1)]));
Chris@82 526 TO = VSUB(TM, TN);
Chris@82 527 T28 = VADD(TM, TN);
Chris@82 528 TP = LD(&(xi[WS(is, 27)]), ivs, &(xi[WS(is, 1)]));
Chris@82 529 TQ = LD(&(xi[WS(is, 11)]), ivs, &(xi[WS(is, 1)]));
Chris@82 530 TR = VSUB(TP, TQ);
Chris@82 531 T29 = VADD(TP, TQ);
Chris@82 532 }
Chris@82 533 {
Chris@82 534 V TS, TU, T2F, T2G;
Chris@82 535 TS = VMUL(LDK(KP707106781), VADD(TO, TR));
Chris@82 536 TT = VADD(TL, TS);
Chris@82 537 T1v = VSUB(TL, TS);
Chris@82 538 TU = VMUL(LDK(KP707106781), VSUB(TR, TO));
Chris@82 539 TY = VSUB(TU, TX);
Chris@82 540 T1w = VADD(TX, TU);
Chris@82 541 T27 = VADD(T25, T26);
Chris@82 542 T2a = VADD(T28, T29);
Chris@82 543 T2b = VSUB(T27, T2a);
Chris@82 544 T2F = VSUB(T25, T26);
Chris@82 545 T2G = VSUB(T29, T28);
Chris@82 546 T2H = VFNMS(LDK(KP382683432), T2G, VMUL(LDK(KP923879532), T2F));
Chris@82 547 T2O = VFMA(LDK(KP382683432), T2F, VMUL(LDK(KP923879532), T2G));
Chris@82 548 }
Chris@82 549 }
/* Inputs 1, 17, 9, 25, 5, 21, 29, 13. */
Chris@82 550 {
Chris@82 551 V Tu, T1Y, TG, T1Z, Tx, T21, TA, T22;
Chris@82 552 {
Chris@82 553 V Ts, Tt, TE, TF;
Chris@82 554 Ts = LD(&(xi[WS(is, 1)]), ivs, &(xi[WS(is, 1)]));
Chris@82 555 Tt = LD(&(xi[WS(is, 17)]), ivs, &(xi[WS(is, 1)]));
Chris@82 556 Tu = VSUB(Ts, Tt);
Chris@82 557 T1Y = VADD(Ts, Tt);
Chris@82 558 TE = LD(&(xi[WS(is, 9)]), ivs, &(xi[WS(is, 1)]));
Chris@82 559 TF = LD(&(xi[WS(is, 25)]), ivs, &(xi[WS(is, 1)]));
Chris@82 560 TG = VSUB(TE, TF);
Chris@82 561 T1Z = VADD(TE, TF);
Chris@82 562 }
Chris@82 563 {
Chris@82 564 V Tv, Tw, Ty, Tz;
Chris@82 565 Tv = LD(&(xi[WS(is, 5)]), ivs, &(xi[WS(is, 1)]));
Chris@82 566 Tw = LD(&(xi[WS(is, 21)]), ivs, &(xi[WS(is, 1)]));
Chris@82 567 Tx = VSUB(Tv, Tw);
Chris@82 568 T21 = VADD(Tv, Tw);
Chris@82 569 Ty = LD(&(xi[WS(is, 29)]), ivs, &(xi[WS(is, 1)]));
Chris@82 570 Tz = LD(&(xi[WS(is, 13)]), ivs, &(xi[WS(is, 1)]));
Chris@82 571 TA = VSUB(Ty, Tz);
Chris@82 572 T22 = VADD(Ty, Tz);
Chris@82 573 }
Chris@82 574 {
Chris@82 575 V TB, TD, T2C, T2D;
Chris@82 576 TB = VMUL(LDK(KP707106781), VADD(Tx, TA));
Chris@82 577 TC = VADD(Tu, TB);
Chris@82 578 T1s = VSUB(Tu, TB);
Chris@82 579 TD = VMUL(LDK(KP707106781), VSUB(TA, Tx));
Chris@82 580 TH = VSUB(TD, TG);
Chris@82 581 T1t = VADD(TG, TD);
Chris@82 582 T20 = VADD(T1Y, T1Z);
Chris@82 583 T23 = VADD(T21, T22);
Chris@82 584 T24 = VSUB(T20, T23);
Chris@82 585 T2C = VSUB(T1Y, T1Z);
Chris@82 586 T2D = VSUB(T22, T21);
Chris@82 587 T2E = VFMA(LDK(KP923879532), T2C, VMUL(LDK(KP382683432), T2D));
Chris@82 588 T2N = VFNMS(LDK(KP382683432), T2C, VMUL(LDK(KP923879532), T2D));
Chris@82 589 }
Chris@82 590 }
/* Inputs 2, 18, 6, 22, 10, 26, 30, 14. */
Chris@82 591 {
Chris@82 592 V Te, T2h, To, T2f, Th, T2i, Tl, T2e, Ti, Tp;
Chris@82 593 {
Chris@82 594 V Tc, Td, Tm, Tn;
Chris@82 595 Tc = LD(&(xi[WS(is, 2)]), ivs, &(xi[0]));
Chris@82 596 Td = LD(&(xi[WS(is, 18)]), ivs, &(xi[0]));
Chris@82 597 Te = VSUB(Tc, Td);
Chris@82 598 T2h = VADD(Tc, Td);
Chris@82 599 Tm = LD(&(xi[WS(is, 6)]), ivs, &(xi[0]));
Chris@82 600 Tn = LD(&(xi[WS(is, 22)]), ivs, &(xi[0]));
Chris@82 601 To = VSUB(Tm, Tn);
Chris@82 602 T2f = VADD(Tm, Tn);
Chris@82 603 }
Chris@82 604 {
Chris@82 605 V Tf, Tg, Tj, Tk;
Chris@82 606 Tf = LD(&(xi[WS(is, 10)]), ivs, &(xi[0]));
Chris@82 607 Tg = LD(&(xi[WS(is, 26)]), ivs, &(xi[0]));
Chris@82 608 Th = VSUB(Tf, Tg);
Chris@82 609 T2i = VADD(Tf, Tg);
Chris@82 610 Tj = LD(&(xi[WS(is, 30)]), ivs, &(xi[0]));
Chris@82 611 Tk = LD(&(xi[WS(is, 14)]), ivs, &(xi[0]));
Chris@82 612 Tl = VSUB(Tj, Tk);
Chris@82 613 T2e = VADD(Tj, Tk);
Chris@82 614 }
Chris@82 615 T2g = VADD(T2e, T2f);
Chris@82 616 T2j = VADD(T2h, T2i);
Chris@82 617 Ti = VFNMS(LDK(KP382683432), Th, VMUL(LDK(KP923879532), Te));
Chris@82 618 Tp = VFMA(LDK(KP923879532), Tl, VMUL(LDK(KP382683432), To));
Chris@82 619 Tq = VADD(Ti, Tp);
Chris@82 620 T1B = VSUB(Tp, Ti);
Chris@82 621 {
Chris@82 622 V T17, T18, T2y, T2z;
Chris@82 623 T17 = VFNMS(LDK(KP923879532), To, VMUL(LDK(KP382683432), Tl));
Chris@82 624 T18 = VFMA(LDK(KP382683432), Te, VMUL(LDK(KP923879532), Th));
Chris@82 625 T19 = VSUB(T17, T18);
Chris@82 626 T1q = VADD(T18, T17);
Chris@82 627 T2y = VSUB(T2h, T2i);
Chris@82 628 T2z = VSUB(T2e, T2f);
Chris@82 629 T2A = VMUL(LDK(KP707106781), VADD(T2y, T2z));
Chris@82 630 T2L = VMUL(LDK(KP707106781), VSUB(T2z, T2y));
Chris@82 631 }
Chris@82 632 }
/*
 * Output stage.  No further loads happen from here on.  T31..T3c (and
 * T3d..T3g below) are each stored with STM2 at an offset xo[4j]; the value
 * for xo[4j + 2] is computed later, stored with STM2, and then both values
 * are passed together to STN2(&(xo[4j]), ...).
 */
Chris@82 633 {
Chris@82 634 V T31, T32, T33, T34, T35, T36, T37, T38, T39, T3a, T3b, T3c;
Chris@82 635 {
Chris@82 636 V T2d, T2n, T2m, T2o;
Chris@82 637 {
Chris@82 638 V T1X, T2c, T2k, T2l;
Chris@82 639 T1X = VSUB(T1T, T1W);
Chris@82 640 T2c = VMUL(LDK(KP707106781), VADD(T24, T2b));
Chris@82 641 T2d = VADD(T1X, T2c);
Chris@82 642 T2n = VSUB(T1X, T2c);
Chris@82 643 T2k = VSUB(T2g, T2j);
Chris@82 644 T2l = VMUL(LDK(KP707106781), VSUB(T2b, T24));
Chris@82 645 T2m = VBYI(VADD(T2k, T2l));
Chris@82 646 T2o = VBYI(VSUB(T2l, T2k));
Chris@82 647 }
Chris@82 648 T31 = VSUB(T2d, T2m);
Chris@82 649 STM2(&(xo[56]), T31, ovs, &(xo[0]));
Chris@82 650 T32 = VADD(T2n, T2o);
Chris@82 651 STM2(&(xo[24]), T32, ovs, &(xo[0]));
Chris@82 652 T33 = VADD(T2d, T2m);
Chris@82 653 STM2(&(xo[8]), T33, ovs, &(xo[0]));
Chris@82 654 T34 = VSUB(T2n, T2o);
Chris@82 655 STM2(&(xo[40]), T34, ovs, &(xo[0]));
Chris@82 656 }
Chris@82 657 {
Chris@82 658 V T2r, T2v, T2u, T2w;
Chris@82 659 {
Chris@82 660 V T2p, T2q, T2s, T2t;
Chris@82 661 T2p = VADD(T1T, T1W);
Chris@82 662 T2q = VADD(T2j, T2g);
Chris@82 663 T2r = VADD(T2p, T2q);
Chris@82 664 T2v = VSUB(T2p, T2q);
Chris@82 665 T2s = VADD(T20, T23);
Chris@82 666 T2t = VADD(T27, T2a);
Chris@82 667 T2u = VADD(T2s, T2t);
Chris@82 668 T2w = VBYI(VSUB(T2t, T2s));
Chris@82 669 }
Chris@82 670 T35 = VSUB(T2r, T2u);
Chris@82 671 STM2(&(xo[32]), T35, ovs, &(xo[0]));
Chris@82 672 T36 = VADD(T2v, T2w);
Chris@82 673 STM2(&(xo[16]), T36, ovs, &(xo[0]));
Chris@82 674 T37 = VADD(T2r, T2u);
Chris@82 675 STM2(&(xo[0]), T37, ovs, &(xo[0]));
Chris@82 676 T38 = VSUB(T2v, T2w);
Chris@82 677 STM2(&(xo[48]), T38, ovs, &(xo[0]));
Chris@82 678 }
Chris@82 679 {
Chris@82 680 V T2V, T2Z, T2Y, T30;
Chris@82 681 {
Chris@82 682 V T2T, T2U, T2W, T2X;
Chris@82 683 T2T = VSUB(T2H, T2E);
Chris@82 684 T2U = VSUB(T2L, T2K);
Chris@82 685 T2V = VBYI(VSUB(T2T, T2U));
Chris@82 686 T2Z = VBYI(VADD(T2U, T2T));
Chris@82 687 T2W = VSUB(T2x, T2A);
Chris@82 688 T2X = VSUB(T2O, T2N);
Chris@82 689 T2Y = VSUB(T2W, T2X);
Chris@82 690 T30 = VADD(T2W, T2X);
Chris@82 691 }
Chris@82 692 T39 = VADD(T2V, T2Y);
Chris@82 693 STM2(&(xo[20]), T39, ovs, &(xo[0]));
Chris@82 694 T3a = VSUB(T30, T2Z);
Chris@82 695 STM2(&(xo[52]), T3a, ovs, &(xo[0]));
Chris@82 696 T3b = VSUB(T2Y, T2V);
Chris@82 697 STM2(&(xo[44]), T3b, ovs, &(xo[0]));
Chris@82 698 T3c = VADD(T2Z, T30);
Chris@82 699 STM2(&(xo[12]), T3c, ovs, &(xo[0]));
Chris@82 700 }
Chris@82 701 {
Chris@82 702 V T3d, T3e, T3f, T3g;
Chris@82 703 {
Chris@82 704 V T2J, T2R, T2Q, T2S;
Chris@82 705 {
Chris@82 706 V T2B, T2I, T2M, T2P;
Chris@82 707 T2B = VADD(T2x, T2A);
Chris@82 708 T2I = VADD(T2E, T2H);
Chris@82 709 T2J = VADD(T2B, T2I);
Chris@82 710 T2R = VSUB(T2B, T2I);
Chris@82 711 T2M = VADD(T2K, T2L);
Chris@82 712 T2P = VADD(T2N, T2O);
Chris@82 713 T2Q = VBYI(VADD(T2M, T2P));
Chris@82 714 T2S = VBYI(VSUB(T2P, T2M));
Chris@82 715 }
Chris@82 716 T3d = VSUB(T2J, T2Q);
Chris@82 717 STM2(&(xo[60]), T3d, ovs, &(xo[0]));
Chris@82 718 T3e = VADD(T2R, T2S);
Chris@82 719 STM2(&(xo[28]), T3e, ovs, &(xo[0]));
Chris@82 720 T3f = VADD(T2J, T2Q);
Chris@82 721 STM2(&(xo[4]), T3f, ovs, &(xo[0]));
Chris@82 722 T3g = VSUB(T2R, T2S);
Chris@82 723 STM2(&(xo[36]), T3g, ovs, &(xo[0]));
Chris@82 724 }
/* Outputs built from T1p..T1w with KP555570233 and KP831469612; each pair of stores completes one STN2 group. */
Chris@82 725 {
Chris@82 726 V T1r, T1C, T1M, T1K, T1F, T1N, T1y, T1J;
Chris@82 727 T1r = VADD(T1p, T1q);
Chris@82 728 T1C = VADD(T1A, T1B);
Chris@82 729 T1M = VSUB(T1p, T1q);
Chris@82 730 T1K = VSUB(T1B, T1A);
Chris@82 731 {
Chris@82 732 V T1D, T1E, T1u, T1x;
Chris@82 733 T1D = VFNMS(LDK(KP555570233), T1s, VMUL(LDK(KP831469612), T1t));
Chris@82 734 T1E = VFMA(LDK(KP555570233), T1v, VMUL(LDK(KP831469612), T1w));
Chris@82 735 T1F = VADD(T1D, T1E);
Chris@82 736 T1N = VSUB(T1E, T1D);
Chris@82 737 T1u = VFMA(LDK(KP831469612), T1s, VMUL(LDK(KP555570233), T1t));
Chris@82 738 T1x = VFNMS(LDK(KP555570233), T1w, VMUL(LDK(KP831469612), T1v));
Chris@82 739 T1y = VADD(T1u, T1x);
Chris@82 740 T1J = VSUB(T1x, T1u);
Chris@82 741 }
Chris@82 742 {
Chris@82 743 V T1z, T1G, T3h, T3i;
Chris@82 744 T1z = VADD(T1r, T1y);
Chris@82 745 T1G = VBYI(VADD(T1C, T1F));
Chris@82 746 T3h = VSUB(T1z, T1G);
Chris@82 747 STM2(&(xo[58]), T3h, ovs, &(xo[2]));
Chris@82 748 STN2(&(xo[56]), T31, T3h, ovs);
Chris@82 749 T3i = VADD(T1z, T1G);
Chris@82 750 STM2(&(xo[6]), T3i, ovs, &(xo[2]));
Chris@82 751 STN2(&(xo[4]), T3f, T3i, ovs);
Chris@82 752 }
Chris@82 753 {
Chris@82 754 V T1P, T1Q, T3j, T3k;
Chris@82 755 T1P = VBYI(VADD(T1K, T1J));
Chris@82 756 T1Q = VADD(T1M, T1N);
Chris@82 757 T3j = VADD(T1P, T1Q);
Chris@82 758 STM2(&(xo[10]), T3j, ovs, &(xo[2]));
Chris@82 759 STN2(&(xo[8]), T33, T3j, ovs);
Chris@82 760 T3k = VSUB(T1Q, T1P);
Chris@82 761 STM2(&(xo[54]), T3k, ovs, &(xo[2]));
Chris@82 762 STN2(&(xo[52]), T3a, T3k, ovs);
Chris@82 763 }
Chris@82 764 {
Chris@82 765 V T1H, T1I, T3l, T3m;
Chris@82 766 T1H = VSUB(T1r, T1y);
Chris@82 767 T1I = VBYI(VSUB(T1F, T1C));
Chris@82 768 T3l = VSUB(T1H, T1I);
Chris@82 769 STM2(&(xo[38]), T3l, ovs, &(xo[2]));
Chris@82 770 STN2(&(xo[36]), T3g, T3l, ovs);
Chris@82 771 T3m = VADD(T1H, T1I);
Chris@82 772 STM2(&(xo[26]), T3m, ovs, &(xo[2]));
Chris@82 773 STN2(&(xo[24]), T32, T3m, ovs);
Chris@82 774 }
Chris@82 775 {
Chris@82 776 V T1L, T1O, T3n, T3o;
Chris@82 777 T1L = VBYI(VSUB(T1J, T1K));
Chris@82 778 T1O = VSUB(T1M, T1N);
Chris@82 779 T3n = VADD(T1L, T1O);
Chris@82 780 STM2(&(xo[22]), T3n, ovs, &(xo[2]));
Chris@82 781 STN2(&(xo[20]), T39, T3n, ovs);
Chris@82 782 T3o = VSUB(T1O, T1L);
Chris@82 783 STM2(&(xo[42]), T3o, ovs, &(xo[2]));
Chris@82 784 STN2(&(xo[40]), T34, T3o, ovs);
Chris@82 785 }
Chris@82 786 }
/* Outputs built from Tb, Tq, T16, T19, TC, TH, TT, TY with KP195090322 and KP980785280. */
Chris@82 787 {
Chris@82 788 V Tr, T1a, T1k, T1i, T1d, T1l, T10, T1h;
Chris@82 789 Tr = VADD(Tb, Tq);
Chris@82 790 T1a = VADD(T16, T19);
Chris@82 791 T1k = VSUB(Tb, Tq);
Chris@82 792 T1i = VSUB(T19, T16);
Chris@82 793 {
Chris@82 794 V T1b, T1c, TI, TZ;
Chris@82 795 T1b = VFNMS(LDK(KP195090322), TC, VMUL(LDK(KP980785280), TH));
Chris@82 796 T1c = VFMA(LDK(KP195090322), TT, VMUL(LDK(KP980785280), TY));
Chris@82 797 T1d = VADD(T1b, T1c);
Chris@82 798 T1l = VSUB(T1c, T1b);
Chris@82 799 TI = VFMA(LDK(KP980785280), TC, VMUL(LDK(KP195090322), TH));
Chris@82 800 TZ = VFNMS(LDK(KP195090322), TY, VMUL(LDK(KP980785280), TT));
Chris@82 801 T10 = VADD(TI, TZ);
Chris@82 802 T1h = VSUB(TZ, TI);
Chris@82 803 }
Chris@82 804 {
Chris@82 805 V T11, T1e, T3p, T3q;
Chris@82 806 T11 = VADD(Tr, T10);
Chris@82 807 T1e = VBYI(VADD(T1a, T1d));
Chris@82 808 T3p = VSUB(T11, T1e);
Chris@82 809 STM2(&(xo[62]), T3p, ovs, &(xo[2]));
Chris@82 810 STN2(&(xo[60]), T3d, T3p, ovs);
Chris@82 811 T3q = VADD(T11, T1e);
Chris@82 812 STM2(&(xo[2]), T3q, ovs, &(xo[2]));
Chris@82 813 STN2(&(xo[0]), T37, T3q, ovs);
Chris@82 814 }
Chris@82 815 {
Chris@82 816 V T1n, T1o, T3r, T3s;
Chris@82 817 T1n = VBYI(VADD(T1i, T1h));
Chris@82 818 T1o = VADD(T1k, T1l);
Chris@82 819 T3r = VADD(T1n, T1o);
Chris@82 820 STM2(&(xo[14]), T3r, ovs, &(xo[2]));
Chris@82 821 STN2(&(xo[12]), T3c, T3r, ovs);
Chris@82 822 T3s = VSUB(T1o, T1n);
Chris@82 823 STM2(&(xo[50]), T3s, ovs, &(xo[2]));
Chris@82 824 STN2(&(xo[48]), T38, T3s, ovs);
Chris@82 825 }
Chris@82 826 {
Chris@82 827 V T1f, T1g, T3t, T3u;
Chris@82 828 T1f = VSUB(Tr, T10);
Chris@82 829 T1g = VBYI(VSUB(T1d, T1a));
Chris@82 830 T3t = VSUB(T1f, T1g);
Chris@82 831 STM2(&(xo[34]), T3t, ovs, &(xo[2]));
Chris@82 832 STN2(&(xo[32]), T35, T3t, ovs);
Chris@82 833 T3u = VADD(T1f, T1g);
Chris@82 834 STM2(&(xo[30]), T3u, ovs, &(xo[2]));
Chris@82 835 STN2(&(xo[28]), T3e, T3u, ovs);
Chris@82 836 }
Chris@82 837 {
Chris@82 838 V T1j, T1m, T3v, T3w;
Chris@82 839 T1j = VBYI(VSUB(T1h, T1i));
Chris@82 840 T1m = VSUB(T1k, T1l);
Chris@82 841 T3v = VADD(T1j, T1m);
Chris@82 842 STM2(&(xo[18]), T3v, ovs, &(xo[2]));
Chris@82 843 STN2(&(xo[16]), T36, T3v, ovs);
Chris@82 844 T3w = VSUB(T1m, T1j);
Chris@82 845 STM2(&(xo[46]), T3w, ovs, &(xo[2]));
Chris@82 846 STN2(&(xo[44]), T3b, T3w, ovs);
Chris@82 847 }
Chris@82 848 }
Chris@82 849 }
Chris@82 850 }
Chris@82 851 }
Chris@82 852 }
Chris@82 853 VLEAVE();
Chris@82 854 }
Chris@82 855
/*
 * Descriptor passed to X(kdft_register) together with n2fv_32.  Its visible
 * fields are 32, the name string "n2fv_32", the counts {170, 26, 16, 0},
 * &GENUS, and 0, 2, 0, 0.  The counts match the "170 additions,
 * 26 multiplications, 16 fused multiply/add" tally in the generated header
 * comment of this variant.
 * NOTE(review): the 32 presumably is the transform size and the 2 the fixed
 * output stride (-with-ostride 2); the kdft_desc layout is not visible
 * here -- confirm in the codelet header.
 */
Chris@82 856 static const kdft_desc desc = { 32, XSIMD_STRING("n2fv_32"), {170, 26, 16, 0}, &GENUS, 0, 2, 0, 0 };
Chris@82 857
/*
 * Registration entry point for the non-FMA variant: passes planner p, the
 * n2fv_32 kernel and its descriptor to X(kdft_register).
 */
Chris@82 858 void XSIMD(codelet_n2fv_32) (planner *p) {
Chris@82 859 X(kdft_register) (p, n2fv_32, &desc);
Chris@82 860 }
Chris@82 861
Chris@82 862 #endif