annotate src/fftw-3.3.8/dft/simd/common/t2sv_16.c @ 83:ae30d91d2ffe

Replace these with versions built using an older toolset (so as to avoid ABI incompatibilities when linking on Ubuntu 14.04 for packaging purposes)
author Chris Cannam
date Fri, 07 Feb 2020 11:51:13 +0000
parents d0c2a83c1364
children
/*
 * Copyright (c) 2003, 2007-14 Matteo Frigo
 * Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 *
 */

/* This file was automatically generated --- DO NOT EDIT */
/* Generated on Thu May 24 08:06:11 EDT 2018 */

#include "dft/codelet-dft.h"

#if defined(ARCH_PREFERS_FMA) || defined(ISA_EXTENSION_PREFERS_FMA)
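/* Two variants follow: this branch is tuned for machines whose ISA
   prefers fused multiply-add instructions; the #else branch further
   down expresses the same transform with separate adds and multiplies. */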

/* Generated by: ../../../genfft/gen_twiddle.native -fma -simd -compact -variables 4 -pipeline-latency 8 -twiddle-log3 -precompute-twiddles -n 16 -name t2sv_16 -include dft/simd/ts.h */

/*
 * This function contains 196 FP additions, 134 FP multiplications,
 * (or, 104 additions, 42 multiplications, 92 fused multiply/add),
 * 90 stack variables, 3 constants, and 64 memory accesses
 */
#include "dft/simd/ts.h"

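/* Parameter roles, as inferred from FFTW's usual codelet conventions:
   ri/ii point to the real and imaginary data, W to the precomputed
   twiddle factors, rs is the stride between samples within one
   transform, mb/me bound the range of transforms handled by this call,
   and ms is the stride between successive transforms. */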
static void t2sv_16(R *ri, R *ii, const R *W, stride rs, INT mb, INT me, INT ms)
{
     DVK(KP923879532, +0.923879532511286756128183189396788286822416626);
     DVK(KP414213562, +0.414213562373095048801688724209698078569671875);
     DVK(KP707106781, +0.707106781186547524400844362104849039284835938);
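     /* These constants are cos(pi/8), tan(pi/8) = sqrt(2) - 1, and
        1/sqrt(2) -- the trigonometric values a radix-16 butterfly needs. */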
     {
          INT m;
          for (m = mb, W = W + (mb * 8); m < me; m = m + (2 * VL), ri = ri + ((2 * VL) * ms), ii = ii + ((2 * VL) * ms), W = W + ((2 * VL) * 8), MAKE_VOLATILE_STRIDE(32, rs)) {
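               /* Each iteration appears to process 2*VL transforms in
                  parallel (VL being the SIMD vector length), advancing the
                  data, twiddle, and loop index together; MAKE_VOLATILE_STRIDE
                  is an FFTW macro that discourages harmful compiler
                  optimizations of the stride values. */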
               V T2, Tf, TM, TO, T3, T6, T5, Th, Tz, Ti, T7, TZ, TT, Tq, TW;
               V Tb, Tu, TP, TI, TF, TC, T1z, T1O, T1D, T1L, Tm, T1f, T1p, T1j, T1m;
               {
                    V TN, TS, T4, Tp, Ta, Tt, Tl, Tg;
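                    /* Load the stored twiddles w^1, w^3, w^9, w^15 (see
                       twinstr below) and combine them with FMAs to
                       synthesize the remaining powers (w^2, w^4, ...), per
                       the -twiddle-log3 -precompute-twiddles genfft flags. */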
                    T2 = LDW(&(W[0]));
                    Tf = LDW(&(W[TWVL * 2]));
                    Tg = VMUL(T2, Tf);
                    TM = LDW(&(W[TWVL * 6]));
                    TN = VMUL(T2, TM);
                    TO = LDW(&(W[TWVL * 7]));
                    TS = VMUL(T2, TO);
                    T3 = LDW(&(W[TWVL * 4]));
                    T4 = VMUL(T2, T3);
                    Tp = VMUL(Tf, T3);
                    T6 = LDW(&(W[TWVL * 5]));
                    Ta = VMUL(T2, T6);
                    Tt = VMUL(Tf, T6);
                    T5 = LDW(&(W[TWVL * 1]));
                    Th = LDW(&(W[TWVL * 3]));
                    Tl = VMUL(T2, Th);
                    Tz = VFMA(T5, Th, Tg);
                    Ti = VFNMS(T5, Th, Tg);
                    T7 = VFMA(T5, T6, T4);
                    TZ = VFNMS(Th, T3, Tt);
                    TT = VFNMS(T5, TM, TS);
                    Tq = VFNMS(Th, T6, Tp);
                    TW = VFMA(Th, T6, Tp);
                    Tb = VFNMS(T5, T3, Ta);
                    Tu = VFMA(Th, T3, Tt);
                    TP = VFMA(T5, TO, TN);
                    TI = VFMA(T5, T3, Ta);
                    TF = VFNMS(T5, T6, T4);
                    {
                         V T1y, T1C, T1e, T1i;
                         T1y = VMUL(Tz, T3);
                         T1C = VMUL(Tz, T6);
                         TC = VFNMS(T5, Tf, Tl);
                         T1z = VFMA(TC, T6, T1y);
                         T1O = VFMA(TC, T3, T1C);
                         T1D = VFNMS(TC, T3, T1C);
                         T1L = VFNMS(TC, T6, T1y);
                         T1e = VMUL(Ti, T3);
                         T1i = VMUL(Ti, T6);
                         Tm = VFMA(T5, Tf, Tl);
                         T1f = VFMA(Tm, T6, T1e);
                         T1p = VFMA(Tm, T3, T1i);
                         T1j = VFNMS(Tm, T3, T1i);
                         T1m = VFNMS(Tm, T6, T1e);
                    }
               }
               {
                    V Te, T1U, T3A, T3L, T1G, T2D, T2A, T3h, T1R, T2B, T2I, T3i, Tx, T3M, T1Z;
                    V T3w, TL, T26, T25, T37, T1d, T2o, T2l, T3c, T1s, T2m, T2t, T3d, T12, T28;
                    V T2d, T38;
                    {
                         V T1, T3z, T8, T9, Tc, T3x, Td, T3y;
                         T1 = LD(&(ri[0]), ms, &(ri[0]));
                         T3z = LD(&(ii[0]), ms, &(ii[0]));
                         T8 = LD(&(ri[WS(rs, 8)]), ms, &(ri[0]));
                         T9 = VMUL(T7, T8);
                         Tc = LD(&(ii[WS(rs, 8)]), ms, &(ii[0]));
                         T3x = VMUL(T7, Tc);
                         Td = VFMA(Tb, Tc, T9);
                         Te = VADD(T1, Td);
                         T1U = VSUB(T1, Td);
                         T3y = VFNMS(Tb, T8, T3x);
                         T3A = VADD(T3y, T3z);
                         T3L = VSUB(T3z, T3y);
                    }
                    {
                         V T1u, T1v, T1w, T2w, T1A, T1B, T1E, T2y;
                         T1u = LD(&(ri[WS(rs, 15)]), ms, &(ri[WS(rs, 1)]));
                         T1v = VMUL(TM, T1u);
                         T1w = LD(&(ii[WS(rs, 15)]), ms, &(ii[WS(rs, 1)]));
                         T2w = VMUL(TM, T1w);
                         T1A = LD(&(ri[WS(rs, 7)]), ms, &(ri[WS(rs, 1)]));
                         T1B = VMUL(T1z, T1A);
                         T1E = LD(&(ii[WS(rs, 7)]), ms, &(ii[WS(rs, 1)]));
                         T2y = VMUL(T1z, T1E);
                         {
                              V T1x, T1F, T2x, T2z;
                              T1x = VFMA(TO, T1w, T1v);
                              T1F = VFMA(T1D, T1E, T1B);
                              T1G = VADD(T1x, T1F);
                              T2D = VSUB(T1x, T1F);
                              T2x = VFNMS(TO, T1u, T2w);
                              T2z = VFNMS(T1D, T1A, T2y);
                              T2A = VSUB(T2x, T2z);
                              T3h = VADD(T2x, T2z);
                         }
                    }
                    {
                         V T1H, T1I, T1J, T2E, T1M, T1N, T1P, T2G;
                         T1H = LD(&(ri[WS(rs, 3)]), ms, &(ri[WS(rs, 1)]));
                         T1I = VMUL(Tf, T1H);
                         T1J = LD(&(ii[WS(rs, 3)]), ms, &(ii[WS(rs, 1)]));
                         T2E = VMUL(Tf, T1J);
                         T1M = LD(&(ri[WS(rs, 11)]), ms, &(ri[WS(rs, 1)]));
                         T1N = VMUL(T1L, T1M);
                         T1P = LD(&(ii[WS(rs, 11)]), ms, &(ii[WS(rs, 1)]));
                         T2G = VMUL(T1L, T1P);
                         {
                              V T1K, T1Q, T2F, T2H;
                              T1K = VFMA(Th, T1J, T1I);
                              T1Q = VFMA(T1O, T1P, T1N);
                              T1R = VADD(T1K, T1Q);
                              T2B = VSUB(T1K, T1Q);
                              T2F = VFNMS(Th, T1H, T2E);
                              T2H = VFNMS(T1O, T1M, T2G);
                              T2I = VSUB(T2F, T2H);
                              T3i = VADD(T2F, T2H);
                         }
                    }
                    {
                         V Tj, Tk, Tn, T1V, Tr, Ts, Tv, T1X;
                         Tj = LD(&(ri[WS(rs, 4)]), ms, &(ri[0]));
                         Tk = VMUL(Ti, Tj);
                         Tn = LD(&(ii[WS(rs, 4)]), ms, &(ii[0]));
                         T1V = VMUL(Ti, Tn);
                         Tr = LD(&(ri[WS(rs, 12)]), ms, &(ri[0]));
                         Ts = VMUL(Tq, Tr);
                         Tv = LD(&(ii[WS(rs, 12)]), ms, &(ii[0]));
                         T1X = VMUL(Tq, Tv);
                         {
                              V To, Tw, T1W, T1Y;
                              To = VFMA(Tm, Tn, Tk);
                              Tw = VFMA(Tu, Tv, Ts);
                              Tx = VADD(To, Tw);
                              T3M = VSUB(To, Tw);
                              T1W = VFNMS(Tm, Tj, T1V);
                              T1Y = VFNMS(Tu, Tr, T1X);
                              T1Z = VSUB(T1W, T1Y);
                              T3w = VADD(T1W, T1Y);
                         }
                    }
                    {
                         V TA, TB, TD, T21, TG, TH, TJ, T23;
                         TA = LD(&(ri[WS(rs, 2)]), ms, &(ri[0]));
                         TB = VMUL(Tz, TA);
                         TD = LD(&(ii[WS(rs, 2)]), ms, &(ii[0]));
                         T21 = VMUL(Tz, TD);
                         TG = LD(&(ri[WS(rs, 10)]), ms, &(ri[0]));
                         TH = VMUL(TF, TG);
                         TJ = LD(&(ii[WS(rs, 10)]), ms, &(ii[0]));
                         T23 = VMUL(TF, TJ);
                         {
                              V TE, TK, T22, T24;
                              TE = VFMA(TC, TD, TB);
                              TK = VFMA(TI, TJ, TH);
                              TL = VADD(TE, TK);
                              T26 = VSUB(TE, TK);
                              T22 = VFNMS(TC, TA, T21);
                              T24 = VFNMS(TI, TG, T23);
                              T25 = VSUB(T22, T24);
                              T37 = VADD(T22, T24);
                         }
                    }
                    {
                         V T15, T16, T17, T2h, T19, T1a, T1b, T2j;
                         T15 = LD(&(ri[WS(rs, 1)]), ms, &(ri[WS(rs, 1)]));
                         T16 = VMUL(T2, T15);
                         T17 = LD(&(ii[WS(rs, 1)]), ms, &(ii[WS(rs, 1)]));
                         T2h = VMUL(T2, T17);
                         T19 = LD(&(ri[WS(rs, 9)]), ms, &(ri[WS(rs, 1)]));
                         T1a = VMUL(T3, T19);
                         T1b = LD(&(ii[WS(rs, 9)]), ms, &(ii[WS(rs, 1)]));
                         T2j = VMUL(T3, T1b);
                         {
                              V T18, T1c, T2i, T2k;
                              T18 = VFMA(T5, T17, T16);
                              T1c = VFMA(T6, T1b, T1a);
                              T1d = VADD(T18, T1c);
                              T2o = VSUB(T18, T1c);
                              T2i = VFNMS(T5, T15, T2h);
                              T2k = VFNMS(T6, T19, T2j);
                              T2l = VSUB(T2i, T2k);
                              T3c = VADD(T2i, T2k);
                         }
                    }
                    {
                         V T1g, T1h, T1k, T2p, T1n, T1o, T1q, T2r;
                         T1g = LD(&(ri[WS(rs, 5)]), ms, &(ri[WS(rs, 1)]));
                         T1h = VMUL(T1f, T1g);
                         T1k = LD(&(ii[WS(rs, 5)]), ms, &(ii[WS(rs, 1)]));
                         T2p = VMUL(T1f, T1k);
                         T1n = LD(&(ri[WS(rs, 13)]), ms, &(ri[WS(rs, 1)]));
                         T1o = VMUL(T1m, T1n);
                         T1q = LD(&(ii[WS(rs, 13)]), ms, &(ii[WS(rs, 1)]));
                         T2r = VMUL(T1m, T1q);
                         {
                              V T1l, T1r, T2q, T2s;
                              T1l = VFMA(T1j, T1k, T1h);
                              T1r = VFMA(T1p, T1q, T1o);
                              T1s = VADD(T1l, T1r);
                              T2m = VSUB(T1l, T1r);
                              T2q = VFNMS(T1j, T1g, T2p);
                              T2s = VFNMS(T1p, T1n, T2r);
                              T2t = VSUB(T2q, T2s);
                              T3d = VADD(T2q, T2s);
                         }
                    }
                    {
                         V TQ, TR, TU, T29, TX, TY, T10, T2b;
                         TQ = LD(&(ri[WS(rs, 14)]), ms, &(ri[0]));
                         TR = VMUL(TP, TQ);
                         TU = LD(&(ii[WS(rs, 14)]), ms, &(ii[0]));
                         T29 = VMUL(TP, TU);
                         TX = LD(&(ri[WS(rs, 6)]), ms, &(ri[0]));
                         TY = VMUL(TW, TX);
                         T10 = LD(&(ii[WS(rs, 6)]), ms, &(ii[0]));
                         T2b = VMUL(TW, T10);
                         {
                              V TV, T11, T2a, T2c;
                              TV = VFMA(TT, TU, TR);
                              T11 = VFMA(TZ, T10, TY);
                              T12 = VADD(TV, T11);
                              T28 = VSUB(TV, T11);
                              T2a = VFNMS(TT, TQ, T29);
                              T2c = VFNMS(TZ, TX, T2b);
                              T2d = VSUB(T2a, T2c);
                              T38 = VADD(T2a, T2c);
                         }
                    }
                    {
                         V T14, T3q, T3C, T3E, T1T, T3D, T3t, T3u;
                         {
                              V Ty, T13, T3v, T3B;
                              Ty = VADD(Te, Tx);
                              T13 = VADD(TL, T12);
                              T14 = VADD(Ty, T13);
                              T3q = VSUB(Ty, T13);
                              T3v = VADD(T37, T38);
                              T3B = VADD(T3w, T3A);
                              T3C = VADD(T3v, T3B);
                              T3E = VSUB(T3B, T3v);
                         }
                         {
                              V T1t, T1S, T3r, T3s;
                              T1t = VADD(T1d, T1s);
                              T1S = VADD(T1G, T1R);
                              T1T = VADD(T1t, T1S);
                              T3D = VSUB(T1S, T1t);
                              T3r = VADD(T3c, T3d);
                              T3s = VADD(T3h, T3i);
                              T3t = VSUB(T3r, T3s);
                              T3u = VADD(T3r, T3s);
                         }
                         ST(&(ri[WS(rs, 8)]), VSUB(T14, T1T), ms, &(ri[0]));
                         ST(&(ii[WS(rs, 8)]), VSUB(T3C, T3u), ms, &(ii[0]));
                         ST(&(ri[0]), VADD(T14, T1T), ms, &(ri[0]));
                         ST(&(ii[0]), VADD(T3u, T3C), ms, &(ii[0]));
                         ST(&(ri[WS(rs, 12)]), VSUB(T3q, T3t), ms, &(ri[0]));
                         ST(&(ii[WS(rs, 12)]), VSUB(T3E, T3D), ms, &(ii[0]));
                         ST(&(ri[WS(rs, 4)]), VADD(T3q, T3t), ms, &(ri[0]));
                         ST(&(ii[WS(rs, 4)]), VADD(T3D, T3E), ms, &(ii[0]));
                    }
                    {
                         V T3a, T3m, T3H, T3J, T3f, T3n, T3k, T3o;
                         {
                              V T36, T39, T3F, T3G;
                              T36 = VSUB(Te, Tx);
                              T39 = VSUB(T37, T38);
                              T3a = VADD(T36, T39);
                              T3m = VSUB(T36, T39);
                              T3F = VSUB(T12, TL);
                              T3G = VSUB(T3A, T3w);
                              T3H = VADD(T3F, T3G);
                              T3J = VSUB(T3G, T3F);
                         }
                         {
                              V T3b, T3e, T3g, T3j;
                              T3b = VSUB(T1d, T1s);
                              T3e = VSUB(T3c, T3d);
                              T3f = VADD(T3b, T3e);
                              T3n = VSUB(T3e, T3b);
                              T3g = VSUB(T1G, T1R);
                              T3j = VSUB(T3h, T3i);
                              T3k = VSUB(T3g, T3j);
                              T3o = VADD(T3g, T3j);
                         }
                         {
                              V T3l, T3I, T3p, T3K;
                              T3l = VADD(T3f, T3k);
                              ST(&(ri[WS(rs, 10)]), VFNMS(LDK(KP707106781), T3l, T3a), ms, &(ri[0]));
                              ST(&(ri[WS(rs, 2)]), VFMA(LDK(KP707106781), T3l, T3a), ms, &(ri[0]));
                              T3I = VADD(T3n, T3o);
                              ST(&(ii[WS(rs, 2)]), VFMA(LDK(KP707106781), T3I, T3H), ms, &(ii[0]));
                              ST(&(ii[WS(rs, 10)]), VFNMS(LDK(KP707106781), T3I, T3H), ms, &(ii[0]));
                              T3p = VSUB(T3n, T3o);
                              ST(&(ri[WS(rs, 14)]), VFNMS(LDK(KP707106781), T3p, T3m), ms, &(ri[0]));
                              ST(&(ri[WS(rs, 6)]), VFMA(LDK(KP707106781), T3p, T3m), ms, &(ri[0]));
                              T3K = VSUB(T3k, T3f);
                              ST(&(ii[WS(rs, 6)]), VFMA(LDK(KP707106781), T3K, T3J), ms, &(ii[0]));
                              ST(&(ii[WS(rs, 14)]), VFNMS(LDK(KP707106781), T3K, T3J), ms, &(ii[0]));
                         }
                    }
                    {
                         V T20, T3N, T3T, T2Q, T2f, T3O, T30, T34, T2T, T3U, T2v, T2N, T2X, T33, T2K;
                         V T2O;
                         {
                              V T27, T2e, T2n, T2u;
                              T20 = VSUB(T1U, T1Z);
                              T3N = VSUB(T3L, T3M);
                              T3T = VADD(T3M, T3L);
                              T2Q = VADD(T1U, T1Z);
                              T27 = VSUB(T25, T26);
                              T2e = VADD(T28, T2d);
                              T2f = VSUB(T27, T2e);
                              T3O = VADD(T27, T2e);
                              {
                                   V T2Y, T2Z, T2R, T2S;
                                   T2Y = VADD(T2D, T2I);
                                   T2Z = VSUB(T2A, T2B);
                                   T30 = VFNMS(LDK(KP414213562), T2Z, T2Y);
                                   T34 = VFMA(LDK(KP414213562), T2Y, T2Z);
                                   T2R = VADD(T26, T25);
                                   T2S = VSUB(T28, T2d);
                                   T2T = VADD(T2R, T2S);
                                   T3U = VSUB(T2S, T2R);
                              }
                              T2n = VADD(T2l, T2m);
                              T2u = VSUB(T2o, T2t);
                              T2v = VFMA(LDK(KP414213562), T2u, T2n);
                              T2N = VFNMS(LDK(KP414213562), T2n, T2u);
                              {
                                   V T2V, T2W, T2C, T2J;
                                   T2V = VADD(T2o, T2t);
                                   T2W = VSUB(T2l, T2m);
                                   T2X = VFMA(LDK(KP414213562), T2W, T2V);
                                   T33 = VFNMS(LDK(KP414213562), T2V, T2W);
                                   T2C = VADD(T2A, T2B);
                                   T2J = VSUB(T2D, T2I);
                                   T2K = VFNMS(LDK(KP414213562), T2J, T2C);
                                   T2O = VFMA(LDK(KP414213562), T2C, T2J);
                              }
                         }
                         {
                              V T2g, T2L, T3V, T3W;
                              T2g = VFMA(LDK(KP707106781), T2f, T20);
                              T2L = VSUB(T2v, T2K);
                              ST(&(ri[WS(rs, 11)]), VFNMS(LDK(KP923879532), T2L, T2g), ms, &(ri[WS(rs, 1)]));
                              ST(&(ri[WS(rs, 3)]), VFMA(LDK(KP923879532), T2L, T2g), ms, &(ri[WS(rs, 1)]));
                              T3V = VFMA(LDK(KP707106781), T3U, T3T);
                              T3W = VSUB(T2O, T2N);
                              ST(&(ii[WS(rs, 3)]), VFMA(LDK(KP923879532), T3W, T3V), ms, &(ii[WS(rs, 1)]));
                              ST(&(ii[WS(rs, 11)]), VFNMS(LDK(KP923879532), T3W, T3V), ms, &(ii[WS(rs, 1)]));
                         }
                         {
                              V T2M, T2P, T3X, T3Y;
                              T2M = VFNMS(LDK(KP707106781), T2f, T20);
                              T2P = VADD(T2N, T2O);
                              ST(&(ri[WS(rs, 7)]), VFNMS(LDK(KP923879532), T2P, T2M), ms, &(ri[WS(rs, 1)]));
                              ST(&(ri[WS(rs, 15)]), VFMA(LDK(KP923879532), T2P, T2M), ms, &(ri[WS(rs, 1)]));
                              T3X = VFNMS(LDK(KP707106781), T3U, T3T);
                              T3Y = VADD(T2v, T2K);
                              ST(&(ii[WS(rs, 7)]), VFNMS(LDK(KP923879532), T3Y, T3X), ms, &(ii[WS(rs, 1)]));
                              ST(&(ii[WS(rs, 15)]), VFMA(LDK(KP923879532), T3Y, T3X), ms, &(ii[WS(rs, 1)]));
                         }
                         {
                              V T2U, T31, T3P, T3Q;
                              T2U = VFMA(LDK(KP707106781), T2T, T2Q);
                              T31 = VADD(T2X, T30);
                              ST(&(ri[WS(rs, 9)]), VFNMS(LDK(KP923879532), T31, T2U), ms, &(ri[WS(rs, 1)]));
                              ST(&(ri[WS(rs, 1)]), VFMA(LDK(KP923879532), T31, T2U), ms, &(ri[WS(rs, 1)]));
                              T3P = VFMA(LDK(KP707106781), T3O, T3N);
                              T3Q = VADD(T33, T34);
                              ST(&(ii[WS(rs, 1)]), VFMA(LDK(KP923879532), T3Q, T3P), ms, &(ii[WS(rs, 1)]));
                              ST(&(ii[WS(rs, 9)]), VFNMS(LDK(KP923879532), T3Q, T3P), ms, &(ii[WS(rs, 1)]));
                         }
                         {
                              V T32, T35, T3R, T3S;
                              T32 = VFNMS(LDK(KP707106781), T2T, T2Q);
                              T35 = VSUB(T33, T34);
                              ST(&(ri[WS(rs, 13)]), VFNMS(LDK(KP923879532), T35, T32), ms, &(ri[WS(rs, 1)]));
                              ST(&(ri[WS(rs, 5)]), VFMA(LDK(KP923879532), T35, T32), ms, &(ri[WS(rs, 1)]));
                              T3R = VFNMS(LDK(KP707106781), T3O, T3N);
                              T3S = VSUB(T30, T2X);
                              ST(&(ii[WS(rs, 5)]), VFMA(LDK(KP923879532), T3S, T3R), ms, &(ii[WS(rs, 1)]));
                              ST(&(ii[WS(rs, 13)]), VFNMS(LDK(KP923879532), T3S, T3R), ms, &(ii[WS(rs, 1)]));
                         }
                    }
               }
          }
     }
     VLEAVE();
}

static const tw_instr twinstr[] = {
     VTW(0, 1),
     VTW(0, 3),
     VTW(0, 9),
     VTW(0, 15),
     {TW_NEXT, (2 * VL), 0}
};
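/* Only twiddle powers 1, 3, 9, and 15 are stored in W; the -twiddle-log3
   scheme reconstructs the other powers in registers at the top of the
   main loop. TW_NEXT appears to advance W to the next batch of 2*VL
   transforms. */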

static const ct_desc desc = { 16, XSIMD_STRING("t2sv_16"), twinstr, &GENUS, {104, 42, 92, 0}, 0, 0, 0 };
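/* Codelet descriptor: radix 16, the twiddle list above, and what appear
   to be the operation counts {adds, mults, fmas, other} quoted in the
   header comment. The function below registers the codelet with the
   planner as a decimation-in-time twiddle step. */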

void XSIMD(codelet_t2sv_16) (planner *p) {
     X(kdft_dit_register) (p, t2sv_16, &desc);
}
#else

/* Generated by: ../../../genfft/gen_twiddle.native -simd -compact -variables 4 -pipeline-latency 8 -twiddle-log3 -precompute-twiddles -n 16 -name t2sv_16 -include dft/simd/ts.h */

/*
 * This function contains 196 FP additions, 108 FP multiplications,
 * (or, 156 additions, 68 multiplications, 40 fused multiply/add),
 * 82 stack variables, 3 constants, and 64 memory accesses
 */
#include "dft/simd/ts.h"

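/* Without FMA shorthand, multiplying an input (xr, xi) by a twiddle
   (wr, wi) is written out as VFMA(wr, xr, VMUL(wi, xi)) for the real
   part and VFNMS(wi, xr, VMUL(wr, xi)) for the imaginary part, i.e.
   multiplication by the conjugate wr - i*wi. */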
static void t2sv_16(R *ri, R *ii, const R *W, stride rs, INT mb, INT me, INT ms)
{
     DVK(KP382683432, +0.382683432365089771728459984030398866761344562);
     DVK(KP923879532, +0.923879532511286756128183189396788286822416626);
     DVK(KP707106781, +0.707106781186547524400844362104849039284835938);
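     /* KP382683432 is sin(pi/8); the other two constants are cos(pi/8)
        and 1/sqrt(2), as in the FMA branch. */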
     {
          INT m;
          for (m = mb, W = W + (mb * 8); m < me; m = m + (2 * VL), ri = ri + ((2 * VL) * ms), ii = ii + ((2 * VL) * ms), W = W + ((2 * VL) * 8), MAKE_VOLATILE_STRIDE(32, rs)) {
               V T2, T5, Tg, Ti, Tk, To, TE, TC, T6, T3, T8, TW, TJ, Tt, TU;
               V Tc, Tx, TH, TN, TO, TP, TR, T1f, T1k, T1b, T1i, T1y, T1H, T1u, T1F;
               {
                    V T7, Tv, Ta, Ts, T4, Tw, Tb, Tr;
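                    /* Same twiddle synthesis as in the FMA branch, spelled
                       out with separate multiplies, adds, and subtracts. */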
                    {
                         V Th, Tn, Tj, Tm;
                         T2 = LDW(&(W[0]));
                         T5 = LDW(&(W[TWVL * 1]));
                         Tg = LDW(&(W[TWVL * 2]));
                         Ti = LDW(&(W[TWVL * 3]));
                         Th = VMUL(T2, Tg);
                         Tn = VMUL(T5, Tg);
                         Tj = VMUL(T5, Ti);
                         Tm = VMUL(T2, Ti);
                         Tk = VSUB(Th, Tj);
                         To = VADD(Tm, Tn);
                         TE = VSUB(Tm, Tn);
                         TC = VADD(Th, Tj);
                         T6 = LDW(&(W[TWVL * 5]));
                         T7 = VMUL(T5, T6);
                         Tv = VMUL(Tg, T6);
                         Ta = VMUL(T2, T6);
                         Ts = VMUL(Ti, T6);
                         T3 = LDW(&(W[TWVL * 4]));
                         T4 = VMUL(T2, T3);
                         Tw = VMUL(Ti, T3);
                         Tb = VMUL(T5, T3);
                         Tr = VMUL(Tg, T3);
                    }
                    T8 = VADD(T4, T7);
                    TW = VSUB(Tv, Tw);
                    TJ = VADD(Ta, Tb);
                    Tt = VSUB(Tr, Ts);
                    TU = VADD(Tr, Ts);
                    Tc = VSUB(Ta, Tb);
                    Tx = VADD(Tv, Tw);
                    TH = VSUB(T4, T7);
                    TN = LDW(&(W[TWVL * 6]));
                    TO = LDW(&(W[TWVL * 7]));
                    TP = VFMA(T2, TN, VMUL(T5, TO));
                    TR = VFNMS(T5, TN, VMUL(T2, TO));
                    {
                         V T1d, T1e, T19, T1a;
                         T1d = VMUL(Tk, T6);
                         T1e = VMUL(To, T3);
                         T1f = VSUB(T1d, T1e);
                         T1k = VADD(T1d, T1e);
                         T19 = VMUL(Tk, T3);
                         T1a = VMUL(To, T6);
                         T1b = VADD(T19, T1a);
                         T1i = VSUB(T19, T1a);
                    }
                    {
                         V T1w, T1x, T1s, T1t;
                         T1w = VMUL(TC, T6);
                         T1x = VMUL(TE, T3);
                         T1y = VSUB(T1w, T1x);
                         T1H = VADD(T1w, T1x);
                         T1s = VMUL(TC, T3);
                         T1t = VMUL(TE, T6);
                         T1u = VADD(T1s, T1t);
                         T1F = VSUB(T1s, T1t);
                    }
               }
               {
                    V Tf, T3r, T1N, T3e, TA, T3s, T1Q, T3b, TM, T2M, T1W, T2w, TZ, T2N, T21;
                    V T2x, T1B, T1K, T2V, T2W, T2X, T2Y, T2j, T2D, T2o, T2E, T18, T1n, T2Q, T2R;
                    V T2S, T2T, T28, T2A, T2d, T2B;
                    {
                         V T1, T3d, Te, T3c, T9, Td;
                         T1 = LD(&(ri[0]), ms, &(ri[0]));
                         T3d = LD(&(ii[0]), ms, &(ii[0]));
                         T9 = LD(&(ri[WS(rs, 8)]), ms, &(ri[0]));
                         Td = LD(&(ii[WS(rs, 8)]), ms, &(ii[0]));
                         Te = VFMA(T8, T9, VMUL(Tc, Td));
                         T3c = VFNMS(Tc, T9, VMUL(T8, Td));
                         Tf = VADD(T1, Te);
                         T3r = VSUB(T3d, T3c);
                         T1N = VSUB(T1, Te);
                         T3e = VADD(T3c, T3d);
                    }
                    {
                         V Tq, T1O, Tz, T1P;
                         {
                              V Tl, Tp, Tu, Ty;
                              Tl = LD(&(ri[WS(rs, 4)]), ms, &(ri[0]));
                              Tp = LD(&(ii[WS(rs, 4)]), ms, &(ii[0]));
                              Tq = VFMA(Tk, Tl, VMUL(To, Tp));
                              T1O = VFNMS(To, Tl, VMUL(Tk, Tp));
                              Tu = LD(&(ri[WS(rs, 12)]), ms, &(ri[0]));
                              Ty = LD(&(ii[WS(rs, 12)]), ms, &(ii[0]));
                              Tz = VFMA(Tt, Tu, VMUL(Tx, Ty));
                              T1P = VFNMS(Tx, Tu, VMUL(Tt, Ty));
                         }
                         TA = VADD(Tq, Tz);
                         T3s = VSUB(Tq, Tz);
                         T1Q = VSUB(T1O, T1P);
                         T3b = VADD(T1O, T1P);
                    }
                    {
                         V TG, T1S, TL, T1T, T1U, T1V;
                         {
                              V TD, TF, TI, TK;
                              TD = LD(&(ri[WS(rs, 2)]), ms, &(ri[0]));
                              TF = LD(&(ii[WS(rs, 2)]), ms, &(ii[0]));
                              TG = VFMA(TC, TD, VMUL(TE, TF));
                              T1S = VFNMS(TE, TD, VMUL(TC, TF));
                              TI = LD(&(ri[WS(rs, 10)]), ms, &(ri[0]));
                              TK = LD(&(ii[WS(rs, 10)]), ms, &(ii[0]));
                              TL = VFMA(TH, TI, VMUL(TJ, TK));
                              T1T = VFNMS(TJ, TI, VMUL(TH, TK));
                         }
                         TM = VADD(TG, TL);
                         T2M = VADD(T1S, T1T);
                         T1U = VSUB(T1S, T1T);
                         T1V = VSUB(TG, TL);
                         T1W = VSUB(T1U, T1V);
                         T2w = VADD(T1V, T1U);
                    }
                    {
                         V TT, T1Y, TY, T1Z, T1X, T20;
                         {
                              V TQ, TS, TV, TX;
                              TQ = LD(&(ri[WS(rs, 14)]), ms, &(ri[0]));
                              TS = LD(&(ii[WS(rs, 14)]), ms, &(ii[0]));
                              TT = VFMA(TP, TQ, VMUL(TR, TS));
                              T1Y = VFNMS(TR, TQ, VMUL(TP, TS));
                              TV = LD(&(ri[WS(rs, 6)]), ms, &(ri[0]));
                              TX = LD(&(ii[WS(rs, 6)]), ms, &(ii[0]));
                              TY = VFMA(TU, TV, VMUL(TW, TX));
                              T1Z = VFNMS(TW, TV, VMUL(TU, TX));
                         }
                         TZ = VADD(TT, TY);
                         T2N = VADD(T1Y, T1Z);
                         T1X = VSUB(TT, TY);
                         T20 = VSUB(T1Y, T1Z);
                         T21 = VADD(T1X, T20);
                         T2x = VSUB(T1X, T20);
                    }
                    {
                         V T1r, T2k, T1J, T2h, T1A, T2l, T1E, T2g;
                         {
                              V T1p, T1q, T1G, T1I;
                              T1p = LD(&(ri[WS(rs, 15)]), ms, &(ri[WS(rs, 1)]));
                              T1q = LD(&(ii[WS(rs, 15)]), ms, &(ii[WS(rs, 1)]));
                              T1r = VFMA(TN, T1p, VMUL(TO, T1q));
                              T2k = VFNMS(TO, T1p, VMUL(TN, T1q));
                              T1G = LD(&(ri[WS(rs, 11)]), ms, &(ri[WS(rs, 1)]));
                              T1I = LD(&(ii[WS(rs, 11)]), ms, &(ii[WS(rs, 1)]));
                              T1J = VFMA(T1F, T1G, VMUL(T1H, T1I));
                              T2h = VFNMS(T1H, T1G, VMUL(T1F, T1I));
                         }
                         {
                              V T1v, T1z, T1C, T1D;
                              T1v = LD(&(ri[WS(rs, 7)]), ms, &(ri[WS(rs, 1)]));
                              T1z = LD(&(ii[WS(rs, 7)]), ms, &(ii[WS(rs, 1)]));
                              T1A = VFMA(T1u, T1v, VMUL(T1y, T1z));
                              T2l = VFNMS(T1y, T1v, VMUL(T1u, T1z));
                              T1C = LD(&(ri[WS(rs, 3)]), ms, &(ri[WS(rs, 1)]));
                              T1D = LD(&(ii[WS(rs, 3)]), ms, &(ii[WS(rs, 1)]));
                              T1E = VFMA(Tg, T1C, VMUL(Ti, T1D));
                              T2g = VFNMS(Ti, T1C, VMUL(Tg, T1D));
                         }
                         T1B = VADD(T1r, T1A);
                         T1K = VADD(T1E, T1J);
                         T2V = VSUB(T1B, T1K);
                         T2W = VADD(T2k, T2l);
                         T2X = VADD(T2g, T2h);
                         T2Y = VSUB(T2W, T2X);
                         {
                              V T2f, T2i, T2m, T2n;
                              T2f = VSUB(T1r, T1A);
                              T2i = VSUB(T2g, T2h);
                              T2j = VSUB(T2f, T2i);
                              T2D = VADD(T2f, T2i);
                              T2m = VSUB(T2k, T2l);
                              T2n = VSUB(T1E, T1J);
                              T2o = VADD(T2m, T2n);
                              T2E = VSUB(T2m, T2n);
                         }
                    }
                    {
                         V T14, T24, T1m, T2b, T17, T25, T1h, T2a;
                         {
                              V T12, T13, T1j, T1l;
                              T12 = LD(&(ri[WS(rs, 1)]), ms, &(ri[WS(rs, 1)]));
                              T13 = LD(&(ii[WS(rs, 1)]), ms, &(ii[WS(rs, 1)]));
                              T14 = VFMA(T2, T12, VMUL(T5, T13));
                              T24 = VFNMS(T5, T12, VMUL(T2, T13));
                              T1j = LD(&(ri[WS(rs, 13)]), ms, &(ri[WS(rs, 1)]));
                              T1l = LD(&(ii[WS(rs, 13)]), ms, &(ii[WS(rs, 1)]));
                              T1m = VFMA(T1i, T1j, VMUL(T1k, T1l));
                              T2b = VFNMS(T1k, T1j, VMUL(T1i, T1l));
                         }
                         {
                              V T15, T16, T1c, T1g;
                              T15 = LD(&(ri[WS(rs, 9)]), ms, &(ri[WS(rs, 1)]));
                              T16 = LD(&(ii[WS(rs, 9)]), ms, &(ii[WS(rs, 1)]));
                              T17 = VFMA(T3, T15, VMUL(T6, T16));
                              T25 = VFNMS(T6, T15, VMUL(T3, T16));
                              T1c = LD(&(ri[WS(rs, 5)]), ms, &(ri[WS(rs, 1)]));
                              T1g = LD(&(ii[WS(rs, 5)]), ms, &(ii[WS(rs, 1)]));
                              T1h = VFMA(T1b, T1c, VMUL(T1f, T1g));
                              T2a = VFNMS(T1f, T1c, VMUL(T1b, T1g));
                         }
                         T18 = VADD(T14, T17);
                         T1n = VADD(T1h, T1m);
                         T2Q = VSUB(T18, T1n);
                         T2R = VADD(T24, T25);
                         T2S = VADD(T2a, T2b);
                         T2T = VSUB(T2R, T2S);
                         {
                              V T26, T27, T29, T2c;
                              T26 = VSUB(T24, T25);
                              T27 = VSUB(T1h, T1m);
                              T28 = VADD(T26, T27);
                              T2A = VSUB(T26, T27);
                              T29 = VSUB(T14, T17);
                              T2c = VSUB(T2a, T2b);
                              T2d = VSUB(T29, T2c);
                              T2B = VADD(T29, T2c);
                         }
                    }
                    {
                         V T23, T2r, T3A, T3C, T2q, T3B, T2u, T3x;
                         {
                              V T1R, T22, T3y, T3z;
                              T1R = VSUB(T1N, T1Q);
                              T22 = VMUL(LDK(KP707106781), VSUB(T1W, T21));
                              T23 = VADD(T1R, T22);
                              T2r = VSUB(T1R, T22);
                              T3y = VMUL(LDK(KP707106781), VSUB(T2x, T2w));
                              T3z = VADD(T3s, T3r);
                              T3A = VADD(T3y, T3z);
                              T3C = VSUB(T3z, T3y);
                         }
                         {
                              V T2e, T2p, T2s, T2t;
                              T2e = VFMA(LDK(KP923879532), T28, VMUL(LDK(KP382683432), T2d));
                              T2p = VFNMS(LDK(KP923879532), T2o, VMUL(LDK(KP382683432), T2j));
                              T2q = VADD(T2e, T2p);
                              T3B = VSUB(T2p, T2e);
                              T2s = VFNMS(LDK(KP923879532), T2d, VMUL(LDK(KP382683432), T28));
                              T2t = VFMA(LDK(KP382683432), T2o, VMUL(LDK(KP923879532), T2j));
                              T2u = VSUB(T2s, T2t);
                              T3x = VADD(T2s, T2t);
                         }
                         ST(&(ri[WS(rs, 11)]), VSUB(T23, T2q), ms, &(ri[WS(rs, 1)]));
                         ST(&(ii[WS(rs, 11)]), VSUB(T3A, T3x), ms, &(ii[WS(rs, 1)]));
                         ST(&(ri[WS(rs, 3)]), VADD(T23, T2q), ms, &(ri[WS(rs, 1)]));
                         ST(&(ii[WS(rs, 3)]), VADD(T3x, T3A), ms, &(ii[WS(rs, 1)]));
                         ST(&(ri[WS(rs, 15)]), VSUB(T2r, T2u), ms, &(ri[WS(rs, 1)]));
                         ST(&(ii[WS(rs, 15)]), VSUB(T3C, T3B), ms, &(ii[WS(rs, 1)]));
                         ST(&(ri[WS(rs, 7)]), VADD(T2r, T2u), ms, &(ri[WS(rs, 1)]));
                         ST(&(ii[WS(rs, 7)]), VADD(T3B, T3C), ms, &(ii[WS(rs, 1)]));
                    }
                    {
                         V T2P, T31, T3m, T3o, T30, T3n, T34, T3j;
                         {
                              V T2L, T2O, T3k, T3l;
                              T2L = VSUB(Tf, TA);
                              T2O = VSUB(T2M, T2N);
                              T2P = VADD(T2L, T2O);
                              T31 = VSUB(T2L, T2O);
                              T3k = VSUB(TZ, TM);
                              T3l = VSUB(T3e, T3b);
                              T3m = VADD(T3k, T3l);
                              T3o = VSUB(T3l, T3k);
                         }
                         {
                              V T2U, T2Z, T32, T33;
                              T2U = VADD(T2Q, T2T);
                              T2Z = VSUB(T2V, T2Y);
                              T30 = VMUL(LDK(KP707106781), VADD(T2U, T2Z));
                              T3n = VMUL(LDK(KP707106781), VSUB(T2Z, T2U));
                              T32 = VSUB(T2T, T2Q);
                              T33 = VADD(T2V, T2Y);
                              T34 = VMUL(LDK(KP707106781), VSUB(T32, T33));
                              T3j = VMUL(LDK(KP707106781), VADD(T32, T33));
                         }
                         ST(&(ri[WS(rs, 10)]), VSUB(T2P, T30), ms, &(ri[0]));
                         ST(&(ii[WS(rs, 10)]), VSUB(T3m, T3j), ms, &(ii[0]));
                         ST(&(ri[WS(rs, 2)]), VADD(T2P, T30), ms, &(ri[0]));
                         ST(&(ii[WS(rs, 2)]), VADD(T3j, T3m), ms, &(ii[0]));
                         ST(&(ri[WS(rs, 14)]), VSUB(T31, T34), ms, &(ri[0]));
                         ST(&(ii[WS(rs, 14)]), VSUB(T3o, T3n), ms, &(ii[0]));
                         ST(&(ri[WS(rs, 6)]), VADD(T31, T34), ms, &(ri[0]));
                         ST(&(ii[WS(rs, 6)]), VADD(T3n, T3o), ms, &(ii[0]));
                    }
                    {
                         V T2z, T2H, T3u, T3w, T2G, T3v, T2K, T3p;
                         {
                              V T2v, T2y, T3q, T3t;
                              T2v = VADD(T1N, T1Q);
                              T2y = VMUL(LDK(KP707106781), VADD(T2w, T2x));
                              T2z = VADD(T2v, T2y);
                              T2H = VSUB(T2v, T2y);
                              T3q = VMUL(LDK(KP707106781), VADD(T1W, T21));
                              T3t = VSUB(T3r, T3s);
                              T3u = VADD(T3q, T3t);
                              T3w = VSUB(T3t, T3q);
                         }
                         {
                              V T2C, T2F, T2I, T2J;
                              T2C = VFMA(LDK(KP382683432), T2A, VMUL(LDK(KP923879532), T2B));
                              T2F = VFNMS(LDK(KP382683432), T2E, VMUL(LDK(KP923879532), T2D));
                              T2G = VADD(T2C, T2F);
                              T3v = VSUB(T2F, T2C);
                              T2I = VFNMS(LDK(KP382683432), T2B, VMUL(LDK(KP923879532), T2A));
                              T2J = VFMA(LDK(KP923879532), T2E, VMUL(LDK(KP382683432), T2D));
                              T2K = VSUB(T2I, T2J);
                              T3p = VADD(T2I, T2J);
                         }
                         ST(&(ri[WS(rs, 9)]), VSUB(T2z, T2G), ms, &(ri[WS(rs, 1)]));
                         ST(&(ii[WS(rs, 9)]), VSUB(T3u, T3p), ms, &(ii[WS(rs, 1)]));
                         ST(&(ri[WS(rs, 1)]), VADD(T2z, T2G), ms, &(ri[WS(rs, 1)]));
                         ST(&(ii[WS(rs, 1)]), VADD(T3p, T3u), ms, &(ii[WS(rs, 1)]));
                         ST(&(ri[WS(rs, 13)]), VSUB(T2H, T2K), ms, &(ri[WS(rs, 1)]));
                         ST(&(ii[WS(rs, 13)]), VSUB(T3w, T3v), ms, &(ii[WS(rs, 1)]));
                         ST(&(ri[WS(rs, 5)]), VADD(T2H, T2K), ms, &(ri[WS(rs, 1)]));
                         ST(&(ii[WS(rs, 5)]), VADD(T3v, T3w), ms, &(ii[WS(rs, 1)]));
                    }
                    {
                         V T11, T35, T3g, T3i, T1M, T3h, T38, T39;
                         {
                              V TB, T10, T3a, T3f;
                              TB = VADD(Tf, TA);
                              T10 = VADD(TM, TZ);
                              T11 = VADD(TB, T10);
                              T35 = VSUB(TB, T10);
                              T3a = VADD(T2M, T2N);
                              T3f = VADD(T3b, T3e);
                              T3g = VADD(T3a, T3f);
                              T3i = VSUB(T3f, T3a);
                         }
                         {
                              V T1o, T1L, T36, T37;
                              T1o = VADD(T18, T1n);
                              T1L = VADD(T1B, T1K);
                              T1M = VADD(T1o, T1L);
                              T3h = VSUB(T1L, T1o);
                              T36 = VADD(T2R, T2S);
                              T37 = VADD(T2W, T2X);
                              T38 = VSUB(T36, T37);
                              T39 = VADD(T36, T37);
                         }
                         ST(&(ri[WS(rs, 8)]), VSUB(T11, T1M), ms, &(ri[0]));
                         ST(&(ii[WS(rs, 8)]), VSUB(T3g, T39), ms, &(ii[0]));
                         ST(&(ri[0]), VADD(T11, T1M), ms, &(ri[0]));
                         ST(&(ii[0]), VADD(T39, T3g), ms, &(ii[0]));
                         ST(&(ri[WS(rs, 12)]), VSUB(T35, T38), ms, &(ri[0]));
                         ST(&(ii[WS(rs, 12)]), VSUB(T3i, T3h), ms, &(ii[0]));
                         ST(&(ri[WS(rs, 4)]), VADD(T35, T38), ms, &(ri[0]));
                         ST(&(ii[WS(rs, 4)]), VADD(T3h, T3i), ms, &(ii[0]));
                    }
               }
          }
     }
     VLEAVE();
}

static const tw_instr twinstr[] = {
     VTW(0, 1),
     VTW(0, 3),
     VTW(0, 9),
     VTW(0, 15),
     {TW_NEXT, (2 * VL), 0}
};

static const ct_desc desc = { 16, XSIMD_STRING("t2sv_16"), twinstr, &GENUS, {156, 68, 40, 0}, 0, 0, 0 };

void XSIMD(codelet_t2sv_16) (planner *p) {
     X(kdft_dit_register) (p, t2sv_16, &desc);
}
#endif