annotate src/fftw-3.3.3/dft/simd/common/t2sv_16.c @ 83:ae30d91d2ffe

Replace these with versions built using an older toolset (so as to avoid ABI incompatibilities when linking on Ubuntu 14.04 for packaging purposes)
author Chris Cannam
date Fri, 07 Feb 2020 11:51:13 +0000
parents 37bf6b4a2645
children
rev   line source
Chris@10 1 /*
Chris@10 2 * Copyright (c) 2003, 2007-11 Matteo Frigo
Chris@10 3 * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
Chris@10 4 *
Chris@10 5 * This program is free software; you can redistribute it and/or modify
Chris@10 6 * it under the terms of the GNU General Public License as published by
Chris@10 7 * the Free Software Foundation; either version 2 of the License, or
Chris@10 8 * (at your option) any later version.
Chris@10 9 *
Chris@10 10 * This program is distributed in the hope that it will be useful,
Chris@10 11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
Chris@10 12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
Chris@10 13 * GNU General Public License for more details.
Chris@10 14 *
Chris@10 15 * You should have received a copy of the GNU General Public License
Chris@10 16 * along with this program; if not, write to the Free Software
Chris@10 17 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
Chris@10 18 *
Chris@10 19 */
Chris@10 20
Chris@10 21 /* This file was automatically generated --- DO NOT EDIT */
Chris@10 22 /* Generated on Sun Nov 25 07:39:26 EST 2012 */
Chris@10 23
Chris@10 24 #include "codelet-dft.h"
Chris@10 25
Chris@10 26 #ifdef HAVE_FMA
Chris@10 27
Chris@10 28 /* Generated by: ../../../genfft/gen_twiddle.native -fma -reorder-insns -schedule-for-pipeline -simd -compact -variables 4 -pipeline-latency 8 -twiddle-log3 -precompute-twiddles -n 16 -name t2sv_16 -include ts.h */
Chris@10 29
Chris@10 30 /*
Chris@10 31 * This function contains 196 FP additions, 134 FP multiplications,
Chris@10 32 * (or, 104 additions, 42 multiplications, 92 fused multiply/add),
Chris@10 33 * 120 stack variables, 3 constants, and 64 memory accesses
Chris@10 34 */
Chris@10 35 #include "ts.h"
Chris@10 36
Chris@10 37 static void t2sv_16(R *ri, R *ii, const R *W, stride rs, INT mb, INT me, INT ms)
Chris@10 38 {
Chris@10 39 DVK(KP923879532, +0.923879532511286756128183189396788286822416626);
Chris@10 40 DVK(KP414213562, +0.414213562373095048801688724209698078569671875);
Chris@10 41 DVK(KP707106781, +0.707106781186547524400844362104849039284835938);
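              /* (descriptive note) The constants above are, in order:
                 KP923879532 = cos(pi/8), KP414213562 = tan(pi/8) = sqrt(2)-1,
                 KP707106781 = cos(pi/4) = 1/sqrt(2). */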
Chris@10 42 {
Chris@10 43 INT m;
Chris@10 44 for (m = mb, W = W + (mb * 8); m < me; m = m + (2 * VL), ri = ri + ((2 * VL) * ms), ii = ii + ((2 * VL) * ms), W = W + ((2 * VL) * 8), MAKE_VOLATILE_STRIDE(32, rs)) {
Chris@10 45 V T34, T30, T2N, T2v, T2M, T2g, T3V, T3X, T32, T2U, T33, T2X, T2O, T2K, T3P;
Chris@10 46 V T3R;
Chris@10 47 {
Chris@10 48 V T2, Tf, TM, TO, T3, T6, T5, Th;
Chris@10 49 T2 = LDW(&(W[0]));
Chris@10 50 Tf = LDW(&(W[TWVL * 2]));
Chris@10 51 TM = LDW(&(W[TWVL * 6]));
Chris@10 52 TO = LDW(&(W[TWVL * 7]));
Chris@10 53 T3 = LDW(&(W[TWVL * 4]));
Chris@10 54 T6 = LDW(&(W[TWVL * 5]));
Chris@10 55 T5 = LDW(&(W[TWVL * 1]));
Chris@10 56 Th = LDW(&(W[TWVL * 3]));
Chris@10 57 {
Chris@10 58 V TW, TZ, Te, T1U, T3A, T3L, T2D, T1G, T3h, T2A, T2B, T1R, T3i, T2I, Tx;
Chris@10 59 V T3M, T1Z, T3w, TL, T26, T25, T37, T1l, T2q, T1d, T2o, T2l, T3c, T1r, T2s;
Chris@10 60 V TX, T10, TV, T2a;
Chris@10 61 {
Chris@10 62 V Tz, TP, TT, Tq, TF, Tu, TI, Tm, TC, T1j, T1p, T1m, T1f, T1O, T1M;
Chris@10 63 V T1K, T2F, Tj, Tn, T1Q, T2G, Tk, T1V, Tr, Tv;
Chris@10 64 {
Chris@10 65 V T1, Ti, Tb, T3z, T8, Tc, T1u, T1D, T1L, T1z, T9, T3x, T1v, T1w, T1A;
Chris@10 66 V T1E;
Chris@10 67 {
Chris@10 68 V T7, T1i, T1e, T1C, T1y;
Chris@10 69 T1 = LD(&(ri[0]), ms, &(ri[0]));
Chris@10 70 {
Chris@10 71 V Tg, TN, TS, Tp;
Chris@10 72 Tg = VMUL(T2, Tf);
Chris@10 73 TN = VMUL(T2, TM);
Chris@10 74 TS = VMUL(T2, TO);
Chris@10 75 Tp = VMUL(Tf, T3);
Chris@10 76 {
Chris@10 77 V T4, Tt, Ta, Tl;
Chris@10 78 T4 = VMUL(T2, T3);
Chris@10 79 Tt = VMUL(Tf, T6);
Chris@10 80 Ta = VMUL(T2, T6);
Chris@10 81 Tl = VMUL(T2, Th);
Chris@10 82 Ti = VFNMS(T5, Th, Tg);
Chris@10 83 Tz = VFMA(T5, Th, Tg);
Chris@10 84 TP = VFMA(T5, TO, TN);
Chris@10 85 TT = VFNMS(T5, TM, TS);
Chris@10 86 TW = VFMA(Th, T6, Tp);
Chris@10 87 Tq = VFNMS(Th, T6, Tp);
Chris@10 88 TF = VFNMS(T5, T6, T4);
Chris@10 89 T7 = VFMA(T5, T6, T4);
Chris@10 90 Tu = VFMA(Th, T3, Tt);
Chris@10 91 TZ = VFNMS(Th, T3, Tt);
Chris@10 92 TI = VFMA(T5, T3, Ta);
Chris@10 93 Tb = VFNMS(T5, T3, Ta);
Chris@10 94 Tm = VFMA(T5, Tf, Tl);
Chris@10 95 TC = VFNMS(T5, Tf, Tl);
Chris@10 96 T1i = VMUL(Ti, T6);
Chris@10 97 T1e = VMUL(Ti, T3);
Chris@10 98 T1C = VMUL(Tz, T6);
Chris@10 99 T1y = VMUL(Tz, T3);
Chris@10 100 T3z = LD(&(ii[0]), ms, &(ii[0]));
Chris@10 101 }
Chris@10 102 }
Chris@10 103 T8 = LD(&(ri[WS(rs, 8)]), ms, &(ri[0]));
Chris@10 104 Tc = LD(&(ii[WS(rs, 8)]), ms, &(ii[0]));
Chris@10 105 T1u = LD(&(ri[WS(rs, 15)]), ms, &(ri[WS(rs, 1)]));
Chris@10 106 T1j = VFNMS(Tm, T3, T1i);
Chris@10 107 T1p = VFMA(Tm, T3, T1i);
Chris@10 108 T1m = VFNMS(Tm, T6, T1e);
Chris@10 109 T1f = VFMA(Tm, T6, T1e);
Chris@10 110 T1D = VFNMS(TC, T3, T1C);
Chris@10 111 T1O = VFMA(TC, T3, T1C);
Chris@10 112 T1L = VFNMS(TC, T6, T1y);
Chris@10 113 T1z = VFMA(TC, T6, T1y);
Chris@10 114 T9 = VMUL(T7, T8);
Chris@10 115 T3x = VMUL(T7, Tc);
Chris@10 116 T1v = VMUL(TM, T1u);
Chris@10 117 T1w = LD(&(ii[WS(rs, 15)]), ms, &(ii[WS(rs, 1)]));
Chris@10 118 T1A = LD(&(ri[WS(rs, 7)]), ms, &(ri[WS(rs, 1)]));
Chris@10 119 T1E = LD(&(ii[WS(rs, 7)]), ms, &(ii[WS(rs, 1)]));
Chris@10 120 }
Chris@10 121 {
Chris@10 122 V T1x, T2x, T1F, T2z, T1N, T1P;
Chris@10 123 {
Chris@10 124 V T1H, T1J, T1I, T2E;
Chris@10 125 {
Chris@10 126 V Td, T3y, T2w, T1B, T2y;
Chris@10 127 T1H = LD(&(ri[WS(rs, 3)]), ms, &(ri[WS(rs, 1)]));
Chris@10 128 T1J = LD(&(ii[WS(rs, 3)]), ms, &(ii[WS(rs, 1)]));
Chris@10 129 Td = VFMA(Tb, Tc, T9);
Chris@10 130 T3y = VFNMS(Tb, T8, T3x);
Chris@10 131 T1M = LD(&(ri[WS(rs, 11)]), ms, &(ri[WS(rs, 1)]));
Chris@10 132 T1x = VFMA(TO, T1w, T1v);
Chris@10 133 T2w = VMUL(TM, T1w);
Chris@10 134 T1B = VMUL(T1z, T1A);
Chris@10 135 T2y = VMUL(T1z, T1E);
Chris@10 136 T1I = VMUL(Tf, T1H);
Chris@10 137 T2E = VMUL(Tf, T1J);
Chris@10 138 Te = VADD(T1, Td);
Chris@10 139 T1U = VSUB(T1, Td);
Chris@10 140 T3A = VADD(T3y, T3z);
Chris@10 141 T3L = VSUB(T3z, T3y);
Chris@10 142 T2x = VFNMS(TO, T1u, T2w);
Chris@10 143 T1F = VFMA(T1D, T1E, T1B);
Chris@10 144 T2z = VFNMS(T1D, T1A, T2y);
Chris@10 145 T1N = VMUL(T1L, T1M);
Chris@10 146 T1P = LD(&(ii[WS(rs, 11)]), ms, &(ii[WS(rs, 1)]));
Chris@10 147 }
Chris@10 148 T1K = VFMA(Th, T1J, T1I);
Chris@10 149 T2F = VFNMS(Th, T1H, T2E);
Chris@10 150 }
Chris@10 151 Tj = LD(&(ri[WS(rs, 4)]), ms, &(ri[0]));
Chris@10 152 Tn = LD(&(ii[WS(rs, 4)]), ms, &(ii[0]));
Chris@10 153 T2D = VSUB(T1x, T1F);
Chris@10 154 T1G = VADD(T1x, T1F);
Chris@10 155 T3h = VADD(T2x, T2z);
Chris@10 156 T2A = VSUB(T2x, T2z);
Chris@10 157 T1Q = VFMA(T1O, T1P, T1N);
Chris@10 158 T2G = VMUL(T1L, T1P);
Chris@10 159 Tk = VMUL(Ti, Tj);
Chris@10 160 T1V = VMUL(Ti, Tn);
Chris@10 161 Tr = LD(&(ri[WS(rs, 12)]), ms, &(ri[0]));
Chris@10 162 Tv = LD(&(ii[WS(rs, 12)]), ms, &(ii[0]));
Chris@10 163 }
Chris@10 164 }
Chris@10 165 {
Chris@10 166 V TE, T22, T15, T17, TK, T16, T2h, T24, T19, T1b;
Chris@10 167 {
Chris@10 168 V To, T1W, TG, TJ, Tw, T1Y, TH, T23;
Chris@10 169 {
Chris@10 170 V TA, TD, TB, T21, T2H, Ts, T1X;
Chris@10 171 TA = LD(&(ri[WS(rs, 2)]), ms, &(ri[0]));
Chris@10 172 TD = LD(&(ii[WS(rs, 2)]), ms, &(ii[0]));
Chris@10 173 T2B = VSUB(T1K, T1Q);
Chris@10 174 T1R = VADD(T1K, T1Q);
Chris@10 175 T2H = VFNMS(T1O, T1M, T2G);
Chris@10 176 To = VFMA(Tm, Tn, Tk);
Chris@10 177 T1W = VFNMS(Tm, Tj, T1V);
Chris@10 178 Ts = VMUL(Tq, Tr);
Chris@10 179 T1X = VMUL(Tq, Tv);
Chris@10 180 TB = VMUL(Tz, TA);
Chris@10 181 T21 = VMUL(Tz, TD);
Chris@10 182 TG = LD(&(ri[WS(rs, 10)]), ms, &(ri[0]));
Chris@10 183 T3i = VADD(T2F, T2H);
Chris@10 184 T2I = VSUB(T2F, T2H);
Chris@10 185 TJ = LD(&(ii[WS(rs, 10)]), ms, &(ii[0]));
Chris@10 186 Tw = VFMA(Tu, Tv, Ts);
Chris@10 187 T1Y = VFNMS(Tu, Tr, T1X);
Chris@10 188 TE = VFMA(TC, TD, TB);
Chris@10 189 T22 = VFNMS(TC, TA, T21);
Chris@10 190 TH = VMUL(TF, TG);
Chris@10 191 }
Chris@10 192 T15 = LD(&(ri[WS(rs, 1)]), ms, &(ri[WS(rs, 1)]));
Chris@10 193 T17 = LD(&(ii[WS(rs, 1)]), ms, &(ii[WS(rs, 1)]));
Chris@10 194 T23 = VMUL(TF, TJ);
Chris@10 195 Tx = VADD(To, Tw);
Chris@10 196 T3M = VSUB(To, Tw);
Chris@10 197 T1Z = VSUB(T1W, T1Y);
Chris@10 198 T3w = VADD(T1W, T1Y);
Chris@10 199 TK = VFMA(TI, TJ, TH);
Chris@10 200 T16 = VMUL(T2, T15);
Chris@10 201 T2h = VMUL(T2, T17);
Chris@10 202 T24 = VFNMS(TI, TG, T23);
Chris@10 203 T19 = LD(&(ri[WS(rs, 9)]), ms, &(ri[WS(rs, 1)]));
Chris@10 204 T1b = LD(&(ii[WS(rs, 9)]), ms, &(ii[WS(rs, 1)]));
Chris@10 205 }
Chris@10 206 {
Chris@10 207 V T1g, T1k, T18, T2i, T1a, T2j, T1h, T2p, T1n, T1q;
Chris@10 208 T1g = LD(&(ri[WS(rs, 5)]), ms, &(ri[WS(rs, 1)]));
Chris@10 209 T1k = LD(&(ii[WS(rs, 5)]), ms, &(ii[WS(rs, 1)]));
Chris@10 210 TL = VADD(TE, TK);
Chris@10 211 T26 = VSUB(TE, TK);
Chris@10 212 T18 = VFMA(T5, T17, T16);
Chris@10 213 T2i = VFNMS(T5, T15, T2h);
Chris@10 214 T25 = VSUB(T22, T24);
Chris@10 215 T37 = VADD(T22, T24);
Chris@10 216 T1a = VMUL(T3, T19);
Chris@10 217 T2j = VMUL(T3, T1b);
Chris@10 218 T1h = VMUL(T1f, T1g);
Chris@10 219 T2p = VMUL(T1f, T1k);
Chris@10 220 T1n = LD(&(ri[WS(rs, 13)]), ms, &(ri[WS(rs, 1)]));
Chris@10 221 T1q = LD(&(ii[WS(rs, 13)]), ms, &(ii[WS(rs, 1)]));
Chris@10 222 {
Chris@10 223 V TQ, TU, TR, T29;
Chris@10 224 {
Chris@10 225 V T1c, T2k, T1o, T2r;
Chris@10 226 TQ = LD(&(ri[WS(rs, 14)]), ms, &(ri[0]));
Chris@10 227 TU = LD(&(ii[WS(rs, 14)]), ms, &(ii[0]));
Chris@10 228 T1c = VFMA(T6, T1b, T1a);
Chris@10 229 T2k = VFNMS(T6, T19, T2j);
Chris@10 230 T1l = VFMA(T1j, T1k, T1h);
Chris@10 231 T2q = VFNMS(T1j, T1g, T2p);
Chris@10 232 T1o = VMUL(T1m, T1n);
Chris@10 233 T2r = VMUL(T1m, T1q);
Chris@10 234 TR = VMUL(TP, TQ);
Chris@10 235 T29 = VMUL(TP, TU);
Chris@10 236 T1d = VADD(T18, T1c);
Chris@10 237 T2o = VSUB(T18, T1c);
Chris@10 238 T2l = VSUB(T2i, T2k);
Chris@10 239 T3c = VADD(T2i, T2k);
Chris@10 240 T1r = VFMA(T1p, T1q, T1o);
Chris@10 241 T2s = VFNMS(T1p, T1n, T2r);
Chris@10 242 TX = LD(&(ri[WS(rs, 6)]), ms, &(ri[0]));
Chris@10 243 T10 = LD(&(ii[WS(rs, 6)]), ms, &(ii[0]));
Chris@10 244 }
Chris@10 245 TV = VFMA(TT, TU, TR);
Chris@10 246 T2a = VFNMS(TT, TQ, T29);
Chris@10 247 }
Chris@10 248 }
Chris@10 249 }
Chris@10 250 }
Chris@10 251 {
Chris@10 252 V T36, Ty, T3B, T3G, T1s, T2m, T2t, T3d, TY, T2b, T3g, T1S, T3s, T3j;
Chris@10 253 T36 = VSUB(Te, Tx);
Chris@10 254 Ty = VADD(Te, Tx);
Chris@10 255 T3B = VADD(T3w, T3A);
Chris@10 256 T3G = VSUB(T3A, T3w);
Chris@10 257 T1s = VADD(T1l, T1r);
Chris@10 258 T2m = VSUB(T1l, T1r);
Chris@10 259 T2t = VSUB(T2q, T2s);
Chris@10 260 T3d = VADD(T2q, T2s);
Chris@10 261 TY = VMUL(TW, TX);
Chris@10 262 T2b = VMUL(TW, T10);
Chris@10 263 T3g = VSUB(T1G, T1R);
Chris@10 264 T1S = VADD(T1G, T1R);
Chris@10 265 T3s = VADD(T3h, T3i);
Chris@10 266 T3j = VSUB(T3h, T3i);
Chris@10 267 {
Chris@10 268 V T3D, T1T, T3u, T3t, T28, T12, T38, T2d, T3n, T3f;
Chris@10 269 {
Chris@10 270 V T1t, T3b, T3e, T3r, T11, T2c;
Chris@10 271 T1t = VADD(T1d, T1s);
Chris@10 272 T3b = VSUB(T1d, T1s);
Chris@10 273 T3e = VSUB(T3c, T3d);
Chris@10 274 T3r = VADD(T3c, T3d);
Chris@10 275 T11 = VFMA(TZ, T10, TY);
Chris@10 276 T2c = VFNMS(TZ, TX, T2b);
Chris@10 277 T3D = VSUB(T1S, T1t);
Chris@10 278 T1T = VADD(T1t, T1S);
Chris@10 279 T3u = VADD(T3r, T3s);
Chris@10 280 T3t = VSUB(T3r, T3s);
Chris@10 281 T28 = VSUB(TV, T11);
Chris@10 282 T12 = VADD(TV, T11);
Chris@10 283 T38 = VADD(T2a, T2c);
Chris@10 284 T2d = VSUB(T2a, T2c);
Chris@10 285 T3n = VSUB(T3e, T3b);
Chris@10 286 T3f = VADD(T3b, T3e);
Chris@10 287 }
Chris@10 288 {
Chris@10 289 V T2Q, T20, T3N, T3T, T2J, T2C, T2W, T2V, T3O, T2f, T3U, T2T;
Chris@10 290 {
Chris@10 291 V T2R, T27, T2e, T2S, T13, T3F;
Chris@10 292 T2Q = VADD(T1U, T1Z);
Chris@10 293 T20 = VSUB(T1U, T1Z);
Chris@10 294 T3N = VSUB(T3L, T3M);
Chris@10 295 T3T = VADD(T3M, T3L);
Chris@10 296 T13 = VADD(TL, T12);
Chris@10 297 T3F = VSUB(T12, TL);
Chris@10 298 {
Chris@10 299 V T3v, T39, T3o, T3k;
Chris@10 300 T3v = VADD(T37, T38);
Chris@10 301 T39 = VSUB(T37, T38);
Chris@10 302 T3o = VADD(T3g, T3j);
Chris@10 303 T3k = VSUB(T3g, T3j);
Chris@10 304 {
Chris@10 305 V T3H, T3J, T14, T3q;
Chris@10 306 T3H = VADD(T3F, T3G);
Chris@10 307 T3J = VSUB(T3G, T3F);
Chris@10 308 T14 = VADD(Ty, T13);
Chris@10 309 T3q = VSUB(Ty, T13);
Chris@10 310 {
Chris@10 311 V T3a, T3m, T3C, T3E;
Chris@10 312 T3a = VADD(T36, T39);
Chris@10 313 T3m = VSUB(T36, T39);
Chris@10 314 T3C = VADD(T3v, T3B);
Chris@10 315 T3E = VSUB(T3B, T3v);
Chris@10 316 {
Chris@10 317 V T3I, T3p, T3l, T3K;
Chris@10 318 T3I = VADD(T3n, T3o);
Chris@10 319 T3p = VSUB(T3n, T3o);
Chris@10 320 T3l = VADD(T3f, T3k);
Chris@10 321 T3K = VSUB(T3k, T3f);
Chris@10 322 ST(&(ri[WS(rs, 4)]), VADD(T3q, T3t), ms, &(ri[0]));
Chris@10 323 ST(&(ri[WS(rs, 12)]), VSUB(T3q, T3t), ms, &(ri[0]));
Chris@10 324 ST(&(ri[0]), VADD(T14, T1T), ms, &(ri[0]));
Chris@10 325 ST(&(ri[WS(rs, 8)]), VSUB(T14, T1T), ms, &(ri[0]));
Chris@10 326 ST(&(ii[WS(rs, 4)]), VADD(T3D, T3E), ms, &(ii[0]));
Chris@10 327 ST(&(ii[WS(rs, 12)]), VSUB(T3E, T3D), ms, &(ii[0]));
Chris@10 328 ST(&(ii[0]), VADD(T3u, T3C), ms, &(ii[0]));
Chris@10 329 ST(&(ii[WS(rs, 8)]), VSUB(T3C, T3u), ms, &(ii[0]));
Chris@10 330 ST(&(ri[WS(rs, 6)]), VFMA(LDK(KP707106781), T3p, T3m), ms, &(ri[0]));
Chris@10 331 ST(&(ri[WS(rs, 14)]), VFNMS(LDK(KP707106781), T3p, T3m), ms, &(ri[0]));
Chris@10 332 ST(&(ii[WS(rs, 10)]), VFNMS(LDK(KP707106781), T3I, T3H), ms, &(ii[0]));
Chris@10 333 ST(&(ii[WS(rs, 2)]), VFMA(LDK(KP707106781), T3I, T3H), ms, &(ii[0]));
Chris@10 334 ST(&(ii[WS(rs, 14)]), VFNMS(LDK(KP707106781), T3K, T3J), ms, &(ii[0]));
Chris@10 335 ST(&(ii[WS(rs, 6)]), VFMA(LDK(KP707106781), T3K, T3J), ms, &(ii[0]));
Chris@10 336 ST(&(ri[WS(rs, 2)]), VFMA(LDK(KP707106781), T3l, T3a), ms, &(ri[0]));
Chris@10 337 ST(&(ri[WS(rs, 10)]), VFNMS(LDK(KP707106781), T3l, T3a), ms, &(ri[0]));
Chris@10 338 T2R = VADD(T26, T25);
Chris@10 339 T27 = VSUB(T25, T26);
Chris@10 340 T2e = VADD(T28, T2d);
Chris@10 341 T2S = VSUB(T28, T2d);
Chris@10 342 }
Chris@10 343 }
Chris@10 344 }
Chris@10 345 }
Chris@10 346 {
Chris@10 347 V T2Y, T2Z, T2n, T2u;
Chris@10 348 T2J = VSUB(T2D, T2I);
Chris@10 349 T2Y = VADD(T2D, T2I);
Chris@10 350 T2Z = VSUB(T2A, T2B);
Chris@10 351 T2C = VADD(T2A, T2B);
Chris@10 352 T2W = VSUB(T2l, T2m);
Chris@10 353 T2n = VADD(T2l, T2m);
Chris@10 354 T2u = VSUB(T2o, T2t);
Chris@10 355 T2V = VADD(T2o, T2t);
Chris@10 356 T3O = VADD(T27, T2e);
Chris@10 357 T2f = VSUB(T27, T2e);
Chris@10 358 T34 = VFMA(LDK(KP414213562), T2Y, T2Z);
Chris@10 359 T30 = VFNMS(LDK(KP414213562), T2Z, T2Y);
Chris@10 360 T3U = VSUB(T2S, T2R);
Chris@10 361 T2T = VADD(T2R, T2S);
Chris@10 362 T2N = VFNMS(LDK(KP414213562), T2n, T2u);
Chris@10 363 T2v = VFMA(LDK(KP414213562), T2u, T2n);
Chris@10 364 }
Chris@10 365 }
Chris@10 366 T2M = VFNMS(LDK(KP707106781), T2f, T20);
Chris@10 367 T2g = VFMA(LDK(KP707106781), T2f, T20);
Chris@10 368 T3V = VFMA(LDK(KP707106781), T3U, T3T);
Chris@10 369 T3X = VFNMS(LDK(KP707106781), T3U, T3T);
Chris@10 370 T32 = VFNMS(LDK(KP707106781), T2T, T2Q);
Chris@10 371 T2U = VFMA(LDK(KP707106781), T2T, T2Q);
Chris@10 372 T33 = VFNMS(LDK(KP414213562), T2V, T2W);
Chris@10 373 T2X = VFMA(LDK(KP414213562), T2W, T2V);
Chris@10 374 T2O = VFMA(LDK(KP414213562), T2C, T2J);
Chris@10 375 T2K = VFNMS(LDK(KP414213562), T2J, T2C);
Chris@10 376 T3P = VFMA(LDK(KP707106781), T3O, T3N);
Chris@10 377 T3R = VFNMS(LDK(KP707106781), T3O, T3N);
Chris@10 378 }
Chris@10 379 }
Chris@10 380 }
Chris@10 381 }
Chris@10 382 }
Chris@10 383 {
Chris@10 384 V T3Q, T35, T31, T3S;
Chris@10 385 T3Q = VADD(T33, T34);
Chris@10 386 T35 = VSUB(T33, T34);
Chris@10 387 T31 = VADD(T2X, T30);
Chris@10 388 T3S = VSUB(T30, T2X);
Chris@10 389 {
Chris@10 390 V T3W, T2P, T2L, T3Y;
Chris@10 391 T3W = VSUB(T2O, T2N);
Chris@10 392 T2P = VADD(T2N, T2O);
Chris@10 393 T2L = VSUB(T2v, T2K);
Chris@10 394 T3Y = VADD(T2v, T2K);
Chris@10 395 ST(&(ri[WS(rs, 5)]), VFMA(LDK(KP923879532), T35, T32), ms, &(ri[WS(rs, 1)]));
Chris@10 396 ST(&(ri[WS(rs, 13)]), VFNMS(LDK(KP923879532), T35, T32), ms, &(ri[WS(rs, 1)]));
Chris@10 397 ST(&(ii[WS(rs, 9)]), VFNMS(LDK(KP923879532), T3Q, T3P), ms, &(ii[WS(rs, 1)]));
Chris@10 398 ST(&(ii[WS(rs, 1)]), VFMA(LDK(KP923879532), T3Q, T3P), ms, &(ii[WS(rs, 1)]));
Chris@10 399 ST(&(ii[WS(rs, 13)]), VFNMS(LDK(KP923879532), T3S, T3R), ms, &(ii[WS(rs, 1)]));
Chris@10 400 ST(&(ii[WS(rs, 5)]), VFMA(LDK(KP923879532), T3S, T3R), ms, &(ii[WS(rs, 1)]));
Chris@10 401 ST(&(ri[WS(rs, 1)]), VFMA(LDK(KP923879532), T31, T2U), ms, &(ri[WS(rs, 1)]));
Chris@10 402 ST(&(ri[WS(rs, 9)]), VFNMS(LDK(KP923879532), T31, T2U), ms, &(ri[WS(rs, 1)]));
Chris@10 403 ST(&(ri[WS(rs, 15)]), VFMA(LDK(KP923879532), T2P, T2M), ms, &(ri[WS(rs, 1)]));
Chris@10 404 ST(&(ri[WS(rs, 7)]), VFNMS(LDK(KP923879532), T2P, T2M), ms, &(ri[WS(rs, 1)]));
Chris@10 405 ST(&(ii[WS(rs, 11)]), VFNMS(LDK(KP923879532), T3W, T3V), ms, &(ii[WS(rs, 1)]));
Chris@10 406 ST(&(ii[WS(rs, 3)]), VFMA(LDK(KP923879532), T3W, T3V), ms, &(ii[WS(rs, 1)]));
Chris@10 407 ST(&(ii[WS(rs, 15)]), VFMA(LDK(KP923879532), T3Y, T3X), ms, &(ii[WS(rs, 1)]));
Chris@10 408 ST(&(ii[WS(rs, 7)]), VFNMS(LDK(KP923879532), T3Y, T3X), ms, &(ii[WS(rs, 1)]));
Chris@10 409 ST(&(ri[WS(rs, 3)]), VFMA(LDK(KP923879532), T2L, T2g), ms, &(ri[WS(rs, 1)]));
Chris@10 410 ST(&(ri[WS(rs, 11)]), VFNMS(LDK(KP923879532), T2L, T2g), ms, &(ri[WS(rs, 1)]));
Chris@10 411 }
Chris@10 412 }
Chris@10 413 }
Chris@10 414 }
Chris@10 415 VLEAVE();
Chris@10 416 }
Chris@10 417
Chris@10 418 static const tw_instr twinstr[] = {
Chris@10 419 VTW(0, 1),
Chris@10 420 VTW(0, 3),
Chris@10 421 VTW(0, 9),
Chris@10 422 VTW(0, 15),
Chris@10 423 {TW_NEXT, (2 * VL), 0}
Chris@10 424 };
Chris@10 425
Chris@10 426 static const ct_desc desc = { 16, XSIMD_STRING("t2sv_16"), twinstr, &GENUS, {104, 42, 92, 0}, 0, 0, 0 };
Chris@10 427
Chris@10 428 void XSIMD(codelet_t2sv_16) (planner *p) {
Chris@10 429 X(kdft_dit_register) (p, t2sv_16, &desc);
Chris@10 430 }
Chris@10 431 #else /* HAVE_FMA */
Chris@10 432
Chris@10 433 /* Generated by: ../../../genfft/gen_twiddle.native -simd -compact -variables 4 -pipeline-latency 8 -twiddle-log3 -precompute-twiddles -n 16 -name t2sv_16 -include ts.h */
Chris@10 434
Chris@10 435 /*
Chris@10 436 * This function contains 196 FP additions, 108 FP multiplications,
Chris@10 437 * (or, 156 additions, 68 multiplications, 40 fused multiply/add),
Chris@10 438 * 82 stack variables, 3 constants, and 64 memory accesses
Chris@10 439 */
Chris@10 440 #include "ts.h"
Chris@10 441
Chris@10 442 static void t2sv_16(R *ri, R *ii, const R *W, stride rs, INT mb, INT me, INT ms)
Chris@10 443 {
Chris@10 444 DVK(KP382683432, +0.382683432365089771728459984030398866761344562);
Chris@10 445 DVK(KP923879532, +0.923879532511286756128183189396788286822416626);
Chris@10 446 DVK(KP707106781, +0.707106781186547524400844362104849039284835938);
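              /* (descriptive note) The constants above are, in order:
                 KP382683432 = sin(pi/8), KP923879532 = cos(pi/8),
                 KP707106781 = cos(pi/4) = 1/sqrt(2). */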
Chris@10 447 {
Chris@10 448 INT m;
Chris@10 449 for (m = mb, W = W + (mb * 8); m < me; m = m + (2 * VL), ri = ri + ((2 * VL) * ms), ii = ii + ((2 * VL) * ms), W = W + ((2 * VL) * 8), MAKE_VOLATILE_STRIDE(32, rs)) {
Chris@10 450 V T2, T5, Tg, Ti, Tk, To, TE, TC, T6, T3, T8, TW, TJ, Tt, TU;
Chris@10 451 V Tc, Tx, TH, TN, TO, TP, TR, T1f, T1k, T1b, T1i, T1y, T1H, T1u, T1F;
Chris@10 452 {
Chris@10 453 V T7, Tv, Ta, Ts, T4, Tw, Tb, Tr;
Chris@10 454 {
Chris@10 455 V Th, Tn, Tj, Tm;
Chris@10 456 T2 = LDW(&(W[0]));
Chris@10 457 T5 = LDW(&(W[TWVL * 1]));
Chris@10 458 Tg = LDW(&(W[TWVL * 2]));
Chris@10 459 Ti = LDW(&(W[TWVL * 3]));
Chris@10 460 Th = VMUL(T2, Tg);
Chris@10 461 Tn = VMUL(T5, Tg);
Chris@10 462 Tj = VMUL(T5, Ti);
Chris@10 463 Tm = VMUL(T2, Ti);
Chris@10 464 Tk = VSUB(Th, Tj);
Chris@10 465 To = VADD(Tm, Tn);
Chris@10 466 TE = VSUB(Tm, Tn);
Chris@10 467 TC = VADD(Th, Tj);
Chris@10 468 T6 = LDW(&(W[TWVL * 5]));
Chris@10 469 T7 = VMUL(T5, T6);
Chris@10 470 Tv = VMUL(Tg, T6);
Chris@10 471 Ta = VMUL(T2, T6);
Chris@10 472 Ts = VMUL(Ti, T6);
Chris@10 473 T3 = LDW(&(W[TWVL * 4]));
Chris@10 474 T4 = VMUL(T2, T3);
Chris@10 475 Tw = VMUL(Ti, T3);
Chris@10 476 Tb = VMUL(T5, T3);
Chris@10 477 Tr = VMUL(Tg, T3);
Chris@10 478 }
Chris@10 479 T8 = VADD(T4, T7);
Chris@10 480 TW = VSUB(Tv, Tw);
Chris@10 481 TJ = VADD(Ta, Tb);
Chris@10 482 Tt = VSUB(Tr, Ts);
Chris@10 483 TU = VADD(Tr, Ts);
Chris@10 484 Tc = VSUB(Ta, Tb);
Chris@10 485 Tx = VADD(Tv, Tw);
Chris@10 486 TH = VSUB(T4, T7);
Chris@10 487 TN = LDW(&(W[TWVL * 6]));
Chris@10 488 TO = LDW(&(W[TWVL * 7]));
Chris@10 489 TP = VFMA(T2, TN, VMUL(T5, TO));
Chris@10 490 TR = VFNMS(T5, TN, VMUL(T2, TO));
Chris@10 491 {
Chris@10 492 V T1d, T1e, T19, T1a;
Chris@10 493 T1d = VMUL(Tk, T6);
Chris@10 494 T1e = VMUL(To, T3);
Chris@10 495 T1f = VSUB(T1d, T1e);
Chris@10 496 T1k = VADD(T1d, T1e);
Chris@10 497 T19 = VMUL(Tk, T3);
Chris@10 498 T1a = VMUL(To, T6);
Chris@10 499 T1b = VADD(T19, T1a);
Chris@10 500 T1i = VSUB(T19, T1a);
Chris@10 501 }
Chris@10 502 {
Chris@10 503 V T1w, T1x, T1s, T1t;
Chris@10 504 T1w = VMUL(TC, T6);
Chris@10 505 T1x = VMUL(TE, T3);
Chris@10 506 T1y = VSUB(T1w, T1x);
Chris@10 507 T1H = VADD(T1w, T1x);
Chris@10 508 T1s = VMUL(TC, T3);
Chris@10 509 T1t = VMUL(TE, T6);
Chris@10 510 T1u = VADD(T1s, T1t);
Chris@10 511 T1F = VSUB(T1s, T1t);
Chris@10 512 }
Chris@10 513 }
Chris@10 514 {
Chris@10 515 V Tf, T3r, T1N, T3e, TA, T3s, T1Q, T3b, TM, T2M, T1W, T2w, TZ, T2N, T21;
Chris@10 516 V T2x, T1B, T1K, T2V, T2W, T2X, T2Y, T2j, T2D, T2o, T2E, T18, T1n, T2Q, T2R;
Chris@10 517 V T2S, T2T, T28, T2A, T2d, T2B;
Chris@10 518 {
Chris@10 519 V T1, T3d, Te, T3c, T9, Td;
Chris@10 520 T1 = LD(&(ri[0]), ms, &(ri[0]));
Chris@10 521 T3d = LD(&(ii[0]), ms, &(ii[0]));
Chris@10 522 T9 = LD(&(ri[WS(rs, 8)]), ms, &(ri[0]));
Chris@10 523 Td = LD(&(ii[WS(rs, 8)]), ms, &(ii[0]));
Chris@10 524 Te = VFMA(T8, T9, VMUL(Tc, Td));
Chris@10 525 T3c = VFNMS(Tc, T9, VMUL(T8, Td));
Chris@10 526 Tf = VADD(T1, Te);
Chris@10 527 T3r = VSUB(T3d, T3c);
Chris@10 528 T1N = VSUB(T1, Te);
Chris@10 529 T3e = VADD(T3c, T3d);
Chris@10 530 }
Chris@10 531 {
Chris@10 532 V Tq, T1O, Tz, T1P;
Chris@10 533 {
Chris@10 534 V Tl, Tp, Tu, Ty;
Chris@10 535 Tl = LD(&(ri[WS(rs, 4)]), ms, &(ri[0]));
Chris@10 536 Tp = LD(&(ii[WS(rs, 4)]), ms, &(ii[0]));
Chris@10 537 Tq = VFMA(Tk, Tl, VMUL(To, Tp));
Chris@10 538 T1O = VFNMS(To, Tl, VMUL(Tk, Tp));
Chris@10 539 Tu = LD(&(ri[WS(rs, 12)]), ms, &(ri[0]));
Chris@10 540 Ty = LD(&(ii[WS(rs, 12)]), ms, &(ii[0]));
Chris@10 541 Tz = VFMA(Tt, Tu, VMUL(Tx, Ty));
Chris@10 542 T1P = VFNMS(Tx, Tu, VMUL(Tt, Ty));
Chris@10 543 }
Chris@10 544 TA = VADD(Tq, Tz);
Chris@10 545 T3s = VSUB(Tq, Tz);
Chris@10 546 T1Q = VSUB(T1O, T1P);
Chris@10 547 T3b = VADD(T1O, T1P);
Chris@10 548 }
Chris@10 549 {
Chris@10 550 V TG, T1S, TL, T1T, T1U, T1V;
Chris@10 551 {
Chris@10 552 V TD, TF, TI, TK;
Chris@10 553 TD = LD(&(ri[WS(rs, 2)]), ms, &(ri[0]));
Chris@10 554 TF = LD(&(ii[WS(rs, 2)]), ms, &(ii[0]));
Chris@10 555 TG = VFMA(TC, TD, VMUL(TE, TF));
Chris@10 556 T1S = VFNMS(TE, TD, VMUL(TC, TF));
Chris@10 557 TI = LD(&(ri[WS(rs, 10)]), ms, &(ri[0]));
Chris@10 558 TK = LD(&(ii[WS(rs, 10)]), ms, &(ii[0]));
Chris@10 559 TL = VFMA(TH, TI, VMUL(TJ, TK));
Chris@10 560 T1T = VFNMS(TJ, TI, VMUL(TH, TK));
Chris@10 561 }
Chris@10 562 TM = VADD(TG, TL);
Chris@10 563 T2M = VADD(T1S, T1T);
Chris@10 564 T1U = VSUB(T1S, T1T);
Chris@10 565 T1V = VSUB(TG, TL);
Chris@10 566 T1W = VSUB(T1U, T1V);
Chris@10 567 T2w = VADD(T1V, T1U);
Chris@10 568 }
Chris@10 569 {
Chris@10 570 V TT, T1Y, TY, T1Z, T1X, T20;
Chris@10 571 {
Chris@10 572 V TQ, TS, TV, TX;
Chris@10 573 TQ = LD(&(ri[WS(rs, 14)]), ms, &(ri[0]));
Chris@10 574 TS = LD(&(ii[WS(rs, 14)]), ms, &(ii[0]));
Chris@10 575 TT = VFMA(TP, TQ, VMUL(TR, TS));
Chris@10 576 T1Y = VFNMS(TR, TQ, VMUL(TP, TS));
Chris@10 577 TV = LD(&(ri[WS(rs, 6)]), ms, &(ri[0]));
Chris@10 578 TX = LD(&(ii[WS(rs, 6)]), ms, &(ii[0]));
Chris@10 579 TY = VFMA(TU, TV, VMUL(TW, TX));
Chris@10 580 T1Z = VFNMS(TW, TV, VMUL(TU, TX));
Chris@10 581 }
Chris@10 582 TZ = VADD(TT, TY);
Chris@10 583 T2N = VADD(T1Y, T1Z);
Chris@10 584 T1X = VSUB(TT, TY);
Chris@10 585 T20 = VSUB(T1Y, T1Z);
Chris@10 586 T21 = VADD(T1X, T20);
Chris@10 587 T2x = VSUB(T1X, T20);
Chris@10 588 }
Chris@10 589 {
Chris@10 590 V T1r, T2k, T1J, T2h, T1A, T2l, T1E, T2g;
Chris@10 591 {
Chris@10 592 V T1p, T1q, T1G, T1I;
Chris@10 593 T1p = LD(&(ri[WS(rs, 15)]), ms, &(ri[WS(rs, 1)]));
Chris@10 594 T1q = LD(&(ii[WS(rs, 15)]), ms, &(ii[WS(rs, 1)]));
Chris@10 595 T1r = VFMA(TN, T1p, VMUL(TO, T1q));
Chris@10 596 T2k = VFNMS(TO, T1p, VMUL(TN, T1q));
Chris@10 597 T1G = LD(&(ri[WS(rs, 11)]), ms, &(ri[WS(rs, 1)]));
Chris@10 598 T1I = LD(&(ii[WS(rs, 11)]), ms, &(ii[WS(rs, 1)]));
Chris@10 599 T1J = VFMA(T1F, T1G, VMUL(T1H, T1I));
Chris@10 600 T2h = VFNMS(T1H, T1G, VMUL(T1F, T1I));
Chris@10 601 }
Chris@10 602 {
Chris@10 603 V T1v, T1z, T1C, T1D;
Chris@10 604 T1v = LD(&(ri[WS(rs, 7)]), ms, &(ri[WS(rs, 1)]));
Chris@10 605 T1z = LD(&(ii[WS(rs, 7)]), ms, &(ii[WS(rs, 1)]));
Chris@10 606 T1A = VFMA(T1u, T1v, VMUL(T1y, T1z));
Chris@10 607 T2l = VFNMS(T1y, T1v, VMUL(T1u, T1z));
Chris@10 608 T1C = LD(&(ri[WS(rs, 3)]), ms, &(ri[WS(rs, 1)]));
Chris@10 609 T1D = LD(&(ii[WS(rs, 3)]), ms, &(ii[WS(rs, 1)]));
Chris@10 610 T1E = VFMA(Tg, T1C, VMUL(Ti, T1D));
Chris@10 611 T2g = VFNMS(Ti, T1C, VMUL(Tg, T1D));
Chris@10 612 }
Chris@10 613 T1B = VADD(T1r, T1A);
Chris@10 614 T1K = VADD(T1E, T1J);
Chris@10 615 T2V = VSUB(T1B, T1K);
Chris@10 616 T2W = VADD(T2k, T2l);
Chris@10 617 T2X = VADD(T2g, T2h);
Chris@10 618 T2Y = VSUB(T2W, T2X);
Chris@10 619 {
Chris@10 620 V T2f, T2i, T2m, T2n;
Chris@10 621 T2f = VSUB(T1r, T1A);
Chris@10 622 T2i = VSUB(T2g, T2h);
Chris@10 623 T2j = VSUB(T2f, T2i);
Chris@10 624 T2D = VADD(T2f, T2i);
Chris@10 625 T2m = VSUB(T2k, T2l);
Chris@10 626 T2n = VSUB(T1E, T1J);
Chris@10 627 T2o = VADD(T2m, T2n);
Chris@10 628 T2E = VSUB(T2m, T2n);
Chris@10 629 }
Chris@10 630 }
Chris@10 631 {
Chris@10 632 V T14, T24, T1m, T2b, T17, T25, T1h, T2a;
Chris@10 633 {
Chris@10 634 V T12, T13, T1j, T1l;
Chris@10 635 T12 = LD(&(ri[WS(rs, 1)]), ms, &(ri[WS(rs, 1)]));
Chris@10 636 T13 = LD(&(ii[WS(rs, 1)]), ms, &(ii[WS(rs, 1)]));
Chris@10 637 T14 = VFMA(T2, T12, VMUL(T5, T13));
Chris@10 638 T24 = VFNMS(T5, T12, VMUL(T2, T13));
Chris@10 639 T1j = LD(&(ri[WS(rs, 13)]), ms, &(ri[WS(rs, 1)]));
Chris@10 640 T1l = LD(&(ii[WS(rs, 13)]), ms, &(ii[WS(rs, 1)]));
Chris@10 641 T1m = VFMA(T1i, T1j, VMUL(T1k, T1l));
Chris@10 642 T2b = VFNMS(T1k, T1j, VMUL(T1i, T1l));
Chris@10 643 }
Chris@10 644 {
Chris@10 645 V T15, T16, T1c, T1g;
Chris@10 646 T15 = LD(&(ri[WS(rs, 9)]), ms, &(ri[WS(rs, 1)]));
Chris@10 647 T16 = LD(&(ii[WS(rs, 9)]), ms, &(ii[WS(rs, 1)]));
Chris@10 648 T17 = VFMA(T3, T15, VMUL(T6, T16));
Chris@10 649 T25 = VFNMS(T6, T15, VMUL(T3, T16));
Chris@10 650 T1c = LD(&(ri[WS(rs, 5)]), ms, &(ri[WS(rs, 1)]));
Chris@10 651 T1g = LD(&(ii[WS(rs, 5)]), ms, &(ii[WS(rs, 1)]));
Chris@10 652 T1h = VFMA(T1b, T1c, VMUL(T1f, T1g));
Chris@10 653 T2a = VFNMS(T1f, T1c, VMUL(T1b, T1g));
Chris@10 654 }
Chris@10 655 T18 = VADD(T14, T17);
Chris@10 656 T1n = VADD(T1h, T1m);
Chris@10 657 T2Q = VSUB(T18, T1n);
Chris@10 658 T2R = VADD(T24, T25);
Chris@10 659 T2S = VADD(T2a, T2b);
Chris@10 660 T2T = VSUB(T2R, T2S);
Chris@10 661 {
Chris@10 662 V T26, T27, T29, T2c;
Chris@10 663 T26 = VSUB(T24, T25);
Chris@10 664 T27 = VSUB(T1h, T1m);
Chris@10 665 T28 = VADD(T26, T27);
Chris@10 666 T2A = VSUB(T26, T27);
Chris@10 667 T29 = VSUB(T14, T17);
Chris@10 668 T2c = VSUB(T2a, T2b);
Chris@10 669 T2d = VSUB(T29, T2c);
Chris@10 670 T2B = VADD(T29, T2c);
Chris@10 671 }
Chris@10 672 }
Chris@10 673 {
Chris@10 674 V T23, T2r, T3A, T3C, T2q, T3B, T2u, T3x;
Chris@10 675 {
Chris@10 676 V T1R, T22, T3y, T3z;
Chris@10 677 T1R = VSUB(T1N, T1Q);
Chris@10 678 T22 = VMUL(LDK(KP707106781), VSUB(T1W, T21));
Chris@10 679 T23 = VADD(T1R, T22);
Chris@10 680 T2r = VSUB(T1R, T22);
Chris@10 681 T3y = VMUL(LDK(KP707106781), VSUB(T2x, T2w));
Chris@10 682 T3z = VADD(T3s, T3r);
Chris@10 683 T3A = VADD(T3y, T3z);
Chris@10 684 T3C = VSUB(T3z, T3y);
Chris@10 685 }
Chris@10 686 {
Chris@10 687 V T2e, T2p, T2s, T2t;
Chris@10 688 T2e = VFMA(LDK(KP923879532), T28, VMUL(LDK(KP382683432), T2d));
Chris@10 689 T2p = VFNMS(LDK(KP923879532), T2o, VMUL(LDK(KP382683432), T2j));
Chris@10 690 T2q = VADD(T2e, T2p);
Chris@10 691 T3B = VSUB(T2p, T2e);
Chris@10 692 T2s = VFNMS(LDK(KP923879532), T2d, VMUL(LDK(KP382683432), T28));
Chris@10 693 T2t = VFMA(LDK(KP382683432), T2o, VMUL(LDK(KP923879532), T2j));
Chris@10 694 T2u = VSUB(T2s, T2t);
Chris@10 695 T3x = VADD(T2s, T2t);
Chris@10 696 }
Chris@10 697 ST(&(ri[WS(rs, 11)]), VSUB(T23, T2q), ms, &(ri[WS(rs, 1)]));
Chris@10 698 ST(&(ii[WS(rs, 11)]), VSUB(T3A, T3x), ms, &(ii[WS(rs, 1)]));
Chris@10 699 ST(&(ri[WS(rs, 3)]), VADD(T23, T2q), ms, &(ri[WS(rs, 1)]));
Chris@10 700 ST(&(ii[WS(rs, 3)]), VADD(T3x, T3A), ms, &(ii[WS(rs, 1)]));
Chris@10 701 ST(&(ri[WS(rs, 15)]), VSUB(T2r, T2u), ms, &(ri[WS(rs, 1)]));
Chris@10 702 ST(&(ii[WS(rs, 15)]), VSUB(T3C, T3B), ms, &(ii[WS(rs, 1)]));
Chris@10 703 ST(&(ri[WS(rs, 7)]), VADD(T2r, T2u), ms, &(ri[WS(rs, 1)]));
Chris@10 704 ST(&(ii[WS(rs, 7)]), VADD(T3B, T3C), ms, &(ii[WS(rs, 1)]));
Chris@10 705 }
Chris@10 706 {
Chris@10 707 V T2P, T31, T3m, T3o, T30, T3n, T34, T3j;
Chris@10 708 {
Chris@10 709 V T2L, T2O, T3k, T3l;
Chris@10 710 T2L = VSUB(Tf, TA);
Chris@10 711 T2O = VSUB(T2M, T2N);
Chris@10 712 T2P = VADD(T2L, T2O);
Chris@10 713 T31 = VSUB(T2L, T2O);
Chris@10 714 T3k = VSUB(TZ, TM);
Chris@10 715 T3l = VSUB(T3e, T3b);
Chris@10 716 T3m = VADD(T3k, T3l);
Chris@10 717 T3o = VSUB(T3l, T3k);
Chris@10 718 }
Chris@10 719 {
Chris@10 720 V T2U, T2Z, T32, T33;
Chris@10 721 T2U = VADD(T2Q, T2T);
Chris@10 722 T2Z = VSUB(T2V, T2Y);
Chris@10 723 T30 = VMUL(LDK(KP707106781), VADD(T2U, T2Z));
Chris@10 724 T3n = VMUL(LDK(KP707106781), VSUB(T2Z, T2U));
Chris@10 725 T32 = VSUB(T2T, T2Q);
Chris@10 726 T33 = VADD(T2V, T2Y);
Chris@10 727 T34 = VMUL(LDK(KP707106781), VSUB(T32, T33));
Chris@10 728 T3j = VMUL(LDK(KP707106781), VADD(T32, T33));
Chris@10 729 }
Chris@10 730 ST(&(ri[WS(rs, 10)]), VSUB(T2P, T30), ms, &(ri[0]));
Chris@10 731 ST(&(ii[WS(rs, 10)]), VSUB(T3m, T3j), ms, &(ii[0]));
Chris@10 732 ST(&(ri[WS(rs, 2)]), VADD(T2P, T30), ms, &(ri[0]));
Chris@10 733 ST(&(ii[WS(rs, 2)]), VADD(T3j, T3m), ms, &(ii[0]));
Chris@10 734 ST(&(ri[WS(rs, 14)]), VSUB(T31, T34), ms, &(ri[0]));
Chris@10 735 ST(&(ii[WS(rs, 14)]), VSUB(T3o, T3n), ms, &(ii[0]));
Chris@10 736 ST(&(ri[WS(rs, 6)]), VADD(T31, T34), ms, &(ri[0]));
Chris@10 737 ST(&(ii[WS(rs, 6)]), VADD(T3n, T3o), ms, &(ii[0]));
Chris@10 738 }
Chris@10 739 {
Chris@10 740 V T2z, T2H, T3u, T3w, T2G, T3v, T2K, T3p;
Chris@10 741 {
Chris@10 742 V T2v, T2y, T3q, T3t;
Chris@10 743 T2v = VADD(T1N, T1Q);
Chris@10 744 T2y = VMUL(LDK(KP707106781), VADD(T2w, T2x));
Chris@10 745 T2z = VADD(T2v, T2y);
Chris@10 746 T2H = VSUB(T2v, T2y);
Chris@10 747 T3q = VMUL(LDK(KP707106781), VADD(T1W, T21));
Chris@10 748 T3t = VSUB(T3r, T3s);
Chris@10 749 T3u = VADD(T3q, T3t);
Chris@10 750 T3w = VSUB(T3t, T3q);
Chris@10 751 }
Chris@10 752 {
Chris@10 753 V T2C, T2F, T2I, T2J;
Chris@10 754 T2C = VFMA(LDK(KP382683432), T2A, VMUL(LDK(KP923879532), T2B));
Chris@10 755 T2F = VFNMS(LDK(KP382683432), T2E, VMUL(LDK(KP923879532), T2D));
Chris@10 756 T2G = VADD(T2C, T2F);
Chris@10 757 T3v = VSUB(T2F, T2C);
Chris@10 758 T2I = VFNMS(LDK(KP382683432), T2B, VMUL(LDK(KP923879532), T2A));
Chris@10 759 T2J = VFMA(LDK(KP923879532), T2E, VMUL(LDK(KP382683432), T2D));
Chris@10 760 T2K = VSUB(T2I, T2J);
Chris@10 761 T3p = VADD(T2I, T2J);
Chris@10 762 }
Chris@10 763 ST(&(ri[WS(rs, 9)]), VSUB(T2z, T2G), ms, &(ri[WS(rs, 1)]));
Chris@10 764 ST(&(ii[WS(rs, 9)]), VSUB(T3u, T3p), ms, &(ii[WS(rs, 1)]));
Chris@10 765 ST(&(ri[WS(rs, 1)]), VADD(T2z, T2G), ms, &(ri[WS(rs, 1)]));
Chris@10 766 ST(&(ii[WS(rs, 1)]), VADD(T3p, T3u), ms, &(ii[WS(rs, 1)]));
Chris@10 767 ST(&(ri[WS(rs, 13)]), VSUB(T2H, T2K), ms, &(ri[WS(rs, 1)]));
Chris@10 768 ST(&(ii[WS(rs, 13)]), VSUB(T3w, T3v), ms, &(ii[WS(rs, 1)]));
Chris@10 769 ST(&(ri[WS(rs, 5)]), VADD(T2H, T2K), ms, &(ri[WS(rs, 1)]));
Chris@10 770 ST(&(ii[WS(rs, 5)]), VADD(T3v, T3w), ms, &(ii[WS(rs, 1)]));
Chris@10 771 }
Chris@10 772 {
Chris@10 773 V T11, T35, T3g, T3i, T1M, T3h, T38, T39;
Chris@10 774 {
Chris@10 775 V TB, T10, T3a, T3f;
Chris@10 776 TB = VADD(Tf, TA);
Chris@10 777 T10 = VADD(TM, TZ);
Chris@10 778 T11 = VADD(TB, T10);
Chris@10 779 T35 = VSUB(TB, T10);
Chris@10 780 T3a = VADD(T2M, T2N);
Chris@10 781 T3f = VADD(T3b, T3e);
Chris@10 782 T3g = VADD(T3a, T3f);
Chris@10 783 T3i = VSUB(T3f, T3a);
Chris@10 784 }
Chris@10 785 {
Chris@10 786 V T1o, T1L, T36, T37;
Chris@10 787 T1o = VADD(T18, T1n);
Chris@10 788 T1L = VADD(T1B, T1K);
Chris@10 789 T1M = VADD(T1o, T1L);
Chris@10 790 T3h = VSUB(T1L, T1o);
Chris@10 791 T36 = VADD(T2R, T2S);
Chris@10 792 T37 = VADD(T2W, T2X);
Chris@10 793 T38 = VSUB(T36, T37);
Chris@10 794 T39 = VADD(T36, T37);
Chris@10 795 }
Chris@10 796 ST(&(ri[WS(rs, 8)]), VSUB(T11, T1M), ms, &(ri[0]));
Chris@10 797 ST(&(ii[WS(rs, 8)]), VSUB(T3g, T39), ms, &(ii[0]));
Chris@10 798 ST(&(ri[0]), VADD(T11, T1M), ms, &(ri[0]));
Chris@10 799 ST(&(ii[0]), VADD(T39, T3g), ms, &(ii[0]));
Chris@10 800 ST(&(ri[WS(rs, 12)]), VSUB(T35, T38), ms, &(ri[0]));
Chris@10 801 ST(&(ii[WS(rs, 12)]), VSUB(T3i, T3h), ms, &(ii[0]));
Chris@10 802 ST(&(ri[WS(rs, 4)]), VADD(T35, T38), ms, &(ri[0]));
Chris@10 803 ST(&(ii[WS(rs, 4)]), VADD(T3h, T3i), ms, &(ii[0]));
Chris@10 804 }
Chris@10 805 }
Chris@10 806 }
Chris@10 807 }
Chris@10 808 VLEAVE();
Chris@10 809 }
Chris@10 810
Chris@10 811 static const tw_instr twinstr[] = {
Chris@10 812 VTW(0, 1),
Chris@10 813 VTW(0, 3),
Chris@10 814 VTW(0, 9),
Chris@10 815 VTW(0, 15),
Chris@10 816 {TW_NEXT, (2 * VL), 0}
Chris@10 817 };
Chris@10 818
Chris@10 819 static const ct_desc desc = { 16, XSIMD_STRING("t2sv_16"), twinstr, &GENUS, {156, 68, 40, 0}, 0, 0, 0 };
Chris@10 820
Chris@10 821 void XSIMD(codelet_t2sv_16) (planner *p) {
Chris@10 822 X(kdft_dit_register) (p, t2sv_16, &desc);
Chris@10 823 }
Chris@10 824 #endif /* HAVE_FMA */