annotate src/fftw-3.3.8/dft/simd/common/t3fv_32.c @ 83:ae30d91d2ffe

Replace these with versions built using an older toolset (so as to avoid ABI incompatibilities when linking on Ubuntu 14.04 for packaging purposes)
author Chris Cannam
date Fri, 07 Feb 2020 11:51:13 +0000
parents d0c2a83c1364
children
rev   line source
Chris@82 1 /*
Chris@82 2 * Copyright (c) 2003, 2007-14 Matteo Frigo
Chris@82 3 * Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
Chris@82 4 *
Chris@82 5 * This program is free software; you can redistribute it and/or modify
Chris@82 6 * it under the terms of the GNU General Public License as published by
Chris@82 7 * the Free Software Foundation; either version 2 of the License, or
Chris@82 8 * (at your option) any later version.
Chris@82 9 *
Chris@82 10 * This program is distributed in the hope that it will be useful,
Chris@82 11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
Chris@82 12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
Chris@82 13 * GNU General Public License for more details.
Chris@82 14 *
Chris@82 15 * You should have received a copy of the GNU General Public License
Chris@82 16 * along with this program; if not, write to the Free Software
Chris@82 17 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
Chris@82 18 *
Chris@82 19 */
Chris@82 20
Chris@82 21 /* This file was automatically generated --- DO NOT EDIT */
Chris@82 22 /* Generated on Thu May 24 08:05:51 EDT 2018 */
Chris@82 23
Chris@82 24 #include "dft/codelet-dft.h"
Chris@82 25
Chris@82 26 #if defined(ARCH_PREFERS_FMA) || defined(ISA_EXTENSION_PREFERS_FMA)
Chris@82 27
Chris@82 28 /* Generated by: ../../../genfft/gen_twiddle_c.native -fma -simd -compact -variables 4 -pipeline-latency 8 -twiddle-log3 -precompute-twiddles -no-generate-bytw -n 32 -name t3fv_32 -include dft/simd/t3f.h */
Chris@82 29
Chris@82 30 /*
Chris@82 31 * This function contains 244 FP additions, 214 FP multiplications,
Chris@82 32 * (or, 146 additions, 116 multiplications, 98 fused multiply/add),
Chris@82 33 * 90 stack variables, 7 constants, and 64 memory accesses
Chris@82 34 */
Chris@82 35 #include "dft/simd/t3f.h"
Chris@82 36
/*
 * t3fv_32 (FMA variant): generated size-32 twiddle codelet working on SIMD
 * vectors, in place on the array starting at ri.
 *
 * Per loop iteration it:
 *   1. loads four twiddle vectors from W (offsets 0, TWVL*2, TWVL*4, TWVL*6)
 *      and derives every other twiddle factor it needs from products of
 *      those four (VZMUL / VZMULJ);
 *   2. loads the 32 inputs x[WS(rs, k)], k = 0..31, multiplying each one
 *      except x[0] by its twiddle factor with VZMULJ;
 *   3. combines them with VADD/VSUB/VFMA/VFNMS butterflies and stores the 32
 *      results back to x[WS(rs, k)].
 *
 * Parameters (as used by the code below):
 *   ri      base pointer of the data; all loads and stores go through x = ri.
 *   ii      not referenced anywhere in this function body.
 *   W       twiddle table; offset by mb * ((TWVL / VL) * 8) before the loop
 *           and advanced by TWVL * 8 after every iteration.
 *   rs      stride handed to WS(rs, k) to address the 32 points.
 *   mb, me  the loop runs m = mb, mb + VL, ... while m < me.
 *   ms      x advances by VL * ms per iteration; also passed to LD/ST.
 *
 * NOTE(review): LD, ST, LDW, LDK, DVK, VZMUL, VZMULJ, VFMA, VFNMS, VFMAI,
 * VFNMSI, WS, VL, TWVL, MAKE_VOLATILE_STRIDE and VLEAVE are defined outside
 * this file (presumably via dft/simd/t3f.h); their exact semantics should be
 * confirmed there.
 */
Chris@82 37 static void t3fv_32(R *ri, R *ii, const R *W, stride rs, INT mb, INT me, INT ms)
Chris@82 38 {
/* Constants; the trailing notes give the closed form each literal equals numerically. */
Chris@82 39 DVK(KP831469612, +0.831469612302545237078788377617905756738560812); /* cos(3*pi/16) */
Chris@82 40 DVK(KP668178637, +0.668178637919298919997757686523080761552472251); /* tan(3*pi/16) */
Chris@82 41 DVK(KP980785280, +0.980785280403230449126182236134239036973933731); /* cos(pi/16) */
Chris@82 42 DVK(KP198912367, +0.198912367379658006911597622644676228597850501); /* tan(pi/16) */
Chris@82 43 DVK(KP923879532, +0.923879532511286756128183189396788286822416626); /* cos(pi/8) */
Chris@82 44 DVK(KP707106781, +0.707106781186547524400844362104849039284835938); /* 1/sqrt(2) */
Chris@82 45 DVK(KP414213562, +0.414213562373095048801688724209698078569671875); /* sqrt(2)-1 = tan(pi/8) */
Chris@82 46 {
Chris@82 47 INT m;
Chris@82 48 R *x;
Chris@82 49 x = ri;
/* One pass per group of VL transforms: m, x and W all advance together. */
Chris@82 50 for (m = mb, W = W + (mb * ((TWVL / VL) * 8)); m < me; m = m + VL, x = x + (VL * ms), W = W + (TWVL * 8), MAKE_VOLATILE_STRIDE(32, rs)) {
Chris@82 51 V T2, T5, T3, T4, Tc, T1C, TT, Tz, Tn, T6, TP, Tf, TK, T7, T8;
Chris@82 52 V Tv, T1w, T21, Tg, Tk, T1D, T1O, TC, T18, T12, T1t, TH, TL, TQ, T1m;
Chris@82 53 V T1c;
/*
 * Twiddle set-up. T2, T3, T5 and T7 are the four stored twiddles; below they
 * are applied directly to inputs 1, 3, 9 and 27 respectively. Every other
 * twiddle is a product of these (e.g. T4 = VZMULJ(T2, T3) is used for input 2
 * and Tc = VZMUL(T2, T3) for input 4).
 */
Chris@82 54 T2 = LDW(&(W[0]));
Chris@82 55 T5 = LDW(&(W[TWVL * 4]));
Chris@82 56 T3 = LDW(&(W[TWVL * 2]));
Chris@82 57 T4 = VZMULJ(T2, T3);
Chris@82 58 Tc = VZMUL(T2, T3);
Chris@82 59 T1C = VZMULJ(T2, T5);
Chris@82 60 TT = VZMULJ(T3, T5);
Chris@82 61 Tz = VZMUL(T2, T5);
Chris@82 62 Tn = VZMUL(T3, T5);
Chris@82 63 T6 = VZMUL(T4, T5);
Chris@82 64 TP = VZMULJ(Tc, T5);
Chris@82 65 Tf = VZMULJ(T4, T5);
Chris@82 66 TK = VZMUL(Tc, T5);
Chris@82 67 T7 = LDW(&(W[TWVL * 6]));
Chris@82 68 T8 = VZMULJ(T6, T7);
Chris@82 69 Tv = VZMULJ(T5, T7);
Chris@82 70 T1w = VZMULJ(Tn, T7);
Chris@82 71 T21 = VZMULJ(T3, T7);
Chris@82 72 Tg = VZMULJ(Tf, T7);
Chris@82 73 Tk = VZMUL(T2, T7);
Chris@82 74 T1D = VZMULJ(T1C, T7);
Chris@82 75 T1O = VZMULJ(Tc, T7);
Chris@82 76 TC = VZMULJ(T2, T7);
Chris@82 77 T18 = VZMULJ(TT, T7);
Chris@82 78 T12 = VZMULJ(Tz, T7);
Chris@82 79 T1t = VZMUL(Tc, T7);
Chris@82 80 TH = VZMUL(T3, T7);
Chris@82 81 TL = VZMULJ(TK, T7);
Chris@82 82 TQ = VZMULJ(TP, T7);
Chris@82 83 T1m = VZMULJ(T4, T7);
Chris@82 84 T1c = VZMUL(T4, T7);
Chris@82 85 {
/* Intermediate results shared between the input stages and the output stages. */
Chris@82 86 V Tb, T24, T2T, T3x, Tr, T25, T2W, T3K, TX, T28, T3g, T3z, TG, T27, T3j;
Chris@82 87 V T3y, T1N, T2v, T3a, T3G, T1V, T2w, T37, T3F, T1j, T2s, T33, T3D, T1r, T2t;
Chris@82 88 V T30, T3C;
/* Inputs 0, 8, 16, 24 (x[0] is used without a twiddle multiplication). */
Chris@82 89 {
Chris@82 90 V T1, T23, Ta, T20, T22, T9, T1Z, T2R, T2S;
Chris@82 91 T1 = LD(&(x[0]), ms, &(x[0]));
Chris@82 92 T22 = LD(&(x[WS(rs, 24)]), ms, &(x[0]));
Chris@82 93 T23 = VZMULJ(T21, T22);
Chris@82 94 T9 = LD(&(x[WS(rs, 16)]), ms, &(x[0]));
Chris@82 95 Ta = VZMULJ(T8, T9);
Chris@82 96 T1Z = LD(&(x[WS(rs, 8)]), ms, &(x[0]));
Chris@82 97 T20 = VZMULJ(T1C, T1Z);
Chris@82 98 Tb = VSUB(T1, Ta);
Chris@82 99 T24 = VSUB(T20, T23);
Chris@82 100 T2R = VADD(T1, Ta);
Chris@82 101 T2S = VADD(T20, T23);
Chris@82 102 T2T = VADD(T2R, T2S);
Chris@82 103 T3x = VSUB(T2R, T2S);
Chris@82 104 }
/* Inputs 4, 12, 20, 28. */
Chris@82 105 {
Chris@82 106 V Te, Tp, Ti, Tm;
Chris@82 107 {
Chris@82 108 V Td, To, Th, Tl;
Chris@82 109 Td = LD(&(x[WS(rs, 4)]), ms, &(x[0]));
Chris@82 110 Te = VZMULJ(Tc, Td);
Chris@82 111 To = LD(&(x[WS(rs, 12)]), ms, &(x[0]));
Chris@82 112 Tp = VZMULJ(Tn, To);
Chris@82 113 Th = LD(&(x[WS(rs, 20)]), ms, &(x[0]));
Chris@82 114 Ti = VZMULJ(Tg, Th);
Chris@82 115 Tl = LD(&(x[WS(rs, 28)]), ms, &(x[0]));
Chris@82 116 Tm = VZMULJ(Tk, Tl);
Chris@82 117 }
Chris@82 118 {
Chris@82 119 V Tj, Tq, T2U, T2V;
Chris@82 120 Tj = VSUB(Te, Ti);
Chris@82 121 Tq = VSUB(Tm, Tp);
Chris@82 122 Tr = VADD(Tj, Tq);
Chris@82 123 T25 = VSUB(Tq, Tj);
Chris@82 124 T2U = VADD(Te, Ti);
Chris@82 125 T2V = VADD(Tm, Tp);
Chris@82 126 T2W = VADD(T2U, T2V);
Chris@82 127 T3K = VSUB(T2V, T2U);
Chris@82 128 }
Chris@82 129 }
/* Inputs 6, 14, 22, 30. */
Chris@82 130 {
Chris@82 131 V TJ, TV, TN, TS;
Chris@82 132 {
Chris@82 133 V TI, TU, TM, TR;
Chris@82 134 TI = LD(&(x[WS(rs, 30)]), ms, &(x[0]));
Chris@82 135 TJ = VZMULJ(TH, TI);
Chris@82 136 TU = LD(&(x[WS(rs, 6)]), ms, &(x[0]));
Chris@82 137 TV = VZMULJ(TT, TU);
Chris@82 138 TM = LD(&(x[WS(rs, 14)]), ms, &(x[0]));
Chris@82 139 TN = VZMULJ(TL, TM);
Chris@82 140 TR = LD(&(x[WS(rs, 22)]), ms, &(x[0]));
Chris@82 141 TS = VZMULJ(TQ, TR);
Chris@82 142 }
Chris@82 143 {
Chris@82 144 V TO, TW, T3e, T3f;
Chris@82 145 TO = VSUB(TJ, TN);
Chris@82 146 TW = VSUB(TS, TV);
Chris@82 147 TX = VFNMS(LDK(KP414213562), TW, TO);
Chris@82 148 T28 = VFMA(LDK(KP414213562), TO, TW);
Chris@82 149 T3e = VADD(TJ, TN);
Chris@82 150 T3f = VADD(TV, TS);
Chris@82 151 T3g = VADD(T3e, T3f);
Chris@82 152 T3z = VSUB(T3e, T3f);
Chris@82 153 }
Chris@82 154 }
/* Inputs 2, 10, 18, 26. */
Chris@82 155 {
Chris@82 156 V Tu, TE, Tx, TB;
Chris@82 157 {
Chris@82 158 V Tt, TD, Tw, TA;
Chris@82 159 Tt = LD(&(x[WS(rs, 2)]), ms, &(x[0]));
Chris@82 160 Tu = VZMULJ(T4, Tt);
Chris@82 161 TD = LD(&(x[WS(rs, 26)]), ms, &(x[0]));
Chris@82 162 TE = VZMULJ(TC, TD);
Chris@82 163 Tw = LD(&(x[WS(rs, 18)]), ms, &(x[0]));
Chris@82 164 Tx = VZMULJ(Tv, Tw);
Chris@82 165 TA = LD(&(x[WS(rs, 10)]), ms, &(x[0]));
Chris@82 166 TB = VZMULJ(Tz, TA);
Chris@82 167 }
Chris@82 168 {
Chris@82 169 V Ty, TF, T3h, T3i;
Chris@82 170 Ty = VSUB(Tu, Tx);
Chris@82 171 TF = VSUB(TB, TE);
Chris@82 172 TG = VFNMS(LDK(KP414213562), TF, Ty);
Chris@82 173 T27 = VFMA(LDK(KP414213562), Ty, TF);
Chris@82 174 T3h = VADD(Tu, Tx);
Chris@82 175 T3i = VADD(TB, TE);
Chris@82 176 T3j = VADD(T3h, T3i);
Chris@82 177 T3y = VSUB(T3h, T3i);
Chris@82 178 }
Chris@82 179 }
/* Odd inputs 3, 7, 11, 15, 19, 23, 27, 31. */
Chris@82 180 {
Chris@82 181 V T1v, T1y, T1S, T1Q, T1I, T1K, T1L, T1B, T1F, T1G;
Chris@82 182 {
Chris@82 183 V T1u, T1x, T1R, T1P;
Chris@82 184 T1u = LD(&(x[WS(rs, 31)]), ms, &(x[WS(rs, 1)]));
Chris@82 185 T1v = VZMULJ(T1t, T1u);
Chris@82 186 T1x = LD(&(x[WS(rs, 15)]), ms, &(x[WS(rs, 1)]));
Chris@82 187 T1y = VZMULJ(T1w, T1x);
Chris@82 188 T1R = LD(&(x[WS(rs, 7)]), ms, &(x[WS(rs, 1)]));
Chris@82 189 T1S = VZMULJ(Tf, T1R);
Chris@82 190 T1P = LD(&(x[WS(rs, 23)]), ms, &(x[WS(rs, 1)]));
Chris@82 191 T1Q = VZMULJ(T1O, T1P);
Chris@82 192 {
Chris@82 193 V T1H, T1J, T1A, T1E;
Chris@82 194 T1H = LD(&(x[WS(rs, 27)]), ms, &(x[WS(rs, 1)]));
Chris@82 195 T1I = VZMULJ(T7, T1H);
Chris@82 196 T1J = LD(&(x[WS(rs, 11)]), ms, &(x[WS(rs, 1)]));
Chris@82 197 T1K = VZMULJ(T6, T1J);
Chris@82 198 T1L = VSUB(T1I, T1K);
Chris@82 199 T1A = LD(&(x[WS(rs, 3)]), ms, &(x[WS(rs, 1)]));
Chris@82 200 T1B = VZMULJ(T3, T1A);
Chris@82 201 T1E = LD(&(x[WS(rs, 19)]), ms, &(x[WS(rs, 1)]));
Chris@82 202 T1F = VZMULJ(T1D, T1E);
Chris@82 203 T1G = VSUB(T1B, T1F);
Chris@82 204 }
Chris@82 205 }
Chris@82 206 {
Chris@82 207 V T1z, T1M, T38, T39;
Chris@82 208 T1z = VSUB(T1v, T1y);
Chris@82 209 T1M = VADD(T1G, T1L);
Chris@82 210 T1N = VFMA(LDK(KP707106781), T1M, T1z);
Chris@82 211 T2v = VFNMS(LDK(KP707106781), T1M, T1z);
Chris@82 212 T38 = VADD(T1B, T1F);
Chris@82 213 T39 = VADD(T1I, T1K);
Chris@82 214 T3a = VADD(T38, T39);
Chris@82 215 T3G = VSUB(T39, T38);
Chris@82 216 }
Chris@82 217 {
Chris@82 218 V T1T, T1U, T35, T36;
Chris@82 219 T1T = VSUB(T1Q, T1S);
Chris@82 220 T1U = VSUB(T1L, T1G);
Chris@82 221 T1V = VFMA(LDK(KP707106781), T1U, T1T);
Chris@82 222 T2w = VFNMS(LDK(KP707106781), T1U, T1T);
Chris@82 223 T35 = VADD(T1v, T1y);
Chris@82 224 T36 = VADD(T1S, T1Q);
Chris@82 225 T37 = VADD(T35, T36);
Chris@82 226 T3F = VSUB(T35, T36);
Chris@82 227 }
Chris@82 228 }
/* Odd inputs 1, 5, 9, 13, 17, 21, 25, 29. */
Chris@82 229 {
Chris@82 230 V T11, T14, T1o, T1l, T1e, T1g, T1h, T17, T1a, T1b;
Chris@82 231 {
Chris@82 232 V T10, T13, T1n, T1k;
Chris@82 233 T10 = LD(&(x[WS(rs, 1)]), ms, &(x[WS(rs, 1)]));
Chris@82 234 T11 = VZMULJ(T2, T10);
Chris@82 235 T13 = LD(&(x[WS(rs, 17)]), ms, &(x[WS(rs, 1)]));
Chris@82 236 T14 = VZMULJ(T12, T13);
Chris@82 237 T1n = LD(&(x[WS(rs, 25)]), ms, &(x[WS(rs, 1)]));
Chris@82 238 T1o = VZMULJ(T1m, T1n);
Chris@82 239 T1k = LD(&(x[WS(rs, 9)]), ms, &(x[WS(rs, 1)]));
Chris@82 240 T1l = VZMULJ(T5, T1k);
Chris@82 241 {
Chris@82 242 V T1d, T1f, T16, T19;
Chris@82 243 T1d = LD(&(x[WS(rs, 29)]), ms, &(x[WS(rs, 1)]));
Chris@82 244 T1e = VZMULJ(T1c, T1d);
Chris@82 245 T1f = LD(&(x[WS(rs, 13)]), ms, &(x[WS(rs, 1)]));
Chris@82 246 T1g = VZMULJ(TK, T1f);
Chris@82 247 T1h = VSUB(T1e, T1g);
Chris@82 248 T16 = LD(&(x[WS(rs, 5)]), ms, &(x[WS(rs, 1)]));
Chris@82 249 T17 = VZMULJ(TP, T16);
Chris@82 250 T19 = LD(&(x[WS(rs, 21)]), ms, &(x[WS(rs, 1)]));
Chris@82 251 T1a = VZMULJ(T18, T19);
Chris@82 252 T1b = VSUB(T17, T1a);
Chris@82 253 }
Chris@82 254 }
Chris@82 255 {
Chris@82 256 V T15, T1i, T31, T32;
Chris@82 257 T15 = VSUB(T11, T14);
Chris@82 258 T1i = VADD(T1b, T1h);
Chris@82 259 T1j = VFMA(LDK(KP707106781), T1i, T15);
Chris@82 260 T2s = VFNMS(LDK(KP707106781), T1i, T15);
Chris@82 261 T31 = VADD(T17, T1a);
Chris@82 262 T32 = VADD(T1e, T1g);
Chris@82 263 T33 = VADD(T31, T32);
Chris@82 264 T3D = VSUB(T31, T32);
Chris@82 265 }
Chris@82 266 {
Chris@82 267 V T1p, T1q, T2Y, T2Z;
Chris@82 268 T1p = VSUB(T1l, T1o);
Chris@82 269 T1q = VSUB(T1b, T1h);
Chris@82 270 T1r = VFMA(LDK(KP707106781), T1q, T1p);
Chris@82 271 T2t = VFNMS(LDK(KP707106781), T1q, T1p);
Chris@82 272 T2Y = VADD(T11, T14);
Chris@82 273 T2Z = VADD(T1l, T1o);
Chris@82 274 T30 = VADD(T2Y, T2Z);
Chris@82 275 T3C = VSUB(T2Y, T2Z);
Chris@82 276 }
Chris@82 277 }
/* Outputs 0, 8, 16, 24. */
Chris@82 278 {
Chris@82 279 V T3r, T3v, T3u, T3w;
Chris@82 280 {
Chris@82 281 V T3p, T3q, T3s, T3t;
Chris@82 282 T3p = VADD(T2T, T2W);
Chris@82 283 T3q = VADD(T3j, T3g);
Chris@82 284 T3r = VADD(T3p, T3q);
Chris@82 285 T3v = VSUB(T3p, T3q);
Chris@82 286 T3s = VADD(T30, T33);
Chris@82 287 T3t = VADD(T37, T3a);
Chris@82 288 T3u = VADD(T3s, T3t);
Chris@82 289 T3w = VSUB(T3t, T3s);
Chris@82 290 }
Chris@82 291 ST(&(x[WS(rs, 16)]), VSUB(T3r, T3u), ms, &(x[0]));
Chris@82 292 ST(&(x[WS(rs, 8)]), VFMAI(T3w, T3v), ms, &(x[0]));
Chris@82 293 ST(&(x[0]), VADD(T3r, T3u), ms, &(x[0]));
Chris@82 294 ST(&(x[WS(rs, 24)]), VFNMSI(T3w, T3v), ms, &(x[0]));
Chris@82 295 }
/* Outputs 4, 12, 20, 28. */
Chris@82 296 {
Chris@82 297 V T2X, T3k, T3c, T3l, T34, T3b;
Chris@82 298 T2X = VSUB(T2T, T2W);
Chris@82 299 T3k = VSUB(T3g, T3j);
Chris@82 300 T34 = VSUB(T30, T33);
Chris@82 301 T3b = VSUB(T37, T3a);
Chris@82 302 T3c = VADD(T34, T3b);
Chris@82 303 T3l = VSUB(T3b, T34);
Chris@82 304 {
Chris@82 305 V T3d, T3m, T3n, T3o;
Chris@82 306 T3d = VFNMS(LDK(KP707106781), T3c, T2X);
Chris@82 307 T3m = VFNMS(LDK(KP707106781), T3l, T3k);
Chris@82 308 ST(&(x[WS(rs, 12)]), VFNMSI(T3m, T3d), ms, &(x[0]));
Chris@82 309 ST(&(x[WS(rs, 20)]), VFMAI(T3m, T3d), ms, &(x[0]));
Chris@82 310 T3n = VFMA(LDK(KP707106781), T3c, T2X);
Chris@82 311 T3o = VFMA(LDK(KP707106781), T3l, T3k);
Chris@82 312 ST(&(x[WS(rs, 28)]), VFNMSI(T3o, T3n), ms, &(x[0]));
Chris@82 313 ST(&(x[WS(rs, 4)]), VFMAI(T3o, T3n), ms, &(x[0]));
Chris@82 314 }
Chris@82 315 }
/* Outputs 2, 6, 10, 14, 18, 22, 26, 30. */
Chris@82 316 {
Chris@82 317 V T3B, T3T, T3M, T3W, T3I, T3X, T3P, T3U, T3A, T3L;
Chris@82 318 T3A = VADD(T3y, T3z);
Chris@82 319 T3B = VFMA(LDK(KP707106781), T3A, T3x);
Chris@82 320 T3T = VFNMS(LDK(KP707106781), T3A, T3x);
Chris@82 321 T3L = VSUB(T3z, T3y);
Chris@82 322 T3M = VFMA(LDK(KP707106781), T3L, T3K);
Chris@82 323 T3W = VFNMS(LDK(KP707106781), T3L, T3K);
Chris@82 324 {
Chris@82 325 V T3E, T3H, T3N, T3O;
Chris@82 326 T3E = VFNMS(LDK(KP414213562), T3D, T3C);
Chris@82 327 T3H = VFNMS(LDK(KP414213562), T3G, T3F);
Chris@82 328 T3I = VADD(T3E, T3H);
Chris@82 329 T3X = VSUB(T3H, T3E);
Chris@82 330 T3N = VFMA(LDK(KP414213562), T3F, T3G);
Chris@82 331 T3O = VFMA(LDK(KP414213562), T3C, T3D);
Chris@82 332 T3P = VSUB(T3N, T3O);
Chris@82 333 T3U = VADD(T3O, T3N);
Chris@82 334 }
Chris@82 335 {
Chris@82 336 V T3J, T3Q, T3Z, T40;
Chris@82 337 T3J = VFNMS(LDK(KP923879532), T3I, T3B);
Chris@82 338 T3Q = VFNMS(LDK(KP923879532), T3P, T3M);
Chris@82 339 ST(&(x[WS(rs, 14)]), VFNMSI(T3Q, T3J), ms, &(x[0]));
Chris@82 340 ST(&(x[WS(rs, 18)]), VFMAI(T3Q, T3J), ms, &(x[0]));
Chris@82 341 T3Z = VFMA(LDK(KP923879532), T3U, T3T);
Chris@82 342 T40 = VFNMS(LDK(KP923879532), T3X, T3W);
Chris@82 343 ST(&(x[WS(rs, 6)]), VFNMSI(T40, T3Z), ms, &(x[0]));
Chris@82 344 ST(&(x[WS(rs, 26)]), VFMAI(T40, T3Z), ms, &(x[0]));
Chris@82 345 }
Chris@82 346 {
Chris@82 347 V T3R, T3S, T3V, T3Y;
Chris@82 348 T3R = VFMA(LDK(KP923879532), T3I, T3B);
Chris@82 349 T3S = VFMA(LDK(KP923879532), T3P, T3M);
Chris@82 350 ST(&(x[WS(rs, 30)]), VFNMSI(T3S, T3R), ms, &(x[0]));
Chris@82 351 ST(&(x[WS(rs, 2)]), VFMAI(T3S, T3R), ms, &(x[0]));
Chris@82 352 T3V = VFNMS(LDK(KP923879532), T3U, T3T);
Chris@82 353 T3Y = VFMA(LDK(KP923879532), T3X, T3W);
Chris@82 354 ST(&(x[WS(rs, 10)]), VFMAI(T3Y, T3V), ms, &(x[0]));
Chris@82 355 ST(&(x[WS(rs, 22)]), VFNMSI(T3Y, T3V), ms, &(x[0]));
Chris@82 356 }
Chris@82 357 }
/* Outputs 1, 7, 9, 15, 17, 23, 25, 31. */
Chris@82 358 {
Chris@82 359 V TZ, T2h, T2d, T2i, T1X, T2l, T2a, T2k;
Chris@82 360 {
Chris@82 361 V Ts, TY, T2b, T2c;
Chris@82 362 Ts = VFMA(LDK(KP707106781), Tr, Tb);
Chris@82 363 TY = VADD(TG, TX);
Chris@82 364 TZ = VFMA(LDK(KP923879532), TY, Ts);
Chris@82 365 T2h = VFNMS(LDK(KP923879532), TY, Ts);
Chris@82 366 T2b = VFMA(LDK(KP198912367), T1j, T1r);
Chris@82 367 T2c = VFMA(LDK(KP198912367), T1N, T1V);
Chris@82 368 T2d = VSUB(T2b, T2c);
Chris@82 369 T2i = VADD(T2b, T2c);
Chris@82 370 }
Chris@82 371 {
Chris@82 372 V T1s, T1W, T26, T29;
Chris@82 373 T1s = VFNMS(LDK(KP198912367), T1r, T1j);
Chris@82 374 T1W = VFNMS(LDK(KP198912367), T1V, T1N);
Chris@82 375 T1X = VADD(T1s, T1W);
Chris@82 376 T2l = VSUB(T1W, T1s);
Chris@82 377 T26 = VFNMS(LDK(KP707106781), T25, T24);
Chris@82 378 T29 = VSUB(T27, T28);
Chris@82 379 T2a = VFMA(LDK(KP923879532), T29, T26);
Chris@82 380 T2k = VFNMS(LDK(KP923879532), T29, T26);
Chris@82 381 }
Chris@82 382 {
Chris@82 383 V T1Y, T2e, T2n, T2o;
Chris@82 384 T1Y = VFNMS(LDK(KP980785280), T1X, TZ);
Chris@82 385 T2e = VFNMS(LDK(KP980785280), T2d, T2a);
Chris@82 386 ST(&(x[WS(rs, 17)]), VFNMSI(T2e, T1Y), ms, &(x[WS(rs, 1)]));
Chris@82 387 ST(&(x[WS(rs, 15)]), VFMAI(T2e, T1Y), ms, &(x[WS(rs, 1)]));
Chris@82 388 T2n = VFMA(LDK(KP980785280), T2i, T2h);
Chris@82 389 T2o = VFMA(LDK(KP980785280), T2l, T2k);
Chris@82 390 ST(&(x[WS(rs, 7)]), VFMAI(T2o, T2n), ms, &(x[WS(rs, 1)]));
Chris@82 391 ST(&(x[WS(rs, 25)]), VFNMSI(T2o, T2n), ms, &(x[WS(rs, 1)]));
Chris@82 392 }
Chris@82 393 {
Chris@82 394 V T2f, T2g, T2j, T2m;
Chris@82 395 T2f = VFMA(LDK(KP980785280), T1X, TZ);
Chris@82 396 T2g = VFMA(LDK(KP980785280), T2d, T2a);
Chris@82 397 ST(&(x[WS(rs, 1)]), VFNMSI(T2g, T2f), ms, &(x[WS(rs, 1)]));
Chris@82 398 ST(&(x[WS(rs, 31)]), VFMAI(T2g, T2f), ms, &(x[WS(rs, 1)]));
Chris@82 399 T2j = VFNMS(LDK(KP980785280), T2i, T2h);
Chris@82 400 T2m = VFNMS(LDK(KP980785280), T2l, T2k);
Chris@82 401 ST(&(x[WS(rs, 9)]), VFNMSI(T2m, T2j), ms, &(x[WS(rs, 1)]));
Chris@82 402 ST(&(x[WS(rs, 23)]), VFMAI(T2m, T2j), ms, &(x[WS(rs, 1)]));
Chris@82 403 }
Chris@82 404 }
/* Outputs 3, 5, 11, 13, 19, 21, 27, 29. */
Chris@82 405 {
Chris@82 406 V T2r, T2J, T2F, T2K, T2y, T2N, T2C, T2M;
Chris@82 407 {
Chris@82 408 V T2p, T2q, T2D, T2E;
Chris@82 409 T2p = VFNMS(LDK(KP707106781), Tr, Tb);
Chris@82 410 T2q = VADD(T27, T28);
Chris@82 411 T2r = VFMA(LDK(KP923879532), T2q, T2p);
Chris@82 412 T2J = VFNMS(LDK(KP923879532), T2q, T2p);
Chris@82 413 T2D = VFNMS(LDK(KP668178637), T2s, T2t);
Chris@82 414 T2E = VFNMS(LDK(KP668178637), T2v, T2w);
Chris@82 415 T2F = VSUB(T2D, T2E);
Chris@82 416 T2K = VADD(T2D, T2E);
Chris@82 417 }
Chris@82 418 {
Chris@82 419 V T2u, T2x, T2A, T2B;
Chris@82 420 T2u = VFMA(LDK(KP668178637), T2t, T2s);
Chris@82 421 T2x = VFMA(LDK(KP668178637), T2w, T2v);
Chris@82 422 T2y = VADD(T2u, T2x);
Chris@82 423 T2N = VSUB(T2x, T2u);
Chris@82 424 T2A = VFMA(LDK(KP707106781), T25, T24);
Chris@82 425 T2B = VSUB(TX, TG);
Chris@82 426 T2C = VFMA(LDK(KP923879532), T2B, T2A);
Chris@82 427 T2M = VFNMS(LDK(KP923879532), T2B, T2A);
Chris@82 428 }
Chris@82 429 {
Chris@82 430 V T2z, T2G, T2P, T2Q;
Chris@82 431 T2z = VFNMS(LDK(KP831469612), T2y, T2r);
Chris@82 432 T2G = VFNMS(LDK(KP831469612), T2F, T2C);
Chris@82 433 ST(&(x[WS(rs, 13)]), VFNMSI(T2G, T2z), ms, &(x[WS(rs, 1)]));
Chris@82 434 ST(&(x[WS(rs, 19)]), VFMAI(T2G, T2z), ms, &(x[WS(rs, 1)]));
Chris@82 435 T2P = VFNMS(LDK(KP831469612), T2K, T2J);
Chris@82 436 T2Q = VFNMS(LDK(KP831469612), T2N, T2M);
Chris@82 437 ST(&(x[WS(rs, 5)]), VFNMSI(T2Q, T2P), ms, &(x[WS(rs, 1)]));
Chris@82 438 ST(&(x[WS(rs, 27)]), VFMAI(T2Q, T2P), ms, &(x[WS(rs, 1)]));
Chris@82 439 }
Chris@82 440 {
Chris@82 441 V T2H, T2I, T2L, T2O;
Chris@82 442 T2H = VFMA(LDK(KP831469612), T2y, T2r);
Chris@82 443 T2I = VFMA(LDK(KP831469612), T2F, T2C);
Chris@82 444 ST(&(x[WS(rs, 29)]), VFNMSI(T2I, T2H), ms, &(x[WS(rs, 1)]));
Chris@82 445 ST(&(x[WS(rs, 3)]), VFMAI(T2I, T2H), ms, &(x[WS(rs, 1)]));
Chris@82 446 T2L = VFMA(LDK(KP831469612), T2K, T2J);
Chris@82 447 T2O = VFMA(LDK(KP831469612), T2N, T2M);
Chris@82 448 ST(&(x[WS(rs, 11)]), VFMAI(T2O, T2L), ms, &(x[WS(rs, 1)]));
Chris@82 449 ST(&(x[WS(rs, 21)]), VFNMSI(T2O, T2L), ms, &(x[WS(rs, 1)]));
Chris@82 450 }
Chris@82 451 }
Chris@82 452 }
Chris@82 453 }
Chris@82 454 }
Chris@82 455 VLEAVE();
Chris@82 456 }
Chris@82 457
/*
 * Twiddle-instruction table for the FMA variant of t3fv_32.
 * It lists four VTW entries with second argument 1, 3, 9 and 27, followed by
 * a {TW_NEXT, VL, 0} terminator. These match the four twiddle vectors that
 * t3fv_32 loads from W per iteration and applies directly to inputs 1, 3, 9
 * and 27; every other twiddle is derived from them inside the codelet.
 * NOTE(review): VTW and TW_NEXT are defined outside this file; presumably the
 * second VTW argument is the twiddle exponent -- confirm against their
 * definitions.
 */
Chris@82 458 static const tw_instr twinstr[] = {
Chris@82 459 VTW(0, 1),
Chris@82 460 VTW(0, 3),
Chris@82 461 VTW(0, 9),
Chris@82 462 VTW(0, 27),
Chris@82 463 {TW_NEXT, VL, 0}
Chris@82 464 };
Chris@82 465
/*
 * Codelet descriptor for t3fv_32: size 32, the codelet's name string, its
 * twiddle-instruction table and genus. The {146, 116, 98, 0} initialiser
 * matches the operation counts quoted in the generator comment for this
 * variant (146 additions, 116 multiplications, 98 fused multiply/adds).
 * NOTE(review): the meaning of the three trailing zeros is defined by ct_desc,
 * which is declared outside this file.
 */
Chris@82 466 static const ct_desc desc = { 32, XSIMD_STRING("t3fv_32"), twinstr, &GENUS, {146, 116, 98, 0}, 0, 0, 0 };
Chris@82 467
/*
 * Registration entry point: hands the t3fv_32 codelet and its descriptor to
 * the planner p through X(kdft_dit_register).
 * NOTE(review): XSIMD() and X() are name-mangling macros defined outside this
 * file, so the final symbol names depend on the build configuration.
 */
Chris@82 468 void XSIMD(codelet_t3fv_32) (planner *p) {
Chris@82 469 X(kdft_dit_register) (p, t3fv_32, &desc);
Chris@82 470 }
Chris@82 471 #else
Chris@82 472
Chris@82 473 /* Generated by: ../../../genfft/gen_twiddle_c.native -simd -compact -variables 4 -pipeline-latency 8 -twiddle-log3 -precompute-twiddles -no-generate-bytw -n 32 -name t3fv_32 -include dft/simd/t3f.h */
Chris@82 474
Chris@82 475 /*
Chris@82 476 * This function contains 244 FP additions, 158 FP multiplications,
Chris@82 477 * (or, 228 additions, 142 multiplications, 16 fused multiply/add),
Chris@82 478 * 90 stack variables, 7 constants, and 64 memory accesses
Chris@82 479 */
Chris@82 480 #include "dft/simd/t3f.h"
Chris@82 481
Chris@82 482 static void t3fv_32(R *ri, R *ii, const R *W, stride rs, INT mb, INT me, INT ms)
Chris@82 483 {
Chris@82 484 DVK(KP555570233, +0.555570233019602224742830813948532874374937191);
Chris@82 485 DVK(KP831469612, +0.831469612302545237078788377617905756738560812);
Chris@82 486 DVK(KP195090322, +0.195090322016128267848284868477022240927691618);
Chris@82 487 DVK(KP980785280, +0.980785280403230449126182236134239036973933731);
Chris@82 488 DVK(KP382683432, +0.382683432365089771728459984030398866761344562);
Chris@82 489 DVK(KP923879532, +0.923879532511286756128183189396788286822416626);
Chris@82 490 DVK(KP707106781, +0.707106781186547524400844362104849039284835938);
Chris@82 491 {
Chris@82 492 INT m;
Chris@82 493 R *x;
Chris@82 494 x = ri;
Chris@82 495 for (m = mb, W = W + (mb * ((TWVL / VL) * 8)); m < me; m = m + VL, x = x + (VL * ms), W = W + (TWVL * 8), MAKE_VOLATILE_STRIDE(32, rs)) {
Chris@82 496 V T2, T5, T3, T4, Tc, T1C, TP, Tz, Tn, T6, TS, Tf, TK, T7, T8;
Chris@82 497 V Tv, T1w, T22, Tg, Tk, T1D, T1R, TC, T18, T12, T1t, TH, TL, TT, T1n;
Chris@82 498 V T1c;
Chris@82 499 T2 = LDW(&(W[0]));
Chris@82 500 T5 = LDW(&(W[TWVL * 4]));
Chris@82 501 T3 = LDW(&(W[TWVL * 2]));
Chris@82 502 T4 = VZMULJ(T2, T3);
Chris@82 503 Tc = VZMUL(T2, T3);
Chris@82 504 T1C = VZMULJ(T2, T5);
Chris@82 505 TP = VZMULJ(T3, T5);
Chris@82 506 Tz = VZMUL(T2, T5);
Chris@82 507 Tn = VZMUL(T3, T5);
Chris@82 508 T6 = VZMUL(T4, T5);
Chris@82 509 TS = VZMULJ(Tc, T5);
Chris@82 510 Tf = VZMULJ(T4, T5);
Chris@82 511 TK = VZMUL(Tc, T5);
Chris@82 512 T7 = LDW(&(W[TWVL * 6]));
Chris@82 513 T8 = VZMULJ(T6, T7);
Chris@82 514 Tv = VZMULJ(T5, T7);
Chris@82 515 T1w = VZMULJ(Tn, T7);
Chris@82 516 T22 = VZMULJ(T3, T7);
Chris@82 517 Tg = VZMULJ(Tf, T7);
Chris@82 518 Tk = VZMUL(T2, T7);
Chris@82 519 T1D = VZMULJ(T1C, T7);
Chris@82 520 T1R = VZMULJ(Tc, T7);
Chris@82 521 TC = VZMULJ(T2, T7);
Chris@82 522 T18 = VZMULJ(TP, T7);
Chris@82 523 T12 = VZMULJ(Tz, T7);
Chris@82 524 T1t = VZMUL(Tc, T7);
Chris@82 525 TH = VZMUL(T3, T7);
Chris@82 526 TL = VZMULJ(TK, T7);
Chris@82 527 TT = VZMULJ(TS, T7);
Chris@82 528 T1n = VZMULJ(T4, T7);
Chris@82 529 T1c = VZMUL(T4, T7);
Chris@82 530 {
Chris@82 531 V Tb, T25, T2T, T3x, Tr, T1Z, T2W, T3K, TX, T27, T3g, T3z, TG, T28, T3j;
Chris@82 532 V T3y, T1N, T2v, T3a, T3G, T1V, T2w, T37, T3F, T1j, T2s, T33, T3D, T1r, T2t;
Chris@82 533 V T30, T3C;
Chris@82 534 {
Chris@82 535 V T1, T24, Ta, T21, T23, T9, T20, T2R, T2S;
Chris@82 536 T1 = LD(&(x[0]), ms, &(x[0]));
Chris@82 537 T23 = LD(&(x[WS(rs, 24)]), ms, &(x[0]));
Chris@82 538 T24 = VZMULJ(T22, T23);
Chris@82 539 T9 = LD(&(x[WS(rs, 16)]), ms, &(x[0]));
Chris@82 540 Ta = VZMULJ(T8, T9);
Chris@82 541 T20 = LD(&(x[WS(rs, 8)]), ms, &(x[0]));
Chris@82 542 T21 = VZMULJ(T1C, T20);
Chris@82 543 Tb = VSUB(T1, Ta);
Chris@82 544 T25 = VSUB(T21, T24);
Chris@82 545 T2R = VADD(T1, Ta);
Chris@82 546 T2S = VADD(T21, T24);
Chris@82 547 T2T = VADD(T2R, T2S);
Chris@82 548 T3x = VSUB(T2R, T2S);
Chris@82 549 }
Chris@82 550 {
Chris@82 551 V Te, Tp, Ti, Tm;
Chris@82 552 {
Chris@82 553 V Td, To, Th, Tl;
Chris@82 554 Td = LD(&(x[WS(rs, 4)]), ms, &(x[0]));
Chris@82 555 Te = VZMULJ(Tc, Td);
Chris@82 556 To = LD(&(x[WS(rs, 12)]), ms, &(x[0]));
Chris@82 557 Tp = VZMULJ(Tn, To);
Chris@82 558 Th = LD(&(x[WS(rs, 20)]), ms, &(x[0]));
Chris@82 559 Ti = VZMULJ(Tg, Th);
Chris@82 560 Tl = LD(&(x[WS(rs, 28)]), ms, &(x[0]));
Chris@82 561 Tm = VZMULJ(Tk, Tl);
Chris@82 562 }
Chris@82 563 {
Chris@82 564 V Tj, Tq, T2U, T2V;
Chris@82 565 Tj = VSUB(Te, Ti);
Chris@82 566 Tq = VSUB(Tm, Tp);
Chris@82 567 Tr = VMUL(LDK(KP707106781), VADD(Tj, Tq));
Chris@82 568 T1Z = VMUL(LDK(KP707106781), VSUB(Tq, Tj));
Chris@82 569 T2U = VADD(Te, Ti);
Chris@82 570 T2V = VADD(Tm, Tp);
Chris@82 571 T2W = VADD(T2U, T2V);
Chris@82 572 T3K = VSUB(T2V, T2U);
Chris@82 573 }
Chris@82 574 }
Chris@82 575 {
Chris@82 576 V TJ, TV, TN, TR;
Chris@82 577 {
Chris@82 578 V TI, TU, TM, TQ;
Chris@82 579 TI = LD(&(x[WS(rs, 30)]), ms, &(x[0]));
Chris@82 580 TJ = VZMULJ(TH, TI);
Chris@82 581 TU = LD(&(x[WS(rs, 22)]), ms, &(x[0]));
Chris@82 582 TV = VZMULJ(TT, TU);
Chris@82 583 TM = LD(&(x[WS(rs, 14)]), ms, &(x[0]));
Chris@82 584 TN = VZMULJ(TL, TM);
Chris@82 585 TQ = LD(&(x[WS(rs, 6)]), ms, &(x[0]));
Chris@82 586 TR = VZMULJ(TP, TQ);
Chris@82 587 }
Chris@82 588 {
Chris@82 589 V TO, TW, T3e, T3f;
Chris@82 590 TO = VSUB(TJ, TN);
Chris@82 591 TW = VSUB(TR, TV);
Chris@82 592 TX = VFMA(LDK(KP923879532), TO, VMUL(LDK(KP382683432), TW));
Chris@82 593 T27 = VFNMS(LDK(KP923879532), TW, VMUL(LDK(KP382683432), TO));
Chris@82 594 T3e = VADD(TJ, TN);
Chris@82 595 T3f = VADD(TR, TV);
Chris@82 596 T3g = VADD(T3e, T3f);
Chris@82 597 T3z = VSUB(T3e, T3f);
Chris@82 598 }
Chris@82 599 }
Chris@82 600 {
Chris@82 601 V Tu, TE, Tx, TB;
Chris@82 602 {
Chris@82 603 V Tt, TD, Tw, TA;
Chris@82 604 Tt = LD(&(x[WS(rs, 2)]), ms, &(x[0]));
Chris@82 605 Tu = VZMULJ(T4, Tt);
Chris@82 606 TD = LD(&(x[WS(rs, 26)]), ms, &(x[0]));
Chris@82 607 TE = VZMULJ(TC, TD);
Chris@82 608 Tw = LD(&(x[WS(rs, 18)]), ms, &(x[0]));
Chris@82 609 Tx = VZMULJ(Tv, Tw);
Chris@82 610 TA = LD(&(x[WS(rs, 10)]), ms, &(x[0]));
Chris@82 611 TB = VZMULJ(Tz, TA);
Chris@82 612 }
Chris@82 613 {
Chris@82 614 V Ty, TF, T3h, T3i;
Chris@82 615 Ty = VSUB(Tu, Tx);
Chris@82 616 TF = VSUB(TB, TE);
Chris@82 617 TG = VFNMS(LDK(KP382683432), TF, VMUL(LDK(KP923879532), Ty));
Chris@82 618 T28 = VFMA(LDK(KP382683432), Ty, VMUL(LDK(KP923879532), TF));
Chris@82 619 T3h = VADD(Tu, Tx);
Chris@82 620 T3i = VADD(TB, TE);
Chris@82 621 T3j = VADD(T3h, T3i);
Chris@82 622 T3y = VSUB(T3h, T3i);
Chris@82 623 }
Chris@82 624 }
Chris@82 625 {
Chris@82 626 V T1v, T1y, T1T, T1Q, T1I, T1K, T1L, T1B, T1F, T1G;
Chris@82 627 {
Chris@82 628 V T1u, T1x, T1S, T1P;
Chris@82 629 T1u = LD(&(x[WS(rs, 31)]), ms, &(x[WS(rs, 1)]));
Chris@82 630 T1v = VZMULJ(T1t, T1u);
Chris@82 631 T1x = LD(&(x[WS(rs, 15)]), ms, &(x[WS(rs, 1)]));
Chris@82 632 T1y = VZMULJ(T1w, T1x);
Chris@82 633 T1S = LD(&(x[WS(rs, 23)]), ms, &(x[WS(rs, 1)]));
Chris@82 634 T1T = VZMULJ(T1R, T1S);
Chris@82 635 T1P = LD(&(x[WS(rs, 7)]), ms, &(x[WS(rs, 1)]));
Chris@82 636 T1Q = VZMULJ(Tf, T1P);
Chris@82 637 {
Chris@82 638 V T1H, T1J, T1A, T1E;
Chris@82 639 T1H = LD(&(x[WS(rs, 27)]), ms, &(x[WS(rs, 1)]));
Chris@82 640 T1I = VZMULJ(T7, T1H);
Chris@82 641 T1J = LD(&(x[WS(rs, 11)]), ms, &(x[WS(rs, 1)]));
Chris@82 642 T1K = VZMULJ(T6, T1J);
Chris@82 643 T1L = VSUB(T1I, T1K);
Chris@82 644 T1A = LD(&(x[WS(rs, 3)]), ms, &(x[WS(rs, 1)]));
Chris@82 645 T1B = VZMULJ(T3, T1A);
Chris@82 646 T1E = LD(&(x[WS(rs, 19)]), ms, &(x[WS(rs, 1)]));
Chris@82 647 T1F = VZMULJ(T1D, T1E);
Chris@82 648 T1G = VSUB(T1B, T1F);
Chris@82 649 }
Chris@82 650 }
Chris@82 651 {
Chris@82 652 V T1z, T1M, T38, T39;
Chris@82 653 T1z = VSUB(T1v, T1y);
Chris@82 654 T1M = VMUL(LDK(KP707106781), VADD(T1G, T1L));
Chris@82 655 T1N = VADD(T1z, T1M);
Chris@82 656 T2v = VSUB(T1z, T1M);
Chris@82 657 T38 = VADD(T1B, T1F);
Chris@82 658 T39 = VADD(T1I, T1K);
Chris@82 659 T3a = VADD(T38, T39);
Chris@82 660 T3G = VSUB(T39, T38);
Chris@82 661 }
Chris@82 662 {
Chris@82 663 V T1O, T1U, T35, T36;
Chris@82 664 T1O = VMUL(LDK(KP707106781), VSUB(T1L, T1G));
Chris@82 665 T1U = VSUB(T1Q, T1T);
Chris@82 666 T1V = VSUB(T1O, T1U);
Chris@82 667 T2w = VADD(T1U, T1O);
Chris@82 668 T35 = VADD(T1v, T1y);
Chris@82 669 T36 = VADD(T1Q, T1T);
Chris@82 670 T37 = VADD(T35, T36);
Chris@82 671 T3F = VSUB(T35, T36);
Chris@82 672 }
Chris@82 673 }
Chris@82 674 {
Chris@82 675 V T11, T14, T1p, T1m, T1e, T1g, T1h, T17, T1a, T1b;
Chris@82 676 {
Chris@82 677 V T10, T13, T1o, T1l;
Chris@82 678 T10 = LD(&(x[WS(rs, 1)]), ms, &(x[WS(rs, 1)]));
Chris@82 679 T11 = VZMULJ(T2, T10);
Chris@82 680 T13 = LD(&(x[WS(rs, 17)]), ms, &(x[WS(rs, 1)]));
Chris@82 681 T14 = VZMULJ(T12, T13);
Chris@82 682 T1o = LD(&(x[WS(rs, 25)]), ms, &(x[WS(rs, 1)]));
Chris@82 683 T1p = VZMULJ(T1n, T1o);
Chris@82 684 T1l = LD(&(x[WS(rs, 9)]), ms, &(x[WS(rs, 1)]));
Chris@82 685 T1m = VZMULJ(T5, T1l);
Chris@82 686 {
Chris@82 687 V T1d, T1f, T16, T19;
Chris@82 688 T1d = LD(&(x[WS(rs, 29)]), ms, &(x[WS(rs, 1)]));
Chris@82 689 T1e = VZMULJ(T1c, T1d);
Chris@82 690 T1f = LD(&(x[WS(rs, 13)]), ms, &(x[WS(rs, 1)]));
Chris@82 691 T1g = VZMULJ(TK, T1f);
Chris@82 692 T1h = VSUB(T1e, T1g);
Chris@82 693 T16 = LD(&(x[WS(rs, 5)]), ms, &(x[WS(rs, 1)]));
Chris@82 694 T17 = VZMULJ(TS, T16);
Chris@82 695 T19 = LD(&(x[WS(rs, 21)]), ms, &(x[WS(rs, 1)]));
Chris@82 696 T1a = VZMULJ(T18, T19);
Chris@82 697 T1b = VSUB(T17, T1a);
Chris@82 698 }
Chris@82 699 }
Chris@82 700 {
Chris@82 701 V T15, T1i, T31, T32;
Chris@82 702 T15 = VSUB(T11, T14);
Chris@82 703 T1i = VMUL(LDK(KP707106781), VADD(T1b, T1h));
Chris@82 704 T1j = VADD(T15, T1i);
Chris@82 705 T2s = VSUB(T15, T1i);
Chris@82 706 T31 = VADD(T17, T1a);
Chris@82 707 T32 = VADD(T1e, T1g);
Chris@82 708 T33 = VADD(T31, T32);
Chris@82 709 T3D = VSUB(T32, T31);
Chris@82 710 }
Chris@82 711 {
Chris@82 712 V T1k, T1q, T2Y, T2Z;
Chris@82 713 T1k = VMUL(LDK(KP707106781), VSUB(T1h, T1b));
Chris@82 714 T1q = VSUB(T1m, T1p);
Chris@82 715 T1r = VSUB(T1k, T1q);
Chris@82 716 T2t = VADD(T1q, T1k);
Chris@82 717 T2Y = VADD(T11, T14);
Chris@82 718 T2Z = VADD(T1m, T1p);
Chris@82 719 T30 = VADD(T2Y, T2Z);
Chris@82 720 T3C = VSUB(T2Y, T2Z);
Chris@82 721 }
Chris@82 722 }
Chris@82 723 {
Chris@82 724 V T3r, T3v, T3u, T3w;
Chris@82 725 {
Chris@82 726 V T3p, T3q, T3s, T3t;
Chris@82 727 T3p = VADD(T2T, T2W);
Chris@82 728 T3q = VADD(T3j, T3g);
Chris@82 729 T3r = VADD(T3p, T3q);
Chris@82 730 T3v = VSUB(T3p, T3q);
Chris@82 731 T3s = VADD(T30, T33);
Chris@82 732 T3t = VADD(T37, T3a);
Chris@82 733 T3u = VADD(T3s, T3t);
Chris@82 734 T3w = VBYI(VSUB(T3t, T3s));
Chris@82 735 }
Chris@82 736 ST(&(x[WS(rs, 16)]), VSUB(T3r, T3u), ms, &(x[0]));
Chris@82 737 ST(&(x[WS(rs, 8)]), VADD(T3v, T3w), ms, &(x[0]));
Chris@82 738 ST(&(x[0]), VADD(T3r, T3u), ms, &(x[0]));
Chris@82 739 ST(&(x[WS(rs, 24)]), VSUB(T3v, T3w), ms, &(x[0]));
Chris@82 740 }
Chris@82 741 {
Chris@82 742 V T2X, T3k, T3c, T3l, T34, T3b;
Chris@82 743 T2X = VSUB(T2T, T2W);
Chris@82 744 T3k = VSUB(T3g, T3j);
Chris@82 745 T34 = VSUB(T30, T33);
Chris@82 746 T3b = VSUB(T37, T3a);
Chris@82 747 T3c = VMUL(LDK(KP707106781), VADD(T34, T3b));
Chris@82 748 T3l = VMUL(LDK(KP707106781), VSUB(T3b, T34));
Chris@82 749 {
Chris@82 750 V T3d, T3m, T3n, T3o;
Chris@82 751 T3d = VADD(T2X, T3c);
Chris@82 752 T3m = VBYI(VADD(T3k, T3l));
Chris@82 753 ST(&(x[WS(rs, 28)]), VSUB(T3d, T3m), ms, &(x[0]));
Chris@82 754 ST(&(x[WS(rs, 4)]), VADD(T3d, T3m), ms, &(x[0]));
Chris@82 755 T3n = VSUB(T2X, T3c);
Chris@82 756 T3o = VBYI(VSUB(T3l, T3k));
Chris@82 757 ST(&(x[WS(rs, 20)]), VSUB(T3n, T3o), ms, &(x[0]));
Chris@82 758 ST(&(x[WS(rs, 12)]), VADD(T3n, T3o), ms, &(x[0]));
Chris@82 759 }
Chris@82 760 }
Chris@82 761 {
Chris@82 762 V T3B, T3W, T3M, T3U, T3I, T3T, T3P, T3X, T3A, T3L;
Chris@82 763 T3A = VMUL(LDK(KP707106781), VADD(T3y, T3z));
Chris@82 764 T3B = VADD(T3x, T3A);
Chris@82 765 T3W = VSUB(T3x, T3A);
Chris@82 766 T3L = VMUL(LDK(KP707106781), VSUB(T3z, T3y));
Chris@82 767 T3M = VADD(T3K, T3L);
Chris@82 768 T3U = VSUB(T3L, T3K);
Chris@82 769 {
Chris@82 770 V T3E, T3H, T3N, T3O;
Chris@82 771 T3E = VFMA(LDK(KP923879532), T3C, VMUL(LDK(KP382683432), T3D));
Chris@82 772 T3H = VFNMS(LDK(KP382683432), T3G, VMUL(LDK(KP923879532), T3F));
Chris@82 773 T3I = VADD(T3E, T3H);
Chris@82 774 T3T = VSUB(T3H, T3E);
Chris@82 775 T3N = VFNMS(LDK(KP382683432), T3C, VMUL(LDK(KP923879532), T3D));
Chris@82 776 T3O = VFMA(LDK(KP382683432), T3F, VMUL(LDK(KP923879532), T3G));
Chris@82 777 T3P = VADD(T3N, T3O);
Chris@82 778 T3X = VSUB(T3O, T3N);
Chris@82 779 }
Chris@82 780 {
Chris@82 781 V T3J, T3Q, T3Z, T40;
Chris@82 782 T3J = VADD(T3B, T3I);
Chris@82 783 T3Q = VBYI(VADD(T3M, T3P));
Chris@82 784 ST(&(x[WS(rs, 30)]), VSUB(T3J, T3Q), ms, &(x[0]));
Chris@82 785 ST(&(x[WS(rs, 2)]), VADD(T3J, T3Q), ms, &(x[0]));
Chris@82 786 T3Z = VBYI(VADD(T3U, T3T));
Chris@82 787 T40 = VADD(T3W, T3X);
Chris@82 788 ST(&(x[WS(rs, 6)]), VADD(T3Z, T40), ms, &(x[0]));
Chris@82 789 ST(&(x[WS(rs, 26)]), VSUB(T40, T3Z), ms, &(x[0]));
Chris@82 790 }
Chris@82 791 {
Chris@82 792 V T3R, T3S, T3V, T3Y;
Chris@82 793 T3R = VSUB(T3B, T3I);
Chris@82 794 T3S = VBYI(VSUB(T3P, T3M));
Chris@82 795 ST(&(x[WS(rs, 18)]), VSUB(T3R, T3S), ms, &(x[0]));
Chris@82 796 ST(&(x[WS(rs, 14)]), VADD(T3R, T3S), ms, &(x[0]));
Chris@82 797 T3V = VBYI(VSUB(T3T, T3U));
Chris@82 798 T3Y = VSUB(T3W, T3X);
Chris@82 799 ST(&(x[WS(rs, 10)]), VADD(T3V, T3Y), ms, &(x[0]));
Chris@82 800 ST(&(x[WS(rs, 22)]), VSUB(T3Y, T3V), ms, &(x[0]));
Chris@82 801 }
Chris@82 802 }
Chris@82 803 {
Chris@82 804 V TZ, T2k, T2d, T2l, T1X, T2h, T2a, T2i;
Chris@82 805 {
Chris@82 806 V Ts, TY, T2b, T2c;
Chris@82 807 Ts = VADD(Tb, Tr);
Chris@82 808 TY = VADD(TG, TX);
Chris@82 809 TZ = VADD(Ts, TY);
Chris@82 810 T2k = VSUB(Ts, TY);
Chris@82 811 T2b = VFNMS(LDK(KP195090322), T1j, VMUL(LDK(KP980785280), T1r));
Chris@82 812 T2c = VFMA(LDK(KP195090322), T1N, VMUL(LDK(KP980785280), T1V));
Chris@82 813 T2d = VADD(T2b, T2c);
Chris@82 814 T2l = VSUB(T2c, T2b);
Chris@82 815 }
Chris@82 816 {
Chris@82 817 V T1s, T1W, T26, T29;
Chris@82 818 T1s = VFMA(LDK(KP980785280), T1j, VMUL(LDK(KP195090322), T1r));
Chris@82 819 T1W = VFNMS(LDK(KP195090322), T1V, VMUL(LDK(KP980785280), T1N));
Chris@82 820 T1X = VADD(T1s, T1W);
Chris@82 821 T2h = VSUB(T1W, T1s);
Chris@82 822 T26 = VSUB(T1Z, T25);
Chris@82 823 T29 = VSUB(T27, T28);
Chris@82 824 T2a = VADD(T26, T29);
Chris@82 825 T2i = VSUB(T29, T26);
Chris@82 826 }
Chris@82 827 {
Chris@82 828 V T1Y, T2e, T2n, T2o;
Chris@82 829 T1Y = VADD(TZ, T1X);
Chris@82 830 T2e = VBYI(VADD(T2a, T2d));
Chris@82 831 ST(&(x[WS(rs, 31)]), VSUB(T1Y, T2e), ms, &(x[WS(rs, 1)]));
Chris@82 832 ST(&(x[WS(rs, 1)]), VADD(T1Y, T2e), ms, &(x[WS(rs, 1)]));
Chris@82 833 T2n = VBYI(VADD(T2i, T2h));
Chris@82 834 T2o = VADD(T2k, T2l);
Chris@82 835 ST(&(x[WS(rs, 7)]), VADD(T2n, T2o), ms, &(x[WS(rs, 1)]));
Chris@82 836 ST(&(x[WS(rs, 25)]), VSUB(T2o, T2n), ms, &(x[WS(rs, 1)]));
Chris@82 837 }
Chris@82 838 {
Chris@82 839 V T2f, T2g, T2j, T2m;
Chris@82 840 T2f = VSUB(TZ, T1X);
Chris@82 841 T2g = VBYI(VSUB(T2d, T2a));
Chris@82 842 ST(&(x[WS(rs, 17)]), VSUB(T2f, T2g), ms, &(x[WS(rs, 1)]));
Chris@82 843 ST(&(x[WS(rs, 15)]), VADD(T2f, T2g), ms, &(x[WS(rs, 1)]));
Chris@82 844 T2j = VBYI(VSUB(T2h, T2i));
Chris@82 845 T2m = VSUB(T2k, T2l);
Chris@82 846 ST(&(x[WS(rs, 9)]), VADD(T2j, T2m), ms, &(x[WS(rs, 1)]));
Chris@82 847 ST(&(x[WS(rs, 23)]), VSUB(T2m, T2j), ms, &(x[WS(rs, 1)]));
Chris@82 848 }
Chris@82 849 }
Chris@82 850 {
Chris@82 851 V T2r, T2M, T2F, T2N, T2y, T2J, T2C, T2K;
Chris@82 852 {
Chris@82 853 V T2p, T2q, T2D, T2E;
Chris@82 854 T2p = VSUB(Tb, Tr);
Chris@82 855 T2q = VADD(T28, T27);
Chris@82 856 T2r = VADD(T2p, T2q);
Chris@82 857 T2M = VSUB(T2p, T2q);
Chris@82 858 T2D = VFNMS(LDK(KP555570233), T2s, VMUL(LDK(KP831469612), T2t));
Chris@82 859 T2E = VFMA(LDK(KP555570233), T2v, VMUL(LDK(KP831469612), T2w));
Chris@82 860 T2F = VADD(T2D, T2E);
Chris@82 861 T2N = VSUB(T2E, T2D);
Chris@82 862 }
Chris@82 863 {
Chris@82 864 V T2u, T2x, T2A, T2B;
Chris@82 865 T2u = VFMA(LDK(KP831469612), T2s, VMUL(LDK(KP555570233), T2t));
Chris@82 866 T2x = VFNMS(LDK(KP555570233), T2w, VMUL(LDK(KP831469612), T2v));
Chris@82 867 T2y = VADD(T2u, T2x);
Chris@82 868 T2J = VSUB(T2x, T2u);
Chris@82 869 T2A = VADD(T25, T1Z);
Chris@82 870 T2B = VSUB(TX, TG);
Chris@82 871 T2C = VADD(T2A, T2B);
Chris@82 872 T2K = VSUB(T2B, T2A);
Chris@82 873 }
Chris@82 874 {
Chris@82 875 V T2z, T2G, T2P, T2Q;
Chris@82 876 T2z = VADD(T2r, T2y);
Chris@82 877 T2G = VBYI(VADD(T2C, T2F));
Chris@82 878 ST(&(x[WS(rs, 29)]), VSUB(T2z, T2G), ms, &(x[WS(rs, 1)]));
Chris@82 879 ST(&(x[WS(rs, 3)]), VADD(T2z, T2G), ms, &(x[WS(rs, 1)]));
Chris@82 880 T2P = VBYI(VADD(T2K, T2J));
Chris@82 881 T2Q = VADD(T2M, T2N);
Chris@82 882 ST(&(x[WS(rs, 5)]), VADD(T2P, T2Q), ms, &(x[WS(rs, 1)]));
Chris@82 883 ST(&(x[WS(rs, 27)]), VSUB(T2Q, T2P), ms, &(x[WS(rs, 1)]));
Chris@82 884 }
Chris@82 885 {
Chris@82 886 V T2H, T2I, T2L, T2O;
Chris@82 887 T2H = VSUB(T2r, T2y);
Chris@82 888 T2I = VBYI(VSUB(T2F, T2C));
Chris@82 889 ST(&(x[WS(rs, 19)]), VSUB(T2H, T2I), ms, &(x[WS(rs, 1)]));
Chris@82 890 ST(&(x[WS(rs, 13)]), VADD(T2H, T2I), ms, &(x[WS(rs, 1)]));
Chris@82 891 T2L = VBYI(VSUB(T2J, T2K));
Chris@82 892 T2O = VSUB(T2M, T2N);
Chris@82 893 ST(&(x[WS(rs, 11)]), VADD(T2L, T2O), ms, &(x[WS(rs, 1)]));
Chris@82 894 ST(&(x[WS(rs, 21)]), VSUB(T2O, T2L), ms, &(x[WS(rs, 1)]));
Chris@82 895 }
Chris@82 896 }
Chris@82 897 }
Chris@82 898 }
Chris@82 899 }
Chris@82 900 VLEAVE();
Chris@82 901 }
Chris@82 902
Chris@82 903 static const tw_instr twinstr[] = { /* Twiddle instruction table for this codelet; referenced by the desc initializer. */
Chris@82 904 VTW(0, 1),
Chris@82 905 VTW(0, 3),
Chris@82 906 VTW(0, 9),
Chris@82 907 VTW(0, 27), /* Only entries 1, 3, 9 and 27 are listed; presumably a consequence of the -twiddle-log3 generator flag shown in the file header -- confirm. */
Chris@82 908 {TW_NEXT, VL, 0} /* Last entry of the table; presumably the terminator that advances by VL -- confirm against the tw_instr definition. */
Chris@82 909 };
Chris@82 910
Chris@82 911 static const ct_desc desc = { 32, XSIMD_STRING("t3fv_32"), twinstr, &GENUS, {228, 142, 16, 0}, 0, 0, 0 }; /* Descriptor handed to the planner at registration: 32 (matches the generator's -n 32), the codelet name string, the twinstr table and the genus. NOTE(review): {228, 142, 16, 0} is presumably the operation-count record and the trailing zeros are left at 0 -- confirm field order against the ct_desc definition. */
Chris@82 912
Chris@82 913 void XSIMD(codelet_t3fv_32) (planner *p) { /* Entry point for this codelet: passes t3fv_32 and its descriptor to planner p. */
Chris@82 914 X(kdft_dit_register) (p, t3fv_32, &desc); /* Judging by the callee name, this presumably registers a decimation-in-time DFT twiddle codelet -- confirm. */
Chris@82 915 }
Chris@82 916 #endif