annotate src/fftw-3.3.8/dft/scalar/codelets/t2_20.c @ 83:ae30d91d2ffe

Replace these with versions built using an older toolset (so as to avoid ABI incompatibilities when linking on Ubuntu 14.04 for packaging purposes)
author Chris Cannam
date Fri, 07 Feb 2020 11:51:13 +0000
parents d0c2a83c1364
children
rev   line source
Chris@82 1 /*
Chris@82 2 * Copyright (c) 2003, 2007-14 Matteo Frigo
Chris@82 3 * Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
Chris@82 4 *
Chris@82 5 * This program is free software; you can redistribute it and/or modify
Chris@82 6 * it under the terms of the GNU General Public License as published by
Chris@82 7 * the Free Software Foundation; either version 2 of the License, or
Chris@82 8 * (at your option) any later version.
Chris@82 9 *
Chris@82 10 * This program is distributed in the hope that it will be useful,
Chris@82 11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
Chris@82 12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
Chris@82 13 * GNU General Public License for more details.
Chris@82 14 *
Chris@82 15 * You should have received a copy of the GNU General Public License
Chris@82 16 * along with this program; if not, write to the Free Software
Chris@82 17 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
Chris@82 18 *
Chris@82 19 */
Chris@82 20
Chris@82 21 /* This file was automatically generated --- DO NOT EDIT */
Chris@82 22 /* Generated on Thu May 24 08:04:26 EDT 2018 */
Chris@82 23
Chris@82 24 #include "dft/codelet-dft.h"
Chris@82 25
Chris@82 26 #if defined(ARCH_PREFERS_FMA) || defined(ISA_EXTENSION_PREFERS_FMA)
Chris@82 27
Chris@82 28 /* Generated by: ../../../genfft/gen_twiddle.native -fma -compact -variables 4 -pipeline-latency 4 -twiddle-log3 -precompute-twiddles -n 20 -name t2_20 -include dft/scalar/t.h */
Chris@82 29
Chris@82 30 /*
Chris@82 31 * This function contains 276 FP additions, 198 FP multiplications,
Chris@82 32 * (or, 136 additions, 58 multiplications, 140 fused multiply/add),
Chris@82 33 * 95 stack variables, 4 constants, and 80 memory accesses
Chris@82 34 */
Chris@82 35 #include "dft/scalar/t.h"
Chris@82 36
/*
 * t2_20 (FMA variant): for each m in [mb, me), loads 20 complex values from
 * ri/ii (element 0 plus elements WS(rs, 1) .. WS(rs, 19)), multiplies elements
 * 1..19 by twiddle factors built from the 8 reals W[0..7], combines them, and
 * stores 20 complex results back to the same ri/ii slots. All 40 loads happen
 * before the first store, so the update is done in place.
 *
 * ri, ii : real / imaginary data pointers; both advance by ms per iteration.
 * W      : twiddle table; starts at W + mb * 8 and advances by 8 per iteration.
 * rs     : stride handed to WS() to address the 20 elements.
 * mb, me : half-open iteration range [mb, me).
 * ms     : per-iteration increment applied to ri and ii.
 *
 * NOTE(review): presumably a size-20 decimation-in-time DFT butterfly (the
 * generator was run with "-n 20" and the codelet is registered through
 * kdft_dit_register) -- confirm against genfft. FMA(a, b, c) and FNMS(a, b, c)
 * are defined elsewhere; presumably a*b + c and c - a*b -- confirm.
 */
Chris@82 37 static void t2_20(R *ri, R *ii, const R *W, stride rs, INT mb, INT me, INT ms)
Chris@82 38 {
/* Numeric constants: 0.951... equals sin(2*pi/5), 0.559... equals sqrt(5)/4,
 * 0.25 is 1/4, and 0.618... equals (sqrt(5) - 1)/2. */
Chris@82 39 DK(KP951056516, +0.951056516295153572116439333379382143405698634);
Chris@82 40 DK(KP559016994, +0.559016994374947424102293417182819058860154590);
Chris@82 41 DK(KP250000000, +0.250000000000000000000000000000000000000000000);
Chris@82 42 DK(KP618033988, +0.618033988749894848204586834365638117720309180);
Chris@82 43 {
Chris@82 44 INT m;
/* One pass per m. MAKE_VOLATILE_STRIDE is an external macro evaluated in the
 * loop increment; its effect is not visible in this file. */
Chris@82 45 for (m = mb, W = W + (mb * 8); m < me; m = m + 1, ri = ri + ms, ii = ii + ms, W = W + 8, MAKE_VOLATILE_STRIDE(40, rs)) {
/* Twiddle values that stay live for the whole iteration. */
Chris@82 46 E T2, Th, Tf, T6, T5, Ti, Tl, T1n, T3, Tt, Tv, T7, T17, T1L, T24;
Chris@82 47 E Tb, T13, T1P, T21, T1b, T1D, T1A, T1H, T1f, TA, Tw, Tq, Tm, TK, T1S;
Chris@82 48 E TO, T1p, T1q, T1u, T2n, T2k, T2h, T2d;
/* Twiddle setup: load the four stored pairs (T2,T5)=W[0..1], (Tf,Th)=W[2..3],
 * (T3,T6)=W[4..5], (Tt,Tv)=W[6..7] and derive every other twiddle pair from
 * products of these. */
Chris@82 49 {
Chris@82 50 E Tk, Ta, T1e, T4, T1a, Tj, T12, T1G, T16, T1K, Tg, Tz;
Chris@82 51 T2 = W[0];
Chris@82 52 Th = W[3];
Chris@82 53 Tf = W[2];
Chris@82 54 Tg = T2 * Tf;
Chris@82 55 Tk = T2 * Th;
Chris@82 56 T6 = W[5];
Chris@82 57 Ta = T2 * T6;
Chris@82 58 T1e = Tf * T6;
Chris@82 59 T5 = W[1];
Chris@82 60 Ti = FNMS(T5, Th, Tg);
Chris@82 61 Tl = FMA(T5, Tf, Tk);
Chris@82 62 T1n = FMA(T5, Th, Tg);
Chris@82 63 T3 = W[4];
Chris@82 64 T4 = T2 * T3;
Chris@82 65 T1a = Tf * T3;
Chris@82 66 Tj = Ti * T3;
Chris@82 67 Tt = W[6];
Chris@82 68 T12 = Tf * Tt;
Chris@82 69 T1G = T2 * Tt;
Chris@82 70 Tv = W[7];
Chris@82 71 T16 = Tf * Tv;
Chris@82 72 T1K = T2 * Tv;
Chris@82 73 T7 = FNMS(T5, T6, T4);
Chris@82 74 T17 = FNMS(Th, Tt, T16);
Chris@82 75 T1L = FNMS(T5, Tt, T1K);
Chris@82 76 T24 = FMA(Th, T3, T1e);
Chris@82 77 Tb = FMA(T5, T3, Ta);
Chris@82 78 T13 = FMA(Th, Tv, T12);
Chris@82 79 T1P = FNMS(Tl, T6, Tj);
Chris@82 80 T21 = FNMS(Th, T6, T1a);
Chris@82 81 T1b = FMA(Th, T6, T1a);
Chris@82 82 T1D = FNMS(T5, T3, Ta);
Chris@82 83 T1A = FMA(T5, T6, T4);
Chris@82 84 T1H = FMA(T5, Tv, T1G);
Chris@82 85 T1f = FNMS(Th, T3, T1e);
Chris@82 86 Tz = Ti * Tv;
Chris@82 87 TA = FNMS(Tl, Tt, Tz);
Chris@82 88 {
Chris@82 89 E Tu, Tp, TJ, TN;
Chris@82 90 Tu = Ti * Tt;
Chris@82 91 Tw = FMA(Tl, Tv, Tu);
Chris@82 92 Tp = Ti * T6;
Chris@82 93 Tq = FNMS(Tl, T3, Tp);
Chris@82 94 Tm = FMA(Tl, T6, Tj);
Chris@82 95 TJ = Tm * Tt;
Chris@82 96 TN = Tm * Tv;
Chris@82 97 TK = FMA(Tq, Tv, TJ);
Chris@82 98 T1S = FMA(Tl, T3, Tp);
Chris@82 99 TO = FNMS(Tq, Tt, TN);
Chris@82 100 {
Chris@82 101 E T1o, T2g, T1t, T2c;
Chris@82 102 T1o = T1n * T3;
Chris@82 103 T2g = T1n * Tv;
Chris@82 104 T1t = T1n * T6;
Chris@82 105 T2c = T1n * Tt;
Chris@82 106 T1p = FNMS(T5, Tf, Tk);
Chris@82 107 T1q = FNMS(T1p, T6, T1o);
Chris@82 108 T1u = FMA(T1p, T3, T1t);
Chris@82 109 T2n = FNMS(T1p, T3, T1t);
Chris@82 110 T2k = FMA(T1p, T6, T1o);
Chris@82 111 T2h = FNMS(T1p, Tt, T2g);
Chris@82 112 T2d = FMA(T1p, Tv, T2c);
Chris@82 113 }
Chris@82 114 }
Chris@82 115 }
/* Data stage: load and twiddle the inputs, form sums/differences, then store. */
Chris@82 116 {
Chris@82 117 E Te, T2C, T4L, T57, TD, T58, T2H, T4H, T11, T2v, T4k, T4v, T2P, T3P, T3C;
Chris@82 118 E T3Z, T2r, T2z, T4g, T4z, T3b, T3T, T3u, T43, T20, T2y, T4d, T4y, T34, T3S;
Chris@82 119 E T3n, T42, T1y, T2w, T4n, T4w, T2W, T3Q, T3J, T40;
/* Elements 0 and 10: element 0 is used as loaded; element 10 is combined
 * with the derived pair (T7, Tb). */
Chris@82 120 {
Chris@82 121 E T1, T4K, T8, T9, Tc, T4I, Td, T4J;
Chris@82 122 T1 = ri[0];
Chris@82 123 T4K = ii[0];
Chris@82 124 T8 = ri[WS(rs, 10)];
Chris@82 125 T9 = T7 * T8;
Chris@82 126 Tc = ii[WS(rs, 10)];
Chris@82 127 T4I = T7 * Tc;
Chris@82 128 Td = FMA(Tb, Tc, T9);
Chris@82 129 Te = T1 + Td;
Chris@82 130 T2C = T1 - Td;
Chris@82 131 T4J = FNMS(Tb, T8, T4I);
Chris@82 132 T4L = T4J + T4K;
Chris@82 133 T57 = T4K - T4J;
Chris@82 134 }
/* Elements 5 and 15, using pairs (Tm, Tq) and (Tw, TA). */
Chris@82 135 {
Chris@82 136 E Tn, To, Tr, T2D, Tx, Ty, TB, T2F;
Chris@82 137 Tn = ri[WS(rs, 5)];
Chris@82 138 To = Tm * Tn;
Chris@82 139 Tr = ii[WS(rs, 5)];
Chris@82 140 T2D = Tm * Tr;
Chris@82 141 Tx = ri[WS(rs, 15)];
Chris@82 142 Ty = Tw * Tx;
Chris@82 143 TB = ii[WS(rs, 15)];
Chris@82 144 T2F = Tw * TB;
Chris@82 145 {
Chris@82 146 E Ts, TC, T2E, T2G;
Chris@82 147 Ts = FMA(Tq, Tr, To);
Chris@82 148 TC = FMA(TA, TB, Ty);
Chris@82 149 TD = Ts + TC;
Chris@82 150 T58 = Ts - TC;
Chris@82 151 T2E = FNMS(Tq, Tn, T2D);
Chris@82 152 T2G = FNMS(TA, Tx, T2F);
Chris@82 153 T2H = T2E - T2G;
Chris@82 154 T4H = T2E + T2G;
Chris@82 155 }
Chris@82 156 }
/* Elements 4, 19, 14 and 9, using pairs (Ti, Tl), (Tt, Tv), (TK, TO), (T3, T6). */
Chris@82 157 {
Chris@82 158 E TI, T3x, TZ, T2N, TQ, T3z, TV, T2L;
Chris@82 159 {
Chris@82 160 E TF, TG, TH, T3w;
Chris@82 161 TF = ri[WS(rs, 4)];
Chris@82 162 TG = Ti * TF;
Chris@82 163 TH = ii[WS(rs, 4)];
Chris@82 164 T3w = Ti * TH;
Chris@82 165 TI = FMA(Tl, TH, TG);
Chris@82 166 T3x = FNMS(Tl, TF, T3w);
Chris@82 167 }
Chris@82 168 {
Chris@82 169 E TW, TX, TY, T2M;
Chris@82 170 TW = ri[WS(rs, 19)];
Chris@82 171 TX = Tt * TW;
Chris@82 172 TY = ii[WS(rs, 19)];
Chris@82 173 T2M = Tt * TY;
Chris@82 174 TZ = FMA(Tv, TY, TX);
Chris@82 175 T2N = FNMS(Tv, TW, T2M);
Chris@82 176 }
Chris@82 177 {
Chris@82 178 E TL, TM, TP, T3y;
Chris@82 179 TL = ri[WS(rs, 14)];
Chris@82 180 TM = TK * TL;
Chris@82 181 TP = ii[WS(rs, 14)];
Chris@82 182 T3y = TK * TP;
Chris@82 183 TQ = FMA(TO, TP, TM);
Chris@82 184 T3z = FNMS(TO, TL, T3y);
Chris@82 185 }
Chris@82 186 {
Chris@82 187 E TS, TT, TU, T2K;
Chris@82 188 TS = ri[WS(rs, 9)];
Chris@82 189 TT = T3 * TS;
Chris@82 190 TU = ii[WS(rs, 9)];
Chris@82 191 T2K = T3 * TU;
Chris@82 192 TV = FMA(T6, TU, TT);
Chris@82 193 T2L = FNMS(T6, TS, T2K);
Chris@82 194 }
Chris@82 195 {
Chris@82 196 E TR, T10, T4i, T4j;
Chris@82 197 TR = TI + TQ;
Chris@82 198 T10 = TV + TZ;
Chris@82 199 T11 = TR - T10;
Chris@82 200 T2v = TR + T10;
Chris@82 201 T4i = T3x + T3z;
Chris@82 202 T4j = T2L + T2N;
Chris@82 203 T4k = T4i - T4j;
Chris@82 204 T4v = T4i + T4j;
Chris@82 205 }
Chris@82 206 {
Chris@82 207 E T2J, T2O, T3A, T3B;
Chris@82 208 T2J = TI - TQ;
Chris@82 209 T2O = T2L - T2N;
Chris@82 210 T2P = T2J - T2O;
Chris@82 211 T3P = T2J + T2O;
Chris@82 212 T3A = T3x - T3z;
Chris@82 213 T3B = TV - TZ;
Chris@82 214 T3C = T3A + T3B;
Chris@82 215 T3Z = T3A - T3B;
Chris@82 216 }
Chris@82 217 }
/* Elements 12, 7, 2 and 17, using pairs (T21, T24), (T2k, T2n), (T1n, T1p), (T2d, T2h). */
Chris@82 218 {
Chris@82 219 E T26, T3p, T2p, T39, T2a, T3r, T2j, T37;
Chris@82 220 {
Chris@82 221 E T22, T23, T25, T3o;
Chris@82 222 T22 = ri[WS(rs, 12)];
Chris@82 223 T23 = T21 * T22;
Chris@82 224 T25 = ii[WS(rs, 12)];
Chris@82 225 T3o = T21 * T25;
Chris@82 226 T26 = FMA(T24, T25, T23);
Chris@82 227 T3p = FNMS(T24, T22, T3o);
Chris@82 228 }
Chris@82 229 {
Chris@82 230 E T2l, T2m, T2o, T38;
Chris@82 231 T2l = ri[WS(rs, 7)];
Chris@82 232 T2m = T2k * T2l;
Chris@82 233 T2o = ii[WS(rs, 7)];
Chris@82 234 T38 = T2k * T2o;
Chris@82 235 T2p = FMA(T2n, T2o, T2m);
Chris@82 236 T39 = FNMS(T2n, T2l, T38);
Chris@82 237 }
Chris@82 238 {
Chris@82 239 E T27, T28, T29, T3q;
Chris@82 240 T27 = ri[WS(rs, 2)];
Chris@82 241 T28 = T1n * T27;
Chris@82 242 T29 = ii[WS(rs, 2)];
Chris@82 243 T3q = T1n * T29;
Chris@82 244 T2a = FMA(T1p, T29, T28);
Chris@82 245 T3r = FNMS(T1p, T27, T3q);
Chris@82 246 }
Chris@82 247 {
Chris@82 248 E T2e, T2f, T2i, T36;
Chris@82 249 T2e = ri[WS(rs, 17)];
Chris@82 250 T2f = T2d * T2e;
Chris@82 251 T2i = ii[WS(rs, 17)];
Chris@82 252 T36 = T2d * T2i;
Chris@82 253 T2j = FMA(T2h, T2i, T2f);
Chris@82 254 T37 = FNMS(T2h, T2e, T36);
Chris@82 255 }
Chris@82 256 {
Chris@82 257 E T2b, T2q, T4e, T4f;
Chris@82 258 T2b = T26 + T2a;
Chris@82 259 T2q = T2j + T2p;
Chris@82 260 T2r = T2b - T2q;
Chris@82 261 T2z = T2b + T2q;
Chris@82 262 T4e = T3p + T3r;
Chris@82 263 T4f = T37 + T39;
Chris@82 264 T4g = T4e - T4f;
Chris@82 265 T4z = T4e + T4f;
Chris@82 266 }
Chris@82 267 {
Chris@82 268 E T35, T3a, T3s, T3t;
Chris@82 269 T35 = T26 - T2a;
Chris@82 270 T3a = T37 - T39;
Chris@82 271 T3b = T35 - T3a;
Chris@82 272 T3T = T35 + T3a;
Chris@82 273 T3s = T3p - T3r;
Chris@82 274 T3t = T2j - T2p;
Chris@82 275 T3u = T3s + T3t;
Chris@82 276 T43 = T3s - T3t;
Chris@82 277 }
Chris@82 278 }
/* Elements 8, 3, 18 and 13, using pairs (T1A, T1D), (Tf, Th), (T1H, T1L), (T1P, T1S). */
Chris@82 279 {
Chris@82 280 E T1F, T3i, T1Y, T32, T1N, T3k, T1U, T30;
Chris@82 281 {
Chris@82 282 E T1B, T1C, T1E, T3h;
Chris@82 283 T1B = ri[WS(rs, 8)];
Chris@82 284 T1C = T1A * T1B;
Chris@82 285 T1E = ii[WS(rs, 8)];
Chris@82 286 T3h = T1A * T1E;
Chris@82 287 T1F = FMA(T1D, T1E, T1C);
Chris@82 288 T3i = FNMS(T1D, T1B, T3h);
Chris@82 289 }
Chris@82 290 {
Chris@82 291 E T1V, T1W, T1X, T31;
Chris@82 292 T1V = ri[WS(rs, 3)];
Chris@82 293 T1W = Tf * T1V;
Chris@82 294 T1X = ii[WS(rs, 3)];
Chris@82 295 T31 = Tf * T1X;
Chris@82 296 T1Y = FMA(Th, T1X, T1W);
Chris@82 297 T32 = FNMS(Th, T1V, T31);
Chris@82 298 }
Chris@82 299 {
Chris@82 300 E T1I, T1J, T1M, T3j;
Chris@82 301 T1I = ri[WS(rs, 18)];
Chris@82 302 T1J = T1H * T1I;
Chris@82 303 T1M = ii[WS(rs, 18)];
Chris@82 304 T3j = T1H * T1M;
Chris@82 305 T1N = FMA(T1L, T1M, T1J);
Chris@82 306 T3k = FNMS(T1L, T1I, T3j);
Chris@82 307 }
Chris@82 308 {
Chris@82 309 E T1Q, T1R, T1T, T2Z;
Chris@82 310 T1Q = ri[WS(rs, 13)];
Chris@82 311 T1R = T1P * T1Q;
Chris@82 312 T1T = ii[WS(rs, 13)];
Chris@82 313 T2Z = T1P * T1T;
Chris@82 314 T1U = FMA(T1S, T1T, T1R);
Chris@82 315 T30 = FNMS(T1S, T1Q, T2Z);
Chris@82 316 }
Chris@82 317 {
Chris@82 318 E T1O, T1Z, T4b, T4c;
Chris@82 319 T1O = T1F + T1N;
Chris@82 320 T1Z = T1U + T1Y;
Chris@82 321 T20 = T1O - T1Z;
Chris@82 322 T2y = T1O + T1Z;
Chris@82 323 T4b = T3i + T3k;
Chris@82 324 T4c = T30 + T32;
Chris@82 325 T4d = T4b - T4c;
Chris@82 326 T4y = T4b + T4c;
Chris@82 327 }
Chris@82 328 {
Chris@82 329 E T2Y, T33, T3l, T3m;
Chris@82 330 T2Y = T1F - T1N;
Chris@82 331 T33 = T30 - T32;
Chris@82 332 T34 = T2Y - T33;
Chris@82 333 T3S = T2Y + T33;
Chris@82 334 T3l = T3i - T3k;
Chris@82 335 T3m = T1U - T1Y;
Chris@82 336 T3n = T3l + T3m;
Chris@82 337 T42 = T3l - T3m;
Chris@82 338 }
Chris@82 339 }
/* Elements 16, 11, 6 and 1, using pairs (T13, T17), (T1q, T1u), (T1b, T1f), (T2, T5). */
Chris@82 340 {
Chris@82 341 E T19, T3E, T1w, T2U, T1h, T3G, T1m, T2S;
Chris@82 342 {
Chris@82 343 E T14, T15, T18, T3D;
Chris@82 344 T14 = ri[WS(rs, 16)];
Chris@82 345 T15 = T13 * T14;
Chris@82 346 T18 = ii[WS(rs, 16)];
Chris@82 347 T3D = T13 * T18;
Chris@82 348 T19 = FMA(T17, T18, T15);
Chris@82 349 T3E = FNMS(T17, T14, T3D);
Chris@82 350 }
Chris@82 351 {
Chris@82 352 E T1r, T1s, T1v, T2T;
Chris@82 353 T1r = ri[WS(rs, 11)];
Chris@82 354 T1s = T1q * T1r;
Chris@82 355 T1v = ii[WS(rs, 11)];
Chris@82 356 T2T = T1q * T1v;
Chris@82 357 T1w = FMA(T1u, T1v, T1s);
Chris@82 358 T2U = FNMS(T1u, T1r, T2T);
Chris@82 359 }
Chris@82 360 {
Chris@82 361 E T1c, T1d, T1g, T3F;
Chris@82 362 T1c = ri[WS(rs, 6)];
Chris@82 363 T1d = T1b * T1c;
Chris@82 364 T1g = ii[WS(rs, 6)];
Chris@82 365 T3F = T1b * T1g;
Chris@82 366 T1h = FMA(T1f, T1g, T1d);
Chris@82 367 T3G = FNMS(T1f, T1c, T3F);
Chris@82 368 }
Chris@82 369 {
Chris@82 370 E T1j, T1k, T1l, T2R;
Chris@82 371 T1j = ri[WS(rs, 1)];
Chris@82 372 T1k = T2 * T1j;
Chris@82 373 T1l = ii[WS(rs, 1)];
Chris@82 374 T2R = T2 * T1l;
Chris@82 375 T1m = FMA(T5, T1l, T1k);
Chris@82 376 T2S = FNMS(T5, T1j, T2R);
Chris@82 377 }
Chris@82 378 {
Chris@82 379 E T1i, T1x, T4l, T4m;
Chris@82 380 T1i = T19 + T1h;
Chris@82 381 T1x = T1m + T1w;
Chris@82 382 T1y = T1i - T1x;
Chris@82 383 T2w = T1i + T1x;
Chris@82 384 T4l = T3E + T3G;
Chris@82 385 T4m = T2S + T2U;
Chris@82 386 T4n = T4l - T4m;
Chris@82 387 T4w = T4l + T4m;
Chris@82 388 }
Chris@82 389 {
Chris@82 390 E T2Q, T2V, T3H, T3I;
Chris@82 391 T2Q = T19 - T1h;
Chris@82 392 T2V = T2S - T2U;
Chris@82 393 T2W = T2Q - T2V;
Chris@82 394 T3Q = T2Q + T2V;
Chris@82 395 T3H = T3E - T3G;
Chris@82 396 T3I = T1m - T1w;
Chris@82 397 T3J = T3H + T3I;
Chris@82 398 T40 = T3H - T3I;
Chris@82 399 }
Chris@82 400 }
/* Stores begin here; every input has already been read into a temporary. */
/* Real outputs 10, 14, 6, 2, 18. */
Chris@82 401 {
Chris@82 402 E T4p, T4r, TE, T2t, T48, T49, T4q, T4a;
Chris@82 403 {
Chris@82 404 E T4h, T4o, T1z, T2s;
Chris@82 405 T4h = T4d - T4g;
Chris@82 406 T4o = T4k - T4n;
Chris@82 407 T4p = FNMS(KP618033988, T4o, T4h);
Chris@82 408 T4r = FMA(KP618033988, T4h, T4o);
Chris@82 409 TE = Te - TD;
Chris@82 410 T1z = T11 + T1y;
Chris@82 411 T2s = T20 + T2r;
Chris@82 412 T2t = T1z + T2s;
Chris@82 413 T48 = FNMS(KP250000000, T2t, TE);
Chris@82 414 T49 = T1z - T2s;
Chris@82 415 }
Chris@82 416 ri[WS(rs, 10)] = TE + T2t;
Chris@82 417 T4q = FMA(KP559016994, T49, T48);
Chris@82 418 ri[WS(rs, 14)] = FNMS(KP951056516, T4r, T4q);
Chris@82 419 ri[WS(rs, 6)] = FMA(KP951056516, T4r, T4q);
Chris@82 420 T4a = FNMS(KP559016994, T49, T48);
Chris@82 421 ri[WS(rs, 2)] = FNMS(KP951056516, T4p, T4a);
Chris@82 422 ri[WS(rs, 18)] = FMA(KP951056516, T4p, T4a);
Chris@82 423 }
/* Imaginary outputs 10, 6, 14, 2, 18. */
Chris@82 424 {
Chris@82 425 E T54, T56, T4V, T4Y, T4Z, T50, T55, T51;
Chris@82 426 {
Chris@82 427 E T52, T53, T4W, T4X;
Chris@82 428 T52 = T20 - T2r;
Chris@82 429 T53 = T11 - T1y;
Chris@82 430 T54 = FNMS(KP618033988, T53, T52);
Chris@82 431 T56 = FMA(KP618033988, T52, T53);
Chris@82 432 T4V = T4L - T4H;
Chris@82 433 T4W = T4k + T4n;
Chris@82 434 T4X = T4d + T4g;
Chris@82 435 T4Y = T4W + T4X;
Chris@82 436 T4Z = FNMS(KP250000000, T4Y, T4V);
Chris@82 437 T50 = T4W - T4X;
Chris@82 438 }
Chris@82 439 ii[WS(rs, 10)] = T4Y + T4V;
Chris@82 440 T55 = FMA(KP559016994, T50, T4Z);
Chris@82 441 ii[WS(rs, 6)] = FNMS(KP951056516, T56, T55);
Chris@82 442 ii[WS(rs, 14)] = FMA(KP951056516, T56, T55);
Chris@82 443 T51 = FNMS(KP559016994, T50, T4Z);
Chris@82 444 ii[WS(rs, 2)] = FMA(KP951056516, T54, T51);
Chris@82 445 ii[WS(rs, 18)] = FNMS(KP951056516, T54, T51);
Chris@82 446 }
/* Real outputs 0, 12, 8, 4, 16. */
Chris@82 447 {
Chris@82 448 E T4B, T4D, T2u, T2B, T4s, T4t, T4C, T4u;
Chris@82 449 {
Chris@82 450 E T4x, T4A, T2x, T2A;
Chris@82 451 T4x = T4v - T4w;
Chris@82 452 T4A = T4y - T4z;
Chris@82 453 T4B = FMA(KP618033988, T4A, T4x);
Chris@82 454 T4D = FNMS(KP618033988, T4x, T4A);
Chris@82 455 T2u = Te + TD;
Chris@82 456 T2x = T2v + T2w;
Chris@82 457 T2A = T2y + T2z;
Chris@82 458 T2B = T2x + T2A;
Chris@82 459 T4s = FNMS(KP250000000, T2B, T2u);
Chris@82 460 T4t = T2x - T2A;
Chris@82 461 }
Chris@82 462 ri[0] = T2u + T2B;
Chris@82 463 T4C = FNMS(KP559016994, T4t, T4s);
Chris@82 464 ri[WS(rs, 12)] = FNMS(KP951056516, T4D, T4C);
Chris@82 465 ri[WS(rs, 8)] = FMA(KP951056516, T4D, T4C);
Chris@82 466 T4u = FMA(KP559016994, T4t, T4s);
Chris@82 467 ri[WS(rs, 4)] = FNMS(KP951056516, T4B, T4u);
Chris@82 468 ri[WS(rs, 16)] = FMA(KP951056516, T4B, T4u);
Chris@82 469 }
/* Imaginary outputs 0, 8, 12, 4, 16. */
Chris@82 470 {
Chris@82 471 E T4S, T4U, T4M, T4G, T4N, T4O, T4T, T4P;
Chris@82 472 {
Chris@82 473 E T4Q, T4R, T4E, T4F;
Chris@82 474 T4Q = T2v - T2w;
Chris@82 475 T4R = T2y - T2z;
Chris@82 476 T4S = FMA(KP618033988, T4R, T4Q);
Chris@82 477 T4U = FNMS(KP618033988, T4Q, T4R);
Chris@82 478 T4M = T4H + T4L;
Chris@82 479 T4E = T4v + T4w;
Chris@82 480 T4F = T4y + T4z;
Chris@82 481 T4G = T4E + T4F;
Chris@82 482 T4N = FNMS(KP250000000, T4G, T4M);
Chris@82 483 T4O = T4E - T4F;
Chris@82 484 }
Chris@82 485 ii[0] = T4G + T4M;
Chris@82 486 T4T = FNMS(KP559016994, T4O, T4N);
Chris@82 487 ii[WS(rs, 8)] = FNMS(KP951056516, T4U, T4T);
Chris@82 488 ii[WS(rs, 12)] = FMA(KP951056516, T4U, T4T);
Chris@82 489 T4P = FMA(KP559016994, T4O, T4N);
Chris@82 490 ii[WS(rs, 4)] = FMA(KP951056516, T4S, T4P);
Chris@82 491 ii[WS(rs, 16)] = FNMS(KP951056516, T4S, T4P);
Chris@82 492 }
/* Real outputs 15, 11, 19, 3, 7. */
Chris@82 493 {
Chris@82 494 E T3L, T3N, T2I, T3d, T3e, T3f, T3M, T3g;
Chris@82 495 {
Chris@82 496 E T3v, T3K, T2X, T3c;
Chris@82 497 T3v = T3n - T3u;
Chris@82 498 T3K = T3C - T3J;
Chris@82 499 T3L = FNMS(KP618033988, T3K, T3v);
Chris@82 500 T3N = FMA(KP618033988, T3v, T3K);
Chris@82 501 T2I = T2C - T2H;
Chris@82 502 T2X = T2P + T2W;
Chris@82 503 T3c = T34 + T3b;
Chris@82 504 T3d = T2X + T3c;
Chris@82 505 T3e = FNMS(KP250000000, T3d, T2I);
Chris@82 506 T3f = T2X - T3c;
Chris@82 507 }
Chris@82 508 ri[WS(rs, 15)] = T2I + T3d;
Chris@82 509 T3M = FMA(KP559016994, T3f, T3e);
Chris@82 510 ri[WS(rs, 11)] = FMA(KP951056516, T3N, T3M);
Chris@82 511 ri[WS(rs, 19)] = FNMS(KP951056516, T3N, T3M);
Chris@82 512 T3g = FNMS(KP559016994, T3f, T3e);
Chris@82 513 ri[WS(rs, 3)] = FMA(KP951056516, T3L, T3g);
Chris@82 514 ri[WS(rs, 7)] = FNMS(KP951056516, T3L, T3g);
Chris@82 515 }
/* Imaginary outputs 15, 11, 19, 3, 7. */
Chris@82 516 {
Chris@82 517 E T5u, T5w, T5l, T5o, T5p, T5q, T5v, T5r;
Chris@82 518 {
Chris@82 519 E T5s, T5t, T5m, T5n;
Chris@82 520 T5s = T34 - T3b;
Chris@82 521 T5t = T2P - T2W;
Chris@82 522 T5u = FNMS(KP618033988, T5t, T5s);
Chris@82 523 T5w = FMA(KP618033988, T5s, T5t);
Chris@82 524 T5l = T58 + T57;
Chris@82 525 T5m = T3C + T3J;
Chris@82 526 T5n = T3n + T3u;
Chris@82 527 T5o = T5m + T5n;
Chris@82 528 T5p = FNMS(KP250000000, T5o, T5l);
Chris@82 529 T5q = T5m - T5n;
Chris@82 530 }
Chris@82 531 ii[WS(rs, 15)] = T5o + T5l;
Chris@82 532 T5v = FMA(KP559016994, T5q, T5p);
Chris@82 533 ii[WS(rs, 11)] = FNMS(KP951056516, T5w, T5v);
Chris@82 534 ii[WS(rs, 19)] = FMA(KP951056516, T5w, T5v);
Chris@82 535 T5r = FNMS(KP559016994, T5q, T5p);
Chris@82 536 ii[WS(rs, 3)] = FNMS(KP951056516, T5u, T5r);
Chris@82 537 ii[WS(rs, 7)] = FMA(KP951056516, T5u, T5r);
Chris@82 538 }
/* Real outputs 5, 13, 17, 1, 9. */
Chris@82 539 {
Chris@82 540 E T45, T47, T3O, T3V, T3W, T3X, T46, T3Y;
Chris@82 541 {
Chris@82 542 E T41, T44, T3R, T3U;
Chris@82 543 T41 = T3Z - T40;
Chris@82 544 T44 = T42 - T43;
Chris@82 545 T45 = FMA(KP618033988, T44, T41);
Chris@82 546 T47 = FNMS(KP618033988, T41, T44);
Chris@82 547 T3O = T2C + T2H;
Chris@82 548 T3R = T3P + T3Q;
Chris@82 549 T3U = T3S + T3T;
Chris@82 550 T3V = T3R + T3U;
Chris@82 551 T3W = FNMS(KP250000000, T3V, T3O);
Chris@82 552 T3X = T3R - T3U;
Chris@82 553 }
Chris@82 554 ri[WS(rs, 5)] = T3O + T3V;
Chris@82 555 T46 = FNMS(KP559016994, T3X, T3W);
Chris@82 556 ri[WS(rs, 13)] = FMA(KP951056516, T47, T46);
Chris@82 557 ri[WS(rs, 17)] = FNMS(KP951056516, T47, T46);
Chris@82 558 T3Y = FMA(KP559016994, T3X, T3W);
Chris@82 559 ri[WS(rs, 1)] = FMA(KP951056516, T45, T3Y);
Chris@82 560 ri[WS(rs, 9)] = FNMS(KP951056516, T45, T3Y);
Chris@82 561 }
/* Imaginary outputs 5, 13, 17, 1, 9. */
Chris@82 562 {
Chris@82 563 E T5i, T5k, T59, T5c, T5d, T5e, T5j, T5f;
Chris@82 564 {
Chris@82 565 E T5g, T5h, T5a, T5b;
Chris@82 566 T5g = T3P - T3Q;
Chris@82 567 T5h = T3S - T3T;
Chris@82 568 T5i = FMA(KP618033988, T5h, T5g);
Chris@82 569 T5k = FNMS(KP618033988, T5g, T5h);
Chris@82 570 T59 = T57 - T58;
Chris@82 571 T5a = T3Z + T40;
Chris@82 572 T5b = T42 + T43;
Chris@82 573 T5c = T5a + T5b;
Chris@82 574 T5d = FNMS(KP250000000, T5c, T59);
Chris@82 575 T5e = T5a - T5b;
Chris@82 576 }
Chris@82 577 ii[WS(rs, 5)] = T5c + T59;
Chris@82 578 T5j = FNMS(KP559016994, T5e, T5d);
Chris@82 579 ii[WS(rs, 13)] = FNMS(KP951056516, T5k, T5j);
Chris@82 580 ii[WS(rs, 17)] = FMA(KP951056516, T5k, T5j);
Chris@82 581 T5f = FMA(KP559016994, T5e, T5d);
Chris@82 582 ii[WS(rs, 1)] = FNMS(KP951056516, T5i, T5f);
Chris@82 583 ii[WS(rs, 9)] = FMA(KP951056516, T5i, T5f);
Chris@82 584 }
Chris@82 585 }
Chris@82 586 }
Chris@82 587 }
Chris@82 588 }
Chris@82 589
/*
 * Twiddle instruction table for the FMA variant of t2_20. The four TW_CEXP
 * entries carry the indices 1, 3, 9 and 19, which are exactly the elements
 * t2_20 multiplies directly by the stored pairs W[0..1], W[2..3], W[4..5] and
 * W[6..7]; the list ends with a TW_NEXT entry.
 * NOTE(review): the tw_instr field layout and the precise meaning of TW_CEXP /
 * TW_NEXT are defined outside this file -- confirm against the FFTW headers.
 */
Chris@82 590 static const tw_instr twinstr[] = {
Chris@82 591 {TW_CEXP, 0, 1},
Chris@82 592 {TW_CEXP, 0, 3},
Chris@82 593 {TW_CEXP, 0, 9},
Chris@82 594 {TW_CEXP, 0, 19},
Chris@82 595 {TW_NEXT, 1, 0}
Chris@82 596 };
Chris@82 597
/*
 * Codelet descriptor for the FMA variant: 20, the name "t2_20", the twiddle
 * instruction table, &GENUS, and the counts {136, 58, 140, 0}, which match the
 * 136 additions, 58 multiplications and 140 fused multiply/adds reported in
 * the generator comment for this variant.
 * NOTE(review): the meaning of the three trailing zeros is defined by ct_desc,
 * which is declared elsewhere -- confirm before changing them.
 */
Chris@82 598 static const ct_desc desc = { 20, "t2_20", twinstr, &GENUS, {136, 58, 140, 0}, 0, 0, 0 };
Chris@82 599
/*
 * Registration entry point: hands the t2_20 kernel and its descriptor to the
 * planner p through X(kdft_dit_register). Returns nothing.
 */
Chris@82 600 void X(codelet_t2_20) (planner *p) {
Chris@82 601 X(kdft_dit_register) (p, t2_20, &desc);
Chris@82 602 }
Chris@82 603 #else
Chris@82 604
Chris@82 605 /* Generated by: ../../../genfft/gen_twiddle.native -compact -variables 4 -pipeline-latency 4 -twiddle-log3 -precompute-twiddles -n 20 -name t2_20 -include dft/scalar/t.h */
Chris@82 606
Chris@82 607 /*
Chris@82 608 * This function contains 276 FP additions, 164 FP multiplications,
Chris@82 609 * (or, 204 additions, 92 multiplications, 72 fused multiply/add),
Chris@82 610 * 123 stack variables, 4 constants, and 80 memory accesses
Chris@82 611 */
Chris@82 612 #include "dft/scalar/t.h"
Chris@82 613
Chris@82 614 static void t2_20(R *ri, R *ii, const R *W, stride rs, INT mb, INT me, INT ms)
Chris@82 615 {
Chris@82 616 DK(KP587785252, +0.587785252292473129168705954639072768597652438);
Chris@82 617 DK(KP951056516, +0.951056516295153572116439333379382143405698634);
Chris@82 618 DK(KP250000000, +0.250000000000000000000000000000000000000000000);
Chris@82 619 DK(KP559016994, +0.559016994374947424102293417182819058860154590);
Chris@82 620 {
Chris@82 621 INT m;
Chris@82 622 for (m = mb, W = W + (mb * 8); m < me; m = m + 1, ri = ri + ms, ii = ii + ms, W = W + 8, MAKE_VOLATILE_STRIDE(40, rs)) {
Chris@82 623 E T2, T5, Tg, Ti, Tk, To, T1h, T1f, T6, T3, T8, T14, T1Q, Tc, T1O;
Chris@82 624 E T1v, T18, T1t, T1n, T24, T1j, T22, Tq, Tu, T1E, T1G, Tx, Ty, Tz, TJ;
Chris@82 625 E T1Z, TB, T1X, T1A, TZ, TL, T1y, TX;
Chris@82 626 {
Chris@82 627 E T7, T16, Ta, T13, T4, T17, Tb, T12;
Chris@82 628 {
Chris@82 629 E Th, Tn, Tj, Tm;
Chris@82 630 T2 = W[0];
Chris@82 631 T5 = W[1];
Chris@82 632 Tg = W[2];
Chris@82 633 Ti = W[3];
Chris@82 634 Th = T2 * Tg;
Chris@82 635 Tn = T5 * Tg;
Chris@82 636 Tj = T5 * Ti;
Chris@82 637 Tm = T2 * Ti;
Chris@82 638 Tk = Th - Tj;
Chris@82 639 To = Tm + Tn;
Chris@82 640 T1h = Tm - Tn;
Chris@82 641 T1f = Th + Tj;
Chris@82 642 T6 = W[5];
Chris@82 643 T7 = T5 * T6;
Chris@82 644 T16 = Tg * T6;
Chris@82 645 Ta = T2 * T6;
Chris@82 646 T13 = Ti * T6;
Chris@82 647 T3 = W[4];
Chris@82 648 T4 = T2 * T3;
Chris@82 649 T17 = Ti * T3;
Chris@82 650 Tb = T5 * T3;
Chris@82 651 T12 = Tg * T3;
Chris@82 652 }
Chris@82 653 T8 = T4 - T7;
Chris@82 654 T14 = T12 + T13;
Chris@82 655 T1Q = T16 + T17;
Chris@82 656 Tc = Ta + Tb;
Chris@82 657 T1O = T12 - T13;
Chris@82 658 T1v = Ta - Tb;
Chris@82 659 T18 = T16 - T17;
Chris@82 660 T1t = T4 + T7;
Chris@82 661 {
Chris@82 662 E T1l, T1m, T1g, T1i;
Chris@82 663 T1l = T1f * T6;
Chris@82 664 T1m = T1h * T3;
Chris@82 665 T1n = T1l + T1m;
Chris@82 666 T24 = T1l - T1m;
Chris@82 667 T1g = T1f * T3;
Chris@82 668 T1i = T1h * T6;
Chris@82 669 T1j = T1g - T1i;
Chris@82 670 T22 = T1g + T1i;
Chris@82 671 {
Chris@82 672 E Tl, Tp, Ts, Tt;
Chris@82 673 Tl = Tk * T3;
Chris@82 674 Tp = To * T6;
Chris@82 675 Tq = Tl + Tp;
Chris@82 676 Ts = Tk * T6;
Chris@82 677 Tt = To * T3;
Chris@82 678 Tu = Ts - Tt;
Chris@82 679 T1E = Tl - Tp;
Chris@82 680 T1G = Ts + Tt;
Chris@82 681 Tx = W[6];
Chris@82 682 Ty = W[7];
Chris@82 683 Tz = FMA(Tk, Tx, To * Ty);
Chris@82 684 TJ = FMA(Tq, Tx, Tu * Ty);
Chris@82 685 T1Z = FNMS(T1h, Tx, T1f * Ty);
Chris@82 686 TB = FNMS(To, Tx, Tk * Ty);
Chris@82 687 T1X = FMA(T1f, Tx, T1h * Ty);
Chris@82 688 T1A = FNMS(T5, Tx, T2 * Ty);
Chris@82 689 TZ = FNMS(Ti, Tx, Tg * Ty);
Chris@82 690 TL = FNMS(Tu, Tx, Tq * Ty);
Chris@82 691 T1y = FMA(T2, Tx, T5 * Ty);
Chris@82 692 TX = FMA(Tg, Tx, Ti * Ty);
Chris@82 693 }
Chris@82 694 }
Chris@82 695 }
Chris@82 696 {
Chris@82 697 E TF, T2b, T4A, T4J, T2K, T3r, T4a, T4m, T1N, T28, T29, T3C, T3F, T4o, T3X;
Chris@82 698 E T3Y, T44, T2f, T2g, T2h, T2n, T2s, T4L, T3g, T3h, T4w, T3n, T3o, T3p, T30;
Chris@82 699 E T35, T36, TW, T1r, T1s, T3J, T3M, T4n, T3U, T3V, T43, T2c, T2d, T2e, T2y;
Chris@82 700 E T2D, T4K, T3d, T3e, T4v, T3k, T3l, T3m, T2P, T2U, T2V;
Chris@82 701 {
Chris@82 702 E T1, T48, Te, T47, Tw, T2H, TD, T2I, T9, Td;
Chris@82 703 T1 = ri[0];
Chris@82 704 T48 = ii[0];
Chris@82 705 T9 = ri[WS(rs, 10)];
Chris@82 706 Td = ii[WS(rs, 10)];
Chris@82 707 Te = FMA(T8, T9, Tc * Td);
Chris@82 708 T47 = FNMS(Tc, T9, T8 * Td);
Chris@82 709 {
Chris@82 710 E Tr, Tv, TA, TC;
Chris@82 711 Tr = ri[WS(rs, 5)];
Chris@82 712 Tv = ii[WS(rs, 5)];
Chris@82 713 Tw = FMA(Tq, Tr, Tu * Tv);
Chris@82 714 T2H = FNMS(Tu, Tr, Tq * Tv);
Chris@82 715 TA = ri[WS(rs, 15)];
Chris@82 716 TC = ii[WS(rs, 15)];
Chris@82 717 TD = FMA(Tz, TA, TB * TC);
Chris@82 718 T2I = FNMS(TB, TA, Tz * TC);
Chris@82 719 }
Chris@82 720 {
Chris@82 721 E Tf, TE, T4y, T4z;
Chris@82 722 Tf = T1 + Te;
Chris@82 723 TE = Tw + TD;
Chris@82 724 TF = Tf - TE;
Chris@82 725 T2b = Tf + TE;
Chris@82 726 T4y = T48 - T47;
Chris@82 727 T4z = Tw - TD;
Chris@82 728 T4A = T4y - T4z;
Chris@82 729 T4J = T4z + T4y;
Chris@82 730 }
Chris@82 731 {
Chris@82 732 E T2G, T2J, T46, T49;
Chris@82 733 T2G = T1 - Te;
Chris@82 734 T2J = T2H - T2I;
Chris@82 735 T2K = T2G - T2J;
Chris@82 736 T3r = T2G + T2J;
Chris@82 737 T46 = T2H + T2I;
Chris@82 738 T49 = T47 + T48;
Chris@82 739 T4a = T46 + T49;
Chris@82 740 T4m = T49 - T46;
Chris@82 741 }
Chris@82 742 }
Chris@82 743 {
Chris@82 744 E T1D, T3A, T2l, T2W, T27, T3E, T2r, T34, T1M, T3B, T2m, T2Z, T1W, T3D, T2q;
Chris@82 745 E T31;
Chris@82 746 {
Chris@82 747 E T1x, T2j, T1C, T2k;
Chris@82 748 {
Chris@82 749 E T1u, T1w, T1z, T1B;
Chris@82 750 T1u = ri[WS(rs, 8)];
Chris@82 751 T1w = ii[WS(rs, 8)];
Chris@82 752 T1x = FMA(T1t, T1u, T1v * T1w);
Chris@82 753 T2j = FNMS(T1v, T1u, T1t * T1w);
Chris@82 754 T1z = ri[WS(rs, 18)];
Chris@82 755 T1B = ii[WS(rs, 18)];
Chris@82 756 T1C = FMA(T1y, T1z, T1A * T1B);
Chris@82 757 T2k = FNMS(T1A, T1z, T1y * T1B);
Chris@82 758 }
Chris@82 759 T1D = T1x + T1C;
Chris@82 760 T3A = T2j + T2k;
Chris@82 761 T2l = T2j - T2k;
Chris@82 762 T2W = T1x - T1C;
Chris@82 763 }
Chris@82 764 {
Chris@82 765 E T21, T32, T26, T33;
Chris@82 766 {
Chris@82 767 E T1Y, T20, T23, T25;
Chris@82 768 T1Y = ri[WS(rs, 17)];
Chris@82 769 T20 = ii[WS(rs, 17)];
Chris@82 770 T21 = FMA(T1X, T1Y, T1Z * T20);
Chris@82 771 T32 = FNMS(T1Z, T1Y, T1X * T20);
Chris@82 772 T23 = ri[WS(rs, 7)];
Chris@82 773 T25 = ii[WS(rs, 7)];
Chris@82 774 T26 = FMA(T22, T23, T24 * T25);
Chris@82 775 T33 = FNMS(T24, T23, T22 * T25);
Chris@82 776 }
Chris@82 777 T27 = T21 + T26;
Chris@82 778 T3E = T32 + T33;
Chris@82 779 T2r = T21 - T26;
Chris@82 780 T34 = T32 - T33;
Chris@82 781 }
Chris@82 782 {
Chris@82 783 E T1I, T2X, T1L, T2Y;
Chris@82 784 {
Chris@82 785 E T1F, T1H, T1J, T1K;
Chris@82 786 T1F = ri[WS(rs, 13)];
Chris@82 787 T1H = ii[WS(rs, 13)];
Chris@82 788 T1I = FMA(T1E, T1F, T1G * T1H);
Chris@82 789 T2X = FNMS(T1G, T1F, T1E * T1H);
Chris@82 790 T1J = ri[WS(rs, 3)];
Chris@82 791 T1K = ii[WS(rs, 3)];
Chris@82 792 T1L = FMA(Tg, T1J, Ti * T1K);
Chris@82 793 T2Y = FNMS(Ti, T1J, Tg * T1K);
Chris@82 794 }
Chris@82 795 T1M = T1I + T1L;
Chris@82 796 T3B = T2X + T2Y;
Chris@82 797 T2m = T1I - T1L;
Chris@82 798 T2Z = T2X - T2Y;
Chris@82 799 }
Chris@82 800 {
Chris@82 801 E T1S, T2o, T1V, T2p;
Chris@82 802 {
Chris@82 803 E T1P, T1R, T1T, T1U;
Chris@82 804 T1P = ri[WS(rs, 12)];
Chris@82 805 T1R = ii[WS(rs, 12)];
Chris@82 806 T1S = FMA(T1O, T1P, T1Q * T1R);
Chris@82 807 T2o = FNMS(T1Q, T1P, T1O * T1R);
Chris@82 808 T1T = ri[WS(rs, 2)];
Chris@82 809 T1U = ii[WS(rs, 2)];
Chris@82 810 T1V = FMA(T1f, T1T, T1h * T1U);
Chris@82 811 T2p = FNMS(T1h, T1T, T1f * T1U);
Chris@82 812 }
Chris@82 813 T1W = T1S + T1V;
Chris@82 814 T3D = T2o + T2p;
Chris@82 815 T2q = T2o - T2p;
Chris@82 816 T31 = T1S - T1V;
Chris@82 817 }
Chris@82 818 T1N = T1D - T1M;
Chris@82 819 T28 = T1W - T27;
Chris@82 820 T29 = T1N + T28;
Chris@82 821 T3C = T3A - T3B;
Chris@82 822 T3F = T3D - T3E;
Chris@82 823 T4o = T3C + T3F;
Chris@82 824 T3X = T3A + T3B;
Chris@82 825 T3Y = T3D + T3E;
Chris@82 826 T44 = T3X + T3Y;
Chris@82 827 T2f = T1D + T1M;
Chris@82 828 T2g = T1W + T27;
Chris@82 829 T2h = T2f + T2g;
Chris@82 830 T2n = T2l + T2m;
Chris@82 831 T2s = T2q + T2r;
Chris@82 832 T4L = T2n + T2s;
Chris@82 833 T3g = T2l - T2m;
Chris@82 834 T3h = T2q - T2r;
Chris@82 835 T4w = T3g + T3h;
Chris@82 836 T3n = T2W + T2Z;
Chris@82 837 T3o = T31 + T34;
Chris@82 838 T3p = T3n + T3o;
Chris@82 839 T30 = T2W - T2Z;
Chris@82 840 T35 = T31 - T34;
Chris@82 841 T36 = T30 + T35;
Chris@82 842 }
Chris@82 843 {
Chris@82 844 E TO, T3H, T2w, T2L, T1q, T3L, T2C, T2T, TV, T3I, T2x, T2O, T1b, T3K, T2B;
Chris@82 845 E T2Q;
Chris@82 846 {
Chris@82 847 E TI, T2u, TN, T2v;
Chris@82 848 {
Chris@82 849 E TG, TH, TK, TM;
Chris@82 850 TG = ri[WS(rs, 4)];
Chris@82 851 TH = ii[WS(rs, 4)];
Chris@82 852 TI = FMA(Tk, TG, To * TH);
Chris@82 853 T2u = FNMS(To, TG, Tk * TH);
Chris@82 854 TK = ri[WS(rs, 14)];
Chris@82 855 TM = ii[WS(rs, 14)];
Chris@82 856 TN = FMA(TJ, TK, TL * TM);
Chris@82 857 T2v = FNMS(TL, TK, TJ * TM);
Chris@82 858 }
Chris@82 859 TO = TI + TN;
Chris@82 860 T3H = T2u + T2v;
Chris@82 861 T2w = T2u - T2v;
Chris@82 862 T2L = TI - TN;
Chris@82 863 }
Chris@82 864 {
Chris@82 865 E T1e, T2R, T1p, T2S;
Chris@82 866 {
Chris@82 867 E T1c, T1d, T1k, T1o;
Chris@82 868 T1c = ri[WS(rs, 1)];
Chris@82 869 T1d = ii[WS(rs, 1)];
Chris@82 870 T1e = FMA(T2, T1c, T5 * T1d);
Chris@82 871 T2R = FNMS(T5, T1c, T2 * T1d);
Chris@82 872 T1k = ri[WS(rs, 11)];
Chris@82 873 T1o = ii[WS(rs, 11)];
Chris@82 874 T1p = FMA(T1j, T1k, T1n * T1o);
Chris@82 875 T2S = FNMS(T1n, T1k, T1j * T1o);
Chris@82 876 }
Chris@82 877 T1q = T1e + T1p;
Chris@82 878 T3L = T2R + T2S;
Chris@82 879 T2C = T1e - T1p;
Chris@82 880 T2T = T2R - T2S;
Chris@82 881 }
Chris@82 882 {
Chris@82 883 E TR, T2M, TU, T2N;
Chris@82 884 {
Chris@82 885 E TP, TQ, TS, TT;
Chris@82 886 TP = ri[WS(rs, 9)];
Chris@82 887 TQ = ii[WS(rs, 9)];
Chris@82 888 TR = FMA(T3, TP, T6 * TQ);
Chris@82 889 T2M = FNMS(T6, TP, T3 * TQ);
Chris@82 890 TS = ri[WS(rs, 19)];
Chris@82 891 TT = ii[WS(rs, 19)];
Chris@82 892 TU = FMA(Tx, TS, Ty * TT);
Chris@82 893 T2N = FNMS(Ty, TS, Tx * TT);
Chris@82 894 }
Chris@82 895 TV = TR + TU;
Chris@82 896 T3I = T2M + T2N;
Chris@82 897 T2x = TR - TU;
Chris@82 898 T2O = T2M - T2N;
Chris@82 899 }
Chris@82 900 {
Chris@82 901 E T11, T2z, T1a, T2A;
Chris@82 902 {
Chris@82 903 E TY, T10, T15, T19;
Chris@82 904 TY = ri[WS(rs, 16)];
Chris@82 905 T10 = ii[WS(rs, 16)];
Chris@82 906 T11 = FMA(TX, TY, TZ * T10);
Chris@82 907 T2z = FNMS(TZ, TY, TX * T10);
Chris@82 908 T15 = ri[WS(rs, 6)];
Chris@82 909 T19 = ii[WS(rs, 6)];
Chris@82 910 T1a = FMA(T14, T15, T18 * T19);
Chris@82 911 T2A = FNMS(T18, T15, T14 * T19);
Chris@82 912 }
Chris@82 913 T1b = T11 + T1a;
Chris@82 914 T3K = T2z + T2A;
Chris@82 915 T2B = T2z - T2A;
Chris@82 916 T2Q = T11 - T1a;
Chris@82 917 }
Chris@82 918 TW = TO - TV;
Chris@82 919 T1r = T1b - T1q;
Chris@82 920 T1s = TW + T1r;
Chris@82 921 T3J = T3H - T3I;
Chris@82 922 T3M = T3K - T3L;
Chris@82 923 T4n = T3J + T3M;
Chris@82 924 T3U = T3H + T3I;
Chris@82 925 T3V = T3K + T3L;
Chris@82 926 T43 = T3U + T3V;
Chris@82 927 T2c = TO + TV;
Chris@82 928 T2d = T1b + T1q;
Chris@82 929 T2e = T2c + T2d;
Chris@82 930 T2y = T2w + T2x;
Chris@82 931 T2D = T2B + T2C;
Chris@82 932 T4K = T2y + T2D;
Chris@82 933 T3d = T2w - T2x;
Chris@82 934 T3e = T2B - T2C;
Chris@82 935 T4v = T3d + T3e;
Chris@82 936 T3k = T2L + T2O;
Chris@82 937 T3l = T2Q + T2T;
Chris@82 938 T3m = T3k + T3l;
Chris@82 939 T2P = T2L - T2O;
Chris@82 940 T2U = T2Q - T2T;
Chris@82 941 T2V = T2P + T2U;
Chris@82 942 }
Chris@82 943 {
Chris@82 944 E T3y, T2a, T3x, T3O, T3Q, T3G, T3N, T3P, T3z;
Chris@82 945 T3y = KP559016994 * (T1s - T29);
Chris@82 946 T2a = T1s + T29;
Chris@82 947 T3x = FNMS(KP250000000, T2a, TF);
Chris@82 948 T3G = T3C - T3F;
Chris@82 949 T3N = T3J - T3M;
Chris@82 950 T3O = FNMS(KP587785252, T3N, KP951056516 * T3G);
Chris@82 951 T3Q = FMA(KP951056516, T3N, KP587785252 * T3G);
Chris@82 952 ri[WS(rs, 10)] = TF + T2a;
Chris@82 953 T3P = T3y + T3x;
Chris@82 954 ri[WS(rs, 14)] = T3P - T3Q;
Chris@82 955 ri[WS(rs, 6)] = T3P + T3Q;
Chris@82 956 T3z = T3x - T3y;
Chris@82 957 ri[WS(rs, 2)] = T3z - T3O;
Chris@82 958 ri[WS(rs, 18)] = T3z + T3O;
Chris@82 959 }
Chris@82 960 {
Chris@82 961 E T4r, T4p, T4q, T4l, T4u, T4j, T4k, T4t, T4s;
Chris@82 962 T4r = KP559016994 * (T4n - T4o);
Chris@82 963 T4p = T4n + T4o;
Chris@82 964 T4q = FNMS(KP250000000, T4p, T4m);
Chris@82 965 T4j = T1N - T28;
Chris@82 966 T4k = TW - T1r;
Chris@82 967 T4l = FNMS(KP587785252, T4k, KP951056516 * T4j);
Chris@82 968 T4u = FMA(KP951056516, T4k, KP587785252 * T4j);
Chris@82 969 ii[WS(rs, 10)] = T4p + T4m;
Chris@82 970 T4t = T4r + T4q;
Chris@82 971 ii[WS(rs, 6)] = T4t - T4u;
Chris@82 972 ii[WS(rs, 14)] = T4u + T4t;
Chris@82 973 T4s = T4q - T4r;
Chris@82 974 ii[WS(rs, 2)] = T4l + T4s;
Chris@82 975 ii[WS(rs, 18)] = T4s - T4l;
Chris@82 976 }
Chris@82 977 {
Chris@82 978 E T3R, T2i, T3S, T40, T42, T3W, T3Z, T41, T3T;
Chris@82 979 T3R = KP559016994 * (T2e - T2h);
Chris@82 980 T2i = T2e + T2h;
Chris@82 981 T3S = FNMS(KP250000000, T2i, T2b);
Chris@82 982 T3W = T3U - T3V;
Chris@82 983 T3Z = T3X - T3Y;
Chris@82 984 T40 = FMA(KP951056516, T3W, KP587785252 * T3Z);
Chris@82 985 T42 = FNMS(KP587785252, T3W, KP951056516 * T3Z);
Chris@82 986 ri[0] = T2b + T2i;
Chris@82 987 T41 = T3S - T3R;
Chris@82 988 ri[WS(rs, 12)] = T41 - T42;
Chris@82 989 ri[WS(rs, 8)] = T41 + T42;
Chris@82 990 T3T = T3R + T3S;
Chris@82 991 ri[WS(rs, 4)] = T3T - T40;
Chris@82 992 ri[WS(rs, 16)] = T3T + T40;
Chris@82 993 }
Chris@82 994 {
Chris@82 995 E T4e, T45, T4f, T4d, T4i, T4b, T4c, T4h, T4g;
Chris@82 996 T4e = KP559016994 * (T43 - T44);
Chris@82 997 T45 = T43 + T44;
Chris@82 998 T4f = FNMS(KP250000000, T45, T4a);
Chris@82 999 T4b = T2c - T2d;
Chris@82 1000 T4c = T2f - T2g;
Chris@82 1001 T4d = FMA(KP951056516, T4b, KP587785252 * T4c);
Chris@82 1002 T4i = FNMS(KP587785252, T4b, KP951056516 * T4c);
Chris@82 1003 ii[0] = T45 + T4a;
Chris@82 1004 T4h = T4f - T4e;
Chris@82 1005 ii[WS(rs, 8)] = T4h - T4i;
Chris@82 1006 ii[WS(rs, 12)] = T4i + T4h;
Chris@82 1007 T4g = T4e + T4f;
Chris@82 1008 ii[WS(rs, 4)] = T4d + T4g;
Chris@82 1009 ii[WS(rs, 16)] = T4g - T4d;
Chris@82 1010 }
Chris@82 1011 {
Chris@82 1012 E T39, T37, T38, T2F, T3b, T2t, T2E, T3c, T3a;
Chris@82 1013 T39 = KP559016994 * (T2V - T36);
Chris@82 1014 T37 = T2V + T36;
Chris@82 1015 T38 = FNMS(KP250000000, T37, T2K);
Chris@82 1016 T2t = T2n - T2s;
Chris@82 1017 T2E = T2y - T2D;
Chris@82 1018 T2F = FNMS(KP587785252, T2E, KP951056516 * T2t);
Chris@82 1019 T3b = FMA(KP951056516, T2E, KP587785252 * T2t);
Chris@82 1020 ri[WS(rs, 15)] = T2K + T37;
Chris@82 1021 T3c = T39 + T38;
Chris@82 1022 ri[WS(rs, 11)] = T3b + T3c;
Chris@82 1023 ri[WS(rs, 19)] = T3c - T3b;
Chris@82 1024 T3a = T38 - T39;
Chris@82 1025 ri[WS(rs, 3)] = T2F + T3a;
Chris@82 1026 ri[WS(rs, 7)] = T3a - T2F;
Chris@82 1027 }
Chris@82 1028 {
Chris@82 1029 E T4O, T4M, T4N, T4S, T4U, T4Q, T4R, T4T, T4P;
Chris@82 1030 T4O = KP559016994 * (T4K - T4L);
Chris@82 1031 T4M = T4K + T4L;
Chris@82 1032 T4N = FNMS(KP250000000, T4M, T4J);
Chris@82 1033 T4Q = T30 - T35;
Chris@82 1034 T4R = T2P - T2U;
Chris@82 1035 T4S = FNMS(KP587785252, T4R, KP951056516 * T4Q);
Chris@82 1036 T4U = FMA(KP951056516, T4R, KP587785252 * T4Q);
Chris@82 1037 ii[WS(rs, 15)] = T4M + T4J;
Chris@82 1038 T4T = T4O + T4N;
Chris@82 1039 ii[WS(rs, 11)] = T4T - T4U;
Chris@82 1040 ii[WS(rs, 19)] = T4U + T4T;
Chris@82 1041 T4P = T4N - T4O;
Chris@82 1042 ii[WS(rs, 3)] = T4P - T4S;
Chris@82 1043 ii[WS(rs, 7)] = T4S + T4P;
Chris@82 1044 }
Chris@82 1045 {
Chris@82 1046 E T3q, T3s, T3t, T3j, T3v, T3f, T3i, T3w, T3u;
Chris@82 1047 T3q = KP559016994 * (T3m - T3p);
Chris@82 1048 T3s = T3m + T3p;
Chris@82 1049 T3t = FNMS(KP250000000, T3s, T3r);
Chris@82 1050 T3f = T3d - T3e;
Chris@82 1051 T3i = T3g - T3h;
Chris@82 1052 T3j = FMA(KP951056516, T3f, KP587785252 * T3i);
Chris@82 1053 T3v = FNMS(KP587785252, T3f, KP951056516 * T3i);
Chris@82 1054 ri[WS(rs, 5)] = T3r + T3s;
Chris@82 1055 T3w = T3t - T3q;
Chris@82 1056 ri[WS(rs, 13)] = T3v + T3w;
Chris@82 1057 ri[WS(rs, 17)] = T3w - T3v;
Chris@82 1058 T3u = T3q + T3t;
Chris@82 1059 ri[WS(rs, 1)] = T3j + T3u;
Chris@82 1060 ri[WS(rs, 9)] = T3u - T3j;
Chris@82 1061 }
Chris@82 1062 {
Chris@82 1063 E T4x, T4B, T4C, T4G, T4I, T4E, T4F, T4H, T4D;
Chris@82 1064 T4x = KP559016994 * (T4v - T4w);
Chris@82 1065 T4B = T4v + T4w;
Chris@82 1066 T4C = FNMS(KP250000000, T4B, T4A);
Chris@82 1067 T4E = T3k - T3l;
Chris@82 1068 T4F = T3n - T3o;
Chris@82 1069 T4G = FMA(KP951056516, T4E, KP587785252 * T4F);
Chris@82 1070 T4I = FNMS(KP587785252, T4E, KP951056516 * T4F);
Chris@82 1071 ii[WS(rs, 5)] = T4B + T4A;
Chris@82 1072 T4H = T4C - T4x;
Chris@82 1073 ii[WS(rs, 13)] = T4H - T4I;
Chris@82 1074 ii[WS(rs, 17)] = T4I + T4H;
Chris@82 1075 T4D = T4x + T4C;
Chris@82 1076 ii[WS(rs, 1)] = T4D - T4G;
Chris@82 1077 ii[WS(rs, 9)] = T4G + T4D;
Chris@82 1078 }
Chris@82 1079 }
Chris@82 1080 }
Chris@82 1081 }
Chris@82 1082 }
Chris@82 1083
/*
 * Twiddle instruction table for this codelet; it is referenced from the
 * `desc` initializer further down. It holds four TW_CEXP entries whose last
 * field is 1, 3, 9 and 19, followed by a single TW_NEXT entry that ends the
 * list.
 * NOTE(review): presumably the last field of each TW_CEXP entry is the
 * twiddle exponent that gets stored, with the remaining factors derived
 * from these four inside the codelet (would match the -twiddle-log3
 * generator flag shown in the file header) -- confirm against the tw_instr
 * definition.
 */
Chris@82 1084 static const tw_instr twinstr[] = {
Chris@82 1085 {TW_CEXP, 0, 1},
Chris@82 1086 {TW_CEXP, 0, 3},
Chris@82 1087 {TW_CEXP, 0, 9},
Chris@82 1088 {TW_CEXP, 0, 19},
Chris@82 1089 {TW_NEXT, 1, 0} /* last entry of the table */
Chris@82 1090 };
Chris@82 1091
/*
 * Descriptor passed to the registration call below. In order it carries:
 * the value 20, the name string "t2_20", the twinstr table, the address of
 * GENUS, a group of four numbers, and three trailing zeros.
 * NOTE(review): 20 is presumably the radix (the generator command line in
 * the file header uses -n 20), and {204, 92, 72, 0} presumably the
 * operation counts (additions, multiplications, fused multiply/adds,
 * other) -- confirm against the ct_desc definition.
 */
Chris@82 1092 static const ct_desc desc = { 20, "t2_20", twinstr, &GENUS, {204, 92, 72, 0}, 0, 0, 0 };
Chris@82 1093
/*
 * Registration hook for this codelet.
 *
 * p: planner, forwarded unchanged to X(kdft_dit_register) together with the
 *    t2_20 function and the address of its descriptor `desc`.
 *
 * Returns nothing; the only action taken here is that single call.
 */
Chris@82 1094 void X(codelet_t2_20) (planner *p) {
Chris@82 1095 X(kdft_dit_register) (p, t2_20, &desc);
Chris@82 1096 }
Chris@82 1097 #endif