annotate src/fftw-3.3.3/dft/scalar/codelets/t2_20.c @ 83:ae30d91d2ffe

Replace these with versions built using an older toolset (so as to avoid ABI incompatibilities when linking on Ubuntu 14.04 for packaging purposes)
author Chris Cannam
date Fri, 07 Feb 2020 11:51:13 +0000
parents 37bf6b4a2645
children
rev   line source
Chris@10 1 /*
Chris@10 2 * Copyright (c) 2003, 2007-11 Matteo Frigo
Chris@10 3 * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
Chris@10 4 *
Chris@10 5 * This program is free software; you can redistribute it and/or modify
Chris@10 6 * it under the terms of the GNU General Public License as published by
Chris@10 7 * the Free Software Foundation; either version 2 of the License, or
Chris@10 8 * (at your option) any later version.
Chris@10 9 *
Chris@10 10 * This program is distributed in the hope that it will be useful,
Chris@10 11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
Chris@10 12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
Chris@10 13 * GNU General Public License for more details.
Chris@10 14 *
Chris@10 15 * You should have received a copy of the GNU General Public License
Chris@10 16 * along with this program; if not, write to the Free Software
Chris@10 17 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
Chris@10 18 *
Chris@10 19 */
Chris@10 20
Chris@10 21 /* This file was automatically generated --- DO NOT EDIT */
Chris@10 22 /* Generated on Sun Nov 25 07:36:09 EST 2012 */
Chris@10 23
Chris@10 24 #include "codelet-dft.h"
Chris@10 25
Chris@10 26 #ifdef HAVE_FMA
Chris@10 27
Chris@10 28 /* Generated by: ../../../genfft/gen_twiddle.native -fma -reorder-insns -schedule-for-pipeline -compact -variables 4 -pipeline-latency 4 -twiddle-log3 -precompute-twiddles -n 20 -name t2_20 -include t.h */
Chris@10 29
Chris@10 30 /*
Chris@10 31 * This function contains 276 FP additions, 198 FP multiplications,
Chris@10 32 * (or, 136 additions, 58 multiplications, 140 fused multiply/add),
Chris@10 33 * 142 stack variables, 4 constants, and 80 memory accesses
Chris@10 34 */
Chris@10 35 #include "t.h"
Chris@10 36
/*
 * t2_20, HAVE_FMA variant (machine-generated by genfft; see the
 * "Generated by" comment earlier in this branch for the exact flags).
 *
 * For every m in [mb, me) the loop body:
 *   - reads the eight values W[0]..W[7] and derives every other multiplier
 *     it needs from them by products and FMA/FNMS combinations;
 *   - reads ri[WS(rs, k)] and ii[WS(rs, k)] once each for k = 0..19;
 *   - writes ri[WS(rs, k)] and ii[WS(rs, k)] once each for k = 0..19,
 *     i.e. the transform is performed in place.
 * Between iterations ri and ii advance by ms and W advances by 8; W is
 * offset by mb * 8 before the first iteration.
 *
 * Parameters:
 *   ri, ii  - arrays updated in place (NOTE(review): presumably the real and
 *             imaginary parts of the data -- confirm against codelet-dft.h)
 *   W       - multiplier table, 8 values consumed per iteration
 *   rs      - stride passed to WS() to address element k
 *   mb, me  - half-open range of loop iterations
 *   ms      - per-iteration increment applied to ri and ii
 *
 * E, R, INT, stride, DK, WS, FMA, FNMS and MAKE_VOLATILE_STRIDE are supplied
 * by the included project headers and are not defined in this file.
 */
Chris@10 37 static void t2_20(R *ri, R *ii, const R *W, stride rs, INT mb, INT me, INT ms)
Chris@10 38 {
Chris@10 39 DK(KP951056516, +0.951056516295153572116439333379382143405698634);
Chris@10 40 DK(KP559016994, +0.559016994374947424102293417182819058860154590);
Chris@10 41 DK(KP250000000, +0.250000000000000000000000000000000000000000000);
Chris@10 42 DK(KP618033988, +0.618033988749894848204586834365638117720309180);
Chris@10 43 {
Chris@10 44 INT m;
Chris@10 45 for (m = mb, W = W + (mb * 8); m < me; m = m + 1, ri = ri + ms, ii = ii + ms, W = W + 8, MAKE_VOLATILE_STRIDE(40, rs)) {
Chris@10 46 E T59, T5i, T5k, T5e, T5c, T5d, T5j, T5f;
Chris@10 47 {
Chris@10 48 E T2, Th, Tf, T6, T5, Tl, T1p, T1n, Ti, T3, Tt, Tv, T24, T1f, T1D;
Chris@10 49 E Tb, T1P, Tm, T21, T1b, T7, T1A, Tw, T1H, T13, TA, T1L, T17, T1S, Tq;
Chris@10 50 E T1o, T2g, T1t, T2c, TO, TK;
Chris@10 51 {
Chris@10 52 E T1e, Ta, Tk, Tg;
/* The only table reads are W[0]..W[7] (here and a few lines below); the
 * remaining multipliers in this scope are products / FMA / FNMS
 * combinations of those eight values. */
Chris@10 53 T2 = W[0];
Chris@10 54 Th = W[3];
Chris@10 55 Tf = W[2];
Chris@10 56 T6 = W[5];
Chris@10 57 T5 = W[1];
Chris@10 58 Tk = T2 * Th;
Chris@10 59 Tg = T2 * Tf;
Chris@10 60 T1e = Tf * T6;
Chris@10 61 Ta = T2 * T6;
Chris@10 62 Tl = FMA(T5, Tf, Tk);
Chris@10 63 T1p = FNMS(T5, Tf, Tk);
Chris@10 64 T1n = FMA(T5, Th, Tg);
Chris@10 65 Ti = FNMS(T5, Th, Tg);
Chris@10 66 T3 = W[4];
Chris@10 67 Tt = W[6];
Chris@10 68 Tv = W[7];
Chris@10 69 {
Chris@10 70 E Tp, Tj, TN, TJ;
Chris@10 71 Tp = Ti * T6;
Chris@10 72 T24 = FMA(Th, T3, T1e);
Chris@10 73 T1f = FNMS(Th, T3, T1e);
Chris@10 74 T1D = FNMS(T5, T3, Ta);
Chris@10 75 Tb = FMA(T5, T3, Ta);
Chris@10 76 Tj = Ti * T3;
Chris@10 77 {
Chris@10 78 E T1a, T4, Tu, T1G;
Chris@10 79 T1a = Tf * T3;
Chris@10 80 T4 = T2 * T3;
Chris@10 81 Tu = Ti * Tt;
Chris@10 82 T1G = T2 * Tt;
Chris@10 83 {
Chris@10 84 E T12, Tz, T1K, T16;
Chris@10 85 T12 = Tf * Tt;
Chris@10 86 Tz = Ti * Tv;
Chris@10 87 T1K = T2 * Tv;
Chris@10 88 T16 = Tf * Tv;
Chris@10 89 T1P = FNMS(Tl, T6, Tj);
Chris@10 90 Tm = FMA(Tl, T6, Tj);
Chris@10 91 T21 = FNMS(Th, T6, T1a);
Chris@10 92 T1b = FMA(Th, T6, T1a);
Chris@10 93 T7 = FNMS(T5, T6, T4);
Chris@10 94 T1A = FMA(T5, T6, T4);
Chris@10 95 Tw = FMA(Tl, Tv, Tu);
Chris@10 96 T1H = FMA(T5, Tv, T1G);
Chris@10 97 T13 = FMA(Th, Tv, T12);
Chris@10 98 TA = FNMS(Tl, Tt, Tz);
Chris@10 99 T1L = FNMS(T5, Tt, T1K);
Chris@10 100 T17 = FNMS(Th, Tt, T16);
Chris@10 101 T1S = FMA(Tl, T3, Tp);
Chris@10 102 Tq = FNMS(Tl, T3, Tp);
Chris@10 103 }
Chris@10 104 }
Chris@10 105 T1o = T1n * T3;
Chris@10 106 T2g = T1n * Tv;
Chris@10 107 TN = Tm * Tv;
Chris@10 108 TJ = Tm * Tt;
Chris@10 109 T1t = T1n * T6;
Chris@10 110 T2c = T1n * Tt;
Chris@10 111 TO = FNMS(Tq, Tt, TN);
Chris@10 112 TK = FMA(Tq, Tv, TJ);
Chris@10 113 }
Chris@10 114 }
Chris@10 115 {
Chris@10 116 E Te, T2C, T4L, T57, T58, TD, T2H, T4H, T3C, T3Z, T11, T2v, T2P, T3P, T4k;
Chris@10 117 E T4v, T3u, T43, T2r, T2z, T3b, T3T, T4g, T4z, T3n, T42, T20, T2y, T34, T3S;
Chris@10 118 E T4d, T4y, T1c, T19, T1d, T3E, T1w, T2U, T1g, T1j, T1l;
Chris@10 119 {
Chris@10 120 E T2d, T2h, T2k, T1q, T1u, T2n, TL, TI, TM, T3x, TZ, T2N, TP, TS, TU;
Chris@10 121 {
Chris@10 122 E T1, T4K, T8, T9, Tc;
/* Element 0 (T1 / T4K) is used as loaded, with no multiplier applied.
 * Every other ri/ii[WS(rs, k)] pair is combined with a pair of the
 * multipliers computed above via a product followed by FMA / FNMS. */
Chris@10 123 T1 = ri[0];
Chris@10 124 T4K = ii[0];
Chris@10 125 T8 = ri[WS(rs, 10)];
Chris@10 126 T2d = FMA(T1p, Tv, T2c);
Chris@10 127 T2h = FNMS(T1p, Tt, T2g);
Chris@10 128 T2k = FMA(T1p, T6, T1o);
Chris@10 129 T1q = FNMS(T1p, T6, T1o);
Chris@10 130 T1u = FMA(T1p, T3, T1t);
Chris@10 131 T2n = FNMS(T1p, T3, T1t);
Chris@10 132 T9 = T7 * T8;
Chris@10 133 Tc = ii[WS(rs, 10)];
Chris@10 134 {
Chris@10 135 E Tx, Ts, T2F, TC, T2E;
Chris@10 136 {
Chris@10 137 E Tn, Tr, To, T2D, T4J, Ty, TB, Td, T4I;
Chris@10 138 Tn = ri[WS(rs, 5)];
Chris@10 139 Tr = ii[WS(rs, 5)];
Chris@10 140 Tx = ri[WS(rs, 15)];
Chris@10 141 Td = FMA(Tb, Tc, T9);
Chris@10 142 T4I = T7 * Tc;
Chris@10 143 To = Tm * Tn;
Chris@10 144 T2D = Tm * Tr;
Chris@10 145 Te = T1 + Td;
Chris@10 146 T2C = T1 - Td;
Chris@10 147 T4J = FNMS(Tb, T8, T4I);
Chris@10 148 Ty = Tw * Tx;
Chris@10 149 TB = ii[WS(rs, 15)];
Chris@10 150 Ts = FMA(Tq, Tr, To);
Chris@10 151 T4L = T4J + T4K;
Chris@10 152 T57 = T4K - T4J;
Chris@10 153 T2F = Tw * TB;
Chris@10 154 TC = FMA(TA, TB, Ty);
Chris@10 155 T2E = FNMS(Tq, Tn, T2D);
Chris@10 156 }
Chris@10 157 {
Chris@10 158 E TF, TG, TH, TW, TY, T2G, T3w, TX, T2M;
Chris@10 159 TF = ri[WS(rs, 4)];
Chris@10 160 T2G = FNMS(TA, Tx, T2F);
Chris@10 161 T58 = Ts - TC;
Chris@10 162 TD = Ts + TC;
Chris@10 163 TG = Ti * TF;
Chris@10 164 T2H = T2E - T2G;
Chris@10 165 T4H = T2E + T2G;
Chris@10 166 TH = ii[WS(rs, 4)];
Chris@10 167 TW = ri[WS(rs, 19)];
Chris@10 168 TY = ii[WS(rs, 19)];
Chris@10 169 TL = ri[WS(rs, 14)];
Chris@10 170 TI = FMA(Tl, TH, TG);
Chris@10 171 T3w = Ti * TH;
Chris@10 172 TX = Tt * TW;
Chris@10 173 T2M = Tt * TY;
Chris@10 174 TM = TK * TL;
Chris@10 175 T3x = FNMS(Tl, TF, T3w);
Chris@10 176 TZ = FMA(Tv, TY, TX);
Chris@10 177 T2N = FNMS(Tv, TW, T2M);
Chris@10 178 TP = ii[WS(rs, 14)];
Chris@10 179 TS = ri[WS(rs, 9)];
Chris@10 180 TU = ii[WS(rs, 9)];
Chris@10 181 }
Chris@10 182 }
Chris@10 183 }
Chris@10 184 {
Chris@10 185 E T27, T26, T28, T3p, T2p, T39, T29, T2e, T2i;
Chris@10 186 {
Chris@10 187 E T22, T23, T25, T2l, T2o, T3o, T2m, T38;
Chris@10 188 {
Chris@10 189 E TR, T2J, T3z, TV, T2L, T4i, T3A;
Chris@10 190 T22 = ri[WS(rs, 12)];
Chris@10 191 {
Chris@10 192 E TQ, T3y, TT, T2K;
Chris@10 193 TQ = FMA(TO, TP, TM);
Chris@10 194 T3y = TK * TP;
Chris@10 195 TT = T3 * TS;
Chris@10 196 T2K = T3 * TU;
Chris@10 197 TR = TI + TQ;
Chris@10 198 T2J = TI - TQ;
Chris@10 199 T3z = FNMS(TO, TL, T3y);
Chris@10 200 TV = FMA(T6, TU, TT);
Chris@10 201 T2L = FNMS(T6, TS, T2K);
Chris@10 202 T23 = T21 * T22;
Chris@10 203 }
Chris@10 204 T4i = T3x + T3z;
Chris@10 205 T3A = T3x - T3z;
Chris@10 206 {
Chris@10 207 E T10, T3B, T4j, T2O;
Chris@10 208 T10 = TV + TZ;
Chris@10 209 T3B = TV - TZ;
Chris@10 210 T4j = T2L + T2N;
Chris@10 211 T2O = T2L - T2N;
Chris@10 212 T3C = T3A + T3B;
Chris@10 213 T3Z = T3A - T3B;
Chris@10 214 T11 = TR - T10;
Chris@10 215 T2v = TR + T10;
Chris@10 216 T2P = T2J - T2O;
Chris@10 217 T3P = T2J + T2O;
Chris@10 218 T4k = T4i - T4j;
Chris@10 219 T4v = T4i + T4j;
Chris@10 220 T25 = ii[WS(rs, 12)];
Chris@10 221 }
Chris@10 222 }
Chris@10 223 T2l = ri[WS(rs, 7)];
Chris@10 224 T2o = ii[WS(rs, 7)];
Chris@10 225 T27 = ri[WS(rs, 2)];
Chris@10 226 T26 = FMA(T24, T25, T23);
Chris@10 227 T3o = T21 * T25;
Chris@10 228 T2m = T2k * T2l;
Chris@10 229 T38 = T2k * T2o;
Chris@10 230 T28 = T1n * T27;
Chris@10 231 T3p = FNMS(T24, T22, T3o);
Chris@10 232 T2p = FMA(T2n, T2o, T2m);
Chris@10 233 T39 = FNMS(T2n, T2l, T38);
Chris@10 234 T29 = ii[WS(rs, 2)];
Chris@10 235 T2e = ri[WS(rs, 17)];
Chris@10 236 T2i = ii[WS(rs, 17)];
Chris@10 237 }
Chris@10 238 {
Chris@10 239 E T1I, T1F, T1J, T3i, T1Y, T32, T1M, T1Q, T1T;
Chris@10 240 {
Chris@10 241 E T1B, T1C, T1E, T1V, T1X, T3h, T1W, T31;
Chris@10 242 {
Chris@10 243 E T2b, T35, T3r, T2j, T37, T4e, T3s;
Chris@10 244 T1B = ri[WS(rs, 8)];
Chris@10 245 {
Chris@10 246 E T2a, T3q, T2f, T36;
Chris@10 247 T2a = FMA(T1p, T29, T28);
Chris@10 248 T3q = T1n * T29;
Chris@10 249 T2f = T2d * T2e;
Chris@10 250 T36 = T2d * T2i;
Chris@10 251 T2b = T26 + T2a;
Chris@10 252 T35 = T26 - T2a;
Chris@10 253 T3r = FNMS(T1p, T27, T3q);
Chris@10 254 T2j = FMA(T2h, T2i, T2f);
Chris@10 255 T37 = FNMS(T2h, T2e, T36);
Chris@10 256 T1C = T1A * T1B;
Chris@10 257 }
Chris@10 258 T4e = T3p + T3r;
Chris@10 259 T3s = T3p - T3r;
Chris@10 260 {
Chris@10 261 E T2q, T3t, T4f, T3a;
Chris@10 262 T2q = T2j + T2p;
Chris@10 263 T3t = T2j - T2p;
Chris@10 264 T4f = T37 + T39;
Chris@10 265 T3a = T37 - T39;
Chris@10 266 T3u = T3s + T3t;
Chris@10 267 T43 = T3s - T3t;
Chris@10 268 T2r = T2b - T2q;
Chris@10 269 T2z = T2b + T2q;
Chris@10 270 T3b = T35 - T3a;
Chris@10 271 T3T = T35 + T3a;
Chris@10 272 T4g = T4e - T4f;
Chris@10 273 T4z = T4e + T4f;
Chris@10 274 T1E = ii[WS(rs, 8)];
Chris@10 275 }
Chris@10 276 }
Chris@10 277 T1V = ri[WS(rs, 3)];
Chris@10 278 T1X = ii[WS(rs, 3)];
Chris@10 279 T1I = ri[WS(rs, 18)];
Chris@10 280 T1F = FMA(T1D, T1E, T1C);
Chris@10 281 T3h = T1A * T1E;
Chris@10 282 T1W = Tf * T1V;
Chris@10 283 T31 = Tf * T1X;
Chris@10 284 T1J = T1H * T1I;
Chris@10 285 T3i = FNMS(T1D, T1B, T3h);
Chris@10 286 T1Y = FMA(Th, T1X, T1W);
Chris@10 287 T32 = FNMS(Th, T1V, T31);
Chris@10 288 T1M = ii[WS(rs, 18)];
Chris@10 289 T1Q = ri[WS(rs, 13)];
Chris@10 290 T1T = ii[WS(rs, 13)];
Chris@10 291 }
Chris@10 292 {
Chris@10 293 E T14, T15, T18, T1r, T1v, T3D, T1s, T2T;
Chris@10 294 {
Chris@10 295 E T1O, T2Y, T3k, T1U, T30, T4b, T3l;
Chris@10 296 T14 = ri[WS(rs, 16)];
Chris@10 297 {
Chris@10 298 E T1N, T3j, T1R, T2Z;
Chris@10 299 T1N = FMA(T1L, T1M, T1J);
Chris@10 300 T3j = T1H * T1M;
Chris@10 301 T1R = T1P * T1Q;
Chris@10 302 T2Z = T1P * T1T;
Chris@10 303 T1O = T1F + T1N;
Chris@10 304 T2Y = T1F - T1N;
Chris@10 305 T3k = FNMS(T1L, T1I, T3j);
Chris@10 306 T1U = FMA(T1S, T1T, T1R);
Chris@10 307 T30 = FNMS(T1S, T1Q, T2Z);
Chris@10 308 T15 = T13 * T14;
Chris@10 309 }
Chris@10 310 T4b = T3i + T3k;
Chris@10 311 T3l = T3i - T3k;
Chris@10 312 {
Chris@10 313 E T1Z, T3m, T4c, T33;
Chris@10 314 T1Z = T1U + T1Y;
Chris@10 315 T3m = T1U - T1Y;
Chris@10 316 T4c = T30 + T32;
Chris@10 317 T33 = T30 - T32;
Chris@10 318 T3n = T3l + T3m;
Chris@10 319 T42 = T3l - T3m;
Chris@10 320 T20 = T1O - T1Z;
Chris@10 321 T2y = T1O + T1Z;
Chris@10 322 T34 = T2Y - T33;
Chris@10 323 T3S = T2Y + T33;
Chris@10 324 T4d = T4b - T4c;
Chris@10 325 T4y = T4b + T4c;
Chris@10 326 T18 = ii[WS(rs, 16)];
Chris@10 327 }
Chris@10 328 }
Chris@10 329 T1r = ri[WS(rs, 11)];
Chris@10 330 T1v = ii[WS(rs, 11)];
Chris@10 331 T1c = ri[WS(rs, 6)];
Chris@10 332 T19 = FMA(T17, T18, T15);
Chris@10 333 T3D = T13 * T18;
Chris@10 334 T1s = T1q * T1r;
Chris@10 335 T2T = T1q * T1v;
Chris@10 336 T1d = T1b * T1c;
Chris@10 337 T3E = FNMS(T17, T14, T3D);
Chris@10 338 T1w = FMA(T1u, T1v, T1s);
Chris@10 339 T2U = FNMS(T1u, T1r, T2T);
Chris@10 340 T1g = ii[WS(rs, 6)];
Chris@10 341 T1j = ri[WS(rs, 1)];
Chris@10 342 T1l = ii[WS(rs, 1)];
Chris@10 343 }
Chris@10 344 }
Chris@10 345 }
Chris@10 346 }
Chris@10 347 {
Chris@10 348 E T3J, T40, T2W, T3Q, T4M, T4E, T4F, T4U, T4S;
Chris@10 349 {
Chris@10 350 E T4X, T2u, T2w, T4w, T4W, T4r, T4p, T54, T56, T4V, T4a, T4q;
Chris@10 351 {
Chris@10 352 E T4h, TE, T4n, T53, T1z, T2s, T52;
Chris@10 353 {
Chris@10 354 E T1i, T2Q, T3G, T1m, T2S, T4l, T3H;
Chris@10 355 T4h = T4d - T4g;
Chris@10 356 T4X = T4d + T4g;
Chris@10 357 {
Chris@10 358 E T1h, T3F, T1k, T2R;
Chris@10 359 T1h = FMA(T1f, T1g, T1d);
Chris@10 360 T3F = T1b * T1g;
Chris@10 361 T1k = T2 * T1j;
Chris@10 362 T2R = T2 * T1l;
Chris@10 363 T1i = T19 + T1h;
Chris@10 364 T2Q = T19 - T1h;
Chris@10 365 T3G = FNMS(T1f, T1c, T3F);
Chris@10 366 T1m = FMA(T5, T1l, T1k);
Chris@10 367 T2S = FNMS(T5, T1j, T2R);
Chris@10 368 }
Chris@10 369 TE = Te - TD;
Chris@10 370 T2u = Te + TD;
Chris@10 371 T4l = T3E + T3G;
Chris@10 372 T3H = T3E - T3G;
Chris@10 373 {
Chris@10 374 E T1x, T3I, T4m, T2V, T1y;
Chris@10 375 T1x = T1m + T1w;
Chris@10 376 T3I = T1m - T1w;
Chris@10 377 T4m = T2S + T2U;
Chris@10 378 T2V = T2S - T2U;
Chris@10 379 T3J = T3H + T3I;
Chris@10 380 T40 = T3H - T3I;
Chris@10 381 T1y = T1i - T1x;
Chris@10 382 T2w = T1i + T1x;
Chris@10 383 T2W = T2Q - T2V;
Chris@10 384 T3Q = T2Q + T2V;
Chris@10 385 T4n = T4l - T4m;
Chris@10 386 T4w = T4l + T4m;
Chris@10 387 T53 = T11 - T1y;
Chris@10 388 T1z = T11 + T1y;
Chris@10 389 T2s = T20 + T2r;
Chris@10 390 T52 = T20 - T2r;
Chris@10 391 }
Chris@10 392 }
Chris@10 393 {
Chris@10 394 E T49, T48, T4o, T2t;
Chris@10 395 T4o = T4k - T4n;
Chris@10 396 T4W = T4k + T4n;
Chris@10 397 T49 = T1z - T2s;
Chris@10 398 T2t = T1z + T2s;
Chris@10 399 T4r = FMA(KP618033988, T4h, T4o);
Chris@10 400 T4p = FNMS(KP618033988, T4o, T4h);
Chris@10 401 T54 = FNMS(KP618033988, T53, T52);
Chris@10 402 T56 = FMA(KP618033988, T52, T53);
/* From here on results are stored back into the same ri/ii slots that
 * were read earlier in this iteration; all loads precede the first store. */
Chris@10 403 ri[WS(rs, 10)] = TE + T2t;
Chris@10 404 T48 = FNMS(KP250000000, T2t, TE);
Chris@10 405 T4V = T4L - T4H;
Chris@10 406 T4M = T4H + T4L;
Chris@10 407 T4a = FNMS(KP559016994, T49, T48);
Chris@10 408 T4q = FMA(KP559016994, T49, T48);
Chris@10 409 }
Chris@10 410 }
Chris@10 411 {
Chris@10 412 E T2x, T4Q, T4B, T4D, T4R, T2A, T51, T55;
Chris@10 413 {
Chris@10 414 E T4x, T50, T4Y, T4A, T4Z;
Chris@10 415 T4E = T4v + T4w;
Chris@10 416 T4x = T4v - T4w;
Chris@10 417 ri[WS(rs, 18)] = FMA(KP951056516, T4p, T4a);
Chris@10 418 ri[WS(rs, 2)] = FNMS(KP951056516, T4p, T4a);
Chris@10 419 ri[WS(rs, 6)] = FMA(KP951056516, T4r, T4q);
Chris@10 420 ri[WS(rs, 14)] = FNMS(KP951056516, T4r, T4q);
Chris@10 421 T50 = T4W - T4X;
Chris@10 422 T4Y = T4W + T4X;
Chris@10 423 T4A = T4y - T4z;
Chris@10 424 T4F = T4y + T4z;
Chris@10 425 T2x = T2v + T2w;
Chris@10 426 T4Q = T2v - T2w;
Chris@10 427 ii[WS(rs, 10)] = T4Y + T4V;
Chris@10 428 T4Z = FNMS(KP250000000, T4Y, T4V);
Chris@10 429 T4B = FMA(KP618033988, T4A, T4x);
Chris@10 430 T4D = FNMS(KP618033988, T4x, T4A);
Chris@10 431 T4R = T2y - T2z;
Chris@10 432 T2A = T2y + T2z;
Chris@10 433 T51 = FNMS(KP559016994, T50, T4Z);
Chris@10 434 T55 = FMA(KP559016994, T50, T4Z);
Chris@10 435 }
Chris@10 436 {
Chris@10 437 E T4t, T4s, T2B, T4u, T4C;
Chris@10 438 T2B = T2x + T2A;
Chris@10 439 T4t = T2x - T2A;
Chris@10 440 ii[WS(rs, 18)] = FNMS(KP951056516, T54, T51);
Chris@10 441 ii[WS(rs, 2)] = FMA(KP951056516, T54, T51);
Chris@10 442 ii[WS(rs, 14)] = FMA(KP951056516, T56, T55);
Chris@10 443 ii[WS(rs, 6)] = FNMS(KP951056516, T56, T55);
Chris@10 444 ri[0] = T2u + T2B;
Chris@10 445 T4s = FNMS(KP250000000, T2B, T2u);
Chris@10 446 T4u = FMA(KP559016994, T4t, T4s);
Chris@10 447 T4C = FNMS(KP559016994, T4t, T4s);
Chris@10 448 T4U = FNMS(KP618033988, T4Q, T4R);
Chris@10 449 T4S = FMA(KP618033988, T4R, T4Q);
Chris@10 450 ri[WS(rs, 16)] = FMA(KP951056516, T4B, T4u);
Chris@10 451 ri[WS(rs, 4)] = FNMS(KP951056516, T4B, T4u);
Chris@10 452 ri[WS(rs, 8)] = FMA(KP951056516, T4D, T4C);
Chris@10 453 ri[WS(rs, 12)] = FNMS(KP951056516, T4D, T4C);
Chris@10 454 }
Chris@10 455 }
Chris@10 456 }
Chris@10 457 {
Chris@10 458 E T3O, T5u, T5w, T5l, T5q, T5o;
Chris@10 459 {
Chris@10 460 E T5n, T5m, T2I, T4O, T3N, T3L, T2X, T5t, T4N, T5s, T3c, T3v, T3K, T4G;
Chris@10 461 T5n = T3n + T3u;
Chris@10 462 T3v = T3n - T3u;
Chris@10 463 T3K = T3C - T3J;
Chris@10 464 T5m = T3C + T3J;
Chris@10 465 T3O = T2C + T2H;
Chris@10 466 T2I = T2C - T2H;
Chris@10 467 T4O = T4E - T4F;
Chris@10 468 T4G = T4E + T4F;
Chris@10 469 T3N = FMA(KP618033988, T3v, T3K);
Chris@10 470 T3L = FNMS(KP618033988, T3K, T3v);
Chris@10 471 T2X = T2P + T2W;
Chris@10 472 T5t = T2P - T2W;
Chris@10 473 ii[0] = T4G + T4M;
Chris@10 474 T4N = FNMS(KP250000000, T4G, T4M);
Chris@10 475 T5s = T34 - T3b;
Chris@10 476 T3c = T34 + T3b;
Chris@10 477 {
Chris@10 478 E T3f, T3e, T4P, T4T, T3d, T3M, T3g;
Chris@10 479 T4P = FMA(KP559016994, T4O, T4N);
Chris@10 480 T4T = FNMS(KP559016994, T4O, T4N);
Chris@10 481 T3f = T2X - T3c;
Chris@10 482 T3d = T2X + T3c;
Chris@10 483 ii[WS(rs, 16)] = FNMS(KP951056516, T4S, T4P);
Chris@10 484 ii[WS(rs, 4)] = FMA(KP951056516, T4S, T4P);
Chris@10 485 ii[WS(rs, 12)] = FMA(KP951056516, T4U, T4T);
Chris@10 486 ii[WS(rs, 8)] = FNMS(KP951056516, T4U, T4T);
Chris@10 487 ri[WS(rs, 15)] = T2I + T3d;
Chris@10 488 T3e = FNMS(KP250000000, T3d, T2I);
Chris@10 489 T5u = FNMS(KP618033988, T5t, T5s);
Chris@10 490 T5w = FMA(KP618033988, T5s, T5t);
Chris@10 491 T5l = T58 + T57;
Chris@10 492 T59 = T57 - T58;
Chris@10 493 T3M = FMA(KP559016994, T3f, T3e);
Chris@10 494 T3g = FNMS(KP559016994, T3f, T3e);
Chris@10 495 ri[WS(rs, 7)] = FNMS(KP951056516, T3L, T3g);
Chris@10 496 ri[WS(rs, 3)] = FMA(KP951056516, T3L, T3g);
Chris@10 497 ri[WS(rs, 19)] = FNMS(KP951056516, T3N, T3M);
Chris@10 498 ri[WS(rs, 11)] = FMA(KP951056516, T3N, T3M);
Chris@10 499 T5q = T5m - T5n;
Chris@10 500 T5o = T5m + T5n;
Chris@10 501 }
Chris@10 502 }
Chris@10 503 {
Chris@10 504 E T5a, T5b, T47, T45, T5g, T5h, T3V, T3X, T41, T44, T5p, T3W, T46, T3Y;
Chris@10 505 T5a = T3Z + T40;
Chris@10 506 T41 = T3Z - T40;
Chris@10 507 T44 = T42 - T43;
Chris@10 508 T5b = T42 + T43;
Chris@10 509 ii[WS(rs, 15)] = T5o + T5l;
Chris@10 510 T5p = FNMS(KP250000000, T5o, T5l);
Chris@10 511 T47 = FNMS(KP618033988, T41, T44);
Chris@10 512 T45 = FMA(KP618033988, T44, T41);
Chris@10 513 {
Chris@10 514 E T5r, T5v, T3R, T3U;
Chris@10 515 T5r = FNMS(KP559016994, T5q, T5p);
Chris@10 516 T5v = FMA(KP559016994, T5q, T5p);
Chris@10 517 T3R = T3P + T3Q;
Chris@10 518 T5g = T3P - T3Q;
Chris@10 519 T5h = T3S - T3T;
Chris@10 520 T3U = T3S + T3T;
Chris@10 521 ii[WS(rs, 7)] = FMA(KP951056516, T5u, T5r);
Chris@10 522 ii[WS(rs, 3)] = FNMS(KP951056516, T5u, T5r);
Chris@10 523 ii[WS(rs, 19)] = FMA(KP951056516, T5w, T5v);
Chris@10 524 ii[WS(rs, 11)] = FNMS(KP951056516, T5w, T5v);
Chris@10 525 T3V = T3R + T3U;
Chris@10 526 T3X = T3R - T3U;
Chris@10 527 }
Chris@10 528 ri[WS(rs, 5)] = T3O + T3V;
Chris@10 529 T3W = FNMS(KP250000000, T3V, T3O);
Chris@10 530 T5i = FMA(KP618033988, T5h, T5g);
Chris@10 531 T5k = FNMS(KP618033988, T5g, T5h);
Chris@10 532 T46 = FNMS(KP559016994, T3X, T3W);
Chris@10 533 T3Y = FMA(KP559016994, T3X, T3W);
Chris@10 534 ri[WS(rs, 9)] = FNMS(KP951056516, T45, T3Y);
Chris@10 535 ri[WS(rs, 1)] = FMA(KP951056516, T45, T3Y);
Chris@10 536 ri[WS(rs, 17)] = FNMS(KP951056516, T47, T46);
Chris@10 537 ri[WS(rs, 13)] = FMA(KP951056516, T47, T46);
Chris@10 538 T5e = T5a - T5b;
Chris@10 539 T5c = T5a + T5b;
Chris@10 540 }
Chris@10 541 }
Chris@10 542 }
Chris@10 543 }
Chris@10 544 }
/* T59, T5c, T5e, T5i and T5k were assigned inside the nested scopes above
 * but are declared at loop scope, so the last five ii stores can be issued
 * here after those scopes have closed. */
Chris@10 545 ii[WS(rs, 5)] = T5c + T59;
Chris@10 546 T5d = FNMS(KP250000000, T5c, T59);
Chris@10 547 T5j = FNMS(KP559016994, T5e, T5d);
Chris@10 548 T5f = FMA(KP559016994, T5e, T5d);
Chris@10 549 ii[WS(rs, 9)] = FMA(KP951056516, T5i, T5f);
Chris@10 550 ii[WS(rs, 1)] = FNMS(KP951056516, T5i, T5f);
Chris@10 551 ii[WS(rs, 17)] = FMA(KP951056516, T5k, T5j);
Chris@10 552 ii[WS(rs, 13)] = FNMS(KP951056516, T5k, T5j);
Chris@10 553 }
Chris@10 554 }
Chris@10 555 }
Chris@10 556
/*
 * Twiddle instruction list referenced by desc: four TW_CEXP entries whose
 * last field is 1, 3, 9 and 19, terminated by a TW_NEXT entry.
 * NOTE(review): presumably these four entries describe the four value pairs
 * that t2_20 reads as W[0]..W[7] -- confirm against the tw_instr definition.
 */
Chris@10 557 static const tw_instr twinstr[] = {
Chris@10 558 {TW_CEXP, 0, 1},
Chris@10 559 {TW_CEXP, 0, 3},
Chris@10 560 {TW_CEXP, 0, 9},
Chris@10 561 {TW_CEXP, 0, 19},
Chris@10 562 {TW_NEXT, 1, 0}
Chris@10 563 };
Chris@10 564
/*
 * Descriptor handed to the planner together with t2_20.  The leading 20 and
 * the name "t2_20" match the generator's -n / -name arguments, and
 * {136, 58, 140, 0} matches the addition / multiplication / fused
 * multiply-add counts quoted in the generated header comment of the FMA
 * variant.  The meaning of &GENUS and of the trailing zeros is defined by
 * ct_desc in the project headers, not in this file.
 */
Chris@10 565 static const ct_desc desc = { 20, "t2_20", twinstr, &GENUS, {136, 58, 140, 0}, 0, 0, 0 };
Chris@10 566
/*
 * Registration entry point for the HAVE_FMA build: passes the t2_20 kernel
 * and its descriptor to X(kdft_dit_register) for planner p.
 */
Chris@10 567 void X(codelet_t2_20) (planner *p) {
Chris@10 568 X(kdft_dit_register) (p, t2_20, &desc);
Chris@10 569 }
Chris@10 570 #else /* HAVE_FMA */
Chris@10 571
Chris@10 572 /* Generated by: ../../../genfft/gen_twiddle.native -compact -variables 4 -pipeline-latency 4 -twiddle-log3 -precompute-twiddles -n 20 -name t2_20 -include t.h */
Chris@10 573
Chris@10 574 /*
Chris@10 575 * This function contains 276 FP additions, 164 FP multiplications,
Chris@10 576 * (or, 204 additions, 92 multiplications, 72 fused multiply/add),
Chris@10 577 * 123 stack variables, 4 constants, and 80 memory accesses
Chris@10 578 */
Chris@10 579 #include "t.h"
Chris@10 580
Chris@10 581 static void t2_20(R *ri, R *ii, const R *W, stride rs, INT mb, INT me, INT ms)
Chris@10 582 {
Chris@10 583 DK(KP587785252, +0.587785252292473129168705954639072768597652438);
Chris@10 584 DK(KP951056516, +0.951056516295153572116439333379382143405698634);
Chris@10 585 DK(KP250000000, +0.250000000000000000000000000000000000000000000);
Chris@10 586 DK(KP559016994, +0.559016994374947424102293417182819058860154590);
Chris@10 587 {
Chris@10 588 INT m;
Chris@10 589 for (m = mb, W = W + (mb * 8); m < me; m = m + 1, ri = ri + ms, ii = ii + ms, W = W + 8, MAKE_VOLATILE_STRIDE(40, rs)) {
Chris@10 590 E T2, T5, Tg, Ti, Tk, To, T1h, T1f, T6, T3, T8, T14, T1Q, Tc, T1O;
Chris@10 591 E T1v, T18, T1t, T1n, T24, T1j, T22, Tq, Tu, T1E, T1G, Tx, Ty, Tz, TJ;
Chris@10 592 E T1Z, TB, T1X, T1A, TZ, TL, T1y, TX;
Chris@10 593 {
Chris@10 594 E T7, T16, Ta, T13, T4, T17, Tb, T12;
Chris@10 595 {
Chris@10 596 E Th, Tn, Tj, Tm;
Chris@10 597 T2 = W[0];
Chris@10 598 T5 = W[1];
Chris@10 599 Tg = W[2];
Chris@10 600 Ti = W[3];
Chris@10 601 Th = T2 * Tg;
Chris@10 602 Tn = T5 * Tg;
Chris@10 603 Tj = T5 * Ti;
Chris@10 604 Tm = T2 * Ti;
Chris@10 605 Tk = Th - Tj;
Chris@10 606 To = Tm + Tn;
Chris@10 607 T1h = Tm - Tn;
Chris@10 608 T1f = Th + Tj;
Chris@10 609 T6 = W[5];
Chris@10 610 T7 = T5 * T6;
Chris@10 611 T16 = Tg * T6;
Chris@10 612 Ta = T2 * T6;
Chris@10 613 T13 = Ti * T6;
Chris@10 614 T3 = W[4];
Chris@10 615 T4 = T2 * T3;
Chris@10 616 T17 = Ti * T3;
Chris@10 617 Tb = T5 * T3;
Chris@10 618 T12 = Tg * T3;
Chris@10 619 }
Chris@10 620 T8 = T4 - T7;
Chris@10 621 T14 = T12 + T13;
Chris@10 622 T1Q = T16 + T17;
Chris@10 623 Tc = Ta + Tb;
Chris@10 624 T1O = T12 - T13;
Chris@10 625 T1v = Ta - Tb;
Chris@10 626 T18 = T16 - T17;
Chris@10 627 T1t = T4 + T7;
Chris@10 628 {
Chris@10 629 E T1l, T1m, T1g, T1i;
Chris@10 630 T1l = T1f * T6;
Chris@10 631 T1m = T1h * T3;
Chris@10 632 T1n = T1l + T1m;
Chris@10 633 T24 = T1l - T1m;
Chris@10 634 T1g = T1f * T3;
Chris@10 635 T1i = T1h * T6;
Chris@10 636 T1j = T1g - T1i;
Chris@10 637 T22 = T1g + T1i;
Chris@10 638 {
Chris@10 639 E Tl, Tp, Ts, Tt;
Chris@10 640 Tl = Tk * T3;
Chris@10 641 Tp = To * T6;
Chris@10 642 Tq = Tl + Tp;
Chris@10 643 Ts = Tk * T6;
Chris@10 644 Tt = To * T3;
Chris@10 645 Tu = Ts - Tt;
Chris@10 646 T1E = Tl - Tp;
Chris@10 647 T1G = Ts + Tt;
Chris@10 648 Tx = W[6];
Chris@10 649 Ty = W[7];
Chris@10 650 Tz = FMA(Tk, Tx, To * Ty);
Chris@10 651 TJ = FMA(Tq, Tx, Tu * Ty);
Chris@10 652 T1Z = FNMS(T1h, Tx, T1f * Ty);
Chris@10 653 TB = FNMS(To, Tx, Tk * Ty);
Chris@10 654 T1X = FMA(T1f, Tx, T1h * Ty);
Chris@10 655 T1A = FNMS(T5, Tx, T2 * Ty);
Chris@10 656 TZ = FNMS(Ti, Tx, Tg * Ty);
Chris@10 657 TL = FNMS(Tu, Tx, Tq * Ty);
Chris@10 658 T1y = FMA(T2, Tx, T5 * Ty);
Chris@10 659 TX = FMA(Tg, Tx, Ti * Ty);
Chris@10 660 }
Chris@10 661 }
Chris@10 662 }
Chris@10 663 {
Chris@10 664 E TF, T2b, T4A, T4J, T2K, T3r, T4a, T4m, T1N, T28, T29, T3C, T3F, T4o, T3X;
Chris@10 665 E T3Y, T44, T2f, T2g, T2h, T2n, T2s, T4L, T3g, T3h, T4w, T3n, T3o, T3p, T30;
Chris@10 666 E T35, T36, TW, T1r, T1s, T3J, T3M, T4n, T3U, T3V, T43, T2c, T2d, T2e, T2y;
Chris@10 667 E T2D, T4K, T3d, T3e, T4v, T3k, T3l, T3m, T2P, T2U, T2V;
Chris@10 668 {
Chris@10 669 E T1, T48, Te, T47, Tw, T2H, TD, T2I, T9, Td;
Chris@10 670 T1 = ri[0];
Chris@10 671 T48 = ii[0];
Chris@10 672 T9 = ri[WS(rs, 10)];
Chris@10 673 Td = ii[WS(rs, 10)];
Chris@10 674 Te = FMA(T8, T9, Tc * Td);
Chris@10 675 T47 = FNMS(Tc, T9, T8 * Td);
Chris@10 676 {
Chris@10 677 E Tr, Tv, TA, TC;
Chris@10 678 Tr = ri[WS(rs, 5)];
Chris@10 679 Tv = ii[WS(rs, 5)];
Chris@10 680 Tw = FMA(Tq, Tr, Tu * Tv);
Chris@10 681 T2H = FNMS(Tu, Tr, Tq * Tv);
Chris@10 682 TA = ri[WS(rs, 15)];
Chris@10 683 TC = ii[WS(rs, 15)];
Chris@10 684 TD = FMA(Tz, TA, TB * TC);
Chris@10 685 T2I = FNMS(TB, TA, Tz * TC);
Chris@10 686 }
Chris@10 687 {
Chris@10 688 E Tf, TE, T4y, T4z;
Chris@10 689 Tf = T1 + Te;
Chris@10 690 TE = Tw + TD;
Chris@10 691 TF = Tf - TE;
Chris@10 692 T2b = Tf + TE;
Chris@10 693 T4y = T48 - T47;
Chris@10 694 T4z = Tw - TD;
Chris@10 695 T4A = T4y - T4z;
Chris@10 696 T4J = T4z + T4y;
Chris@10 697 }
Chris@10 698 {
Chris@10 699 E T2G, T2J, T46, T49;
Chris@10 700 T2G = T1 - Te;
Chris@10 701 T2J = T2H - T2I;
Chris@10 702 T2K = T2G - T2J;
Chris@10 703 T3r = T2G + T2J;
Chris@10 704 T46 = T2H + T2I;
Chris@10 705 T49 = T47 + T48;
Chris@10 706 T4a = T46 + T49;
Chris@10 707 T4m = T49 - T46;
Chris@10 708 }
Chris@10 709 }
Chris@10 710 {
Chris@10 711 E T1D, T3A, T2l, T2W, T27, T3E, T2r, T34, T1M, T3B, T2m, T2Z, T1W, T3D, T2q;
Chris@10 712 E T31;
Chris@10 713 {
Chris@10 714 E T1x, T2j, T1C, T2k;
Chris@10 715 {
Chris@10 716 E T1u, T1w, T1z, T1B;
Chris@10 717 T1u = ri[WS(rs, 8)];
Chris@10 718 T1w = ii[WS(rs, 8)];
Chris@10 719 T1x = FMA(T1t, T1u, T1v * T1w);
Chris@10 720 T2j = FNMS(T1v, T1u, T1t * T1w);
Chris@10 721 T1z = ri[WS(rs, 18)];
Chris@10 722 T1B = ii[WS(rs, 18)];
Chris@10 723 T1C = FMA(T1y, T1z, T1A * T1B);
Chris@10 724 T2k = FNMS(T1A, T1z, T1y * T1B);
Chris@10 725 }
Chris@10 726 T1D = T1x + T1C;
Chris@10 727 T3A = T2j + T2k;
Chris@10 728 T2l = T2j - T2k;
Chris@10 729 T2W = T1x - T1C;
Chris@10 730 }
Chris@10 731 {
Chris@10 732 E T21, T32, T26, T33;
Chris@10 733 {
Chris@10 734 E T1Y, T20, T23, T25;
Chris@10 735 T1Y = ri[WS(rs, 17)];
Chris@10 736 T20 = ii[WS(rs, 17)];
Chris@10 737 T21 = FMA(T1X, T1Y, T1Z * T20);
Chris@10 738 T32 = FNMS(T1Z, T1Y, T1X * T20);
Chris@10 739 T23 = ri[WS(rs, 7)];
Chris@10 740 T25 = ii[WS(rs, 7)];
Chris@10 741 T26 = FMA(T22, T23, T24 * T25);
Chris@10 742 T33 = FNMS(T24, T23, T22 * T25);
Chris@10 743 }
Chris@10 744 T27 = T21 + T26;
Chris@10 745 T3E = T32 + T33;
Chris@10 746 T2r = T21 - T26;
Chris@10 747 T34 = T32 - T33;
Chris@10 748 }
Chris@10 749 {
Chris@10 750 E T1I, T2X, T1L, T2Y;
Chris@10 751 {
Chris@10 752 E T1F, T1H, T1J, T1K;
Chris@10 753 T1F = ri[WS(rs, 13)];
Chris@10 754 T1H = ii[WS(rs, 13)];
Chris@10 755 T1I = FMA(T1E, T1F, T1G * T1H);
Chris@10 756 T2X = FNMS(T1G, T1F, T1E * T1H);
Chris@10 757 T1J = ri[WS(rs, 3)];
Chris@10 758 T1K = ii[WS(rs, 3)];
Chris@10 759 T1L = FMA(Tg, T1J, Ti * T1K);
Chris@10 760 T2Y = FNMS(Ti, T1J, Tg * T1K);
Chris@10 761 }
Chris@10 762 T1M = T1I + T1L;
Chris@10 763 T3B = T2X + T2Y;
Chris@10 764 T2m = T1I - T1L;
Chris@10 765 T2Z = T2X - T2Y;
Chris@10 766 }
Chris@10 767 {
Chris@10 768 E T1S, T2o, T1V, T2p;
Chris@10 769 {
Chris@10 770 E T1P, T1R, T1T, T1U;
Chris@10 771 T1P = ri[WS(rs, 12)];
Chris@10 772 T1R = ii[WS(rs, 12)];
Chris@10 773 T1S = FMA(T1O, T1P, T1Q * T1R);
Chris@10 774 T2o = FNMS(T1Q, T1P, T1O * T1R);
Chris@10 775 T1T = ri[WS(rs, 2)];
Chris@10 776 T1U = ii[WS(rs, 2)];
Chris@10 777 T1V = FMA(T1f, T1T, T1h * T1U);
Chris@10 778 T2p = FNMS(T1h, T1T, T1f * T1U);
Chris@10 779 }
Chris@10 780 T1W = T1S + T1V;
Chris@10 781 T3D = T2o + T2p;
Chris@10 782 T2q = T2o - T2p;
Chris@10 783 T31 = T1S - T1V;
Chris@10 784 }
Chris@10 785 T1N = T1D - T1M;
Chris@10 786 T28 = T1W - T27;
Chris@10 787 T29 = T1N + T28;
Chris@10 788 T3C = T3A - T3B;
Chris@10 789 T3F = T3D - T3E;
Chris@10 790 T4o = T3C + T3F;
Chris@10 791 T3X = T3A + T3B;
Chris@10 792 T3Y = T3D + T3E;
Chris@10 793 T44 = T3X + T3Y;
Chris@10 794 T2f = T1D + T1M;
Chris@10 795 T2g = T1W + T27;
Chris@10 796 T2h = T2f + T2g;
Chris@10 797 T2n = T2l + T2m;
Chris@10 798 T2s = T2q + T2r;
Chris@10 799 T4L = T2n + T2s;
Chris@10 800 T3g = T2l - T2m;
Chris@10 801 T3h = T2q - T2r;
Chris@10 802 T4w = T3g + T3h;
Chris@10 803 T3n = T2W + T2Z;
Chris@10 804 T3o = T31 + T34;
Chris@10 805 T3p = T3n + T3o;
Chris@10 806 T30 = T2W - T2Z;
Chris@10 807 T35 = T31 - T34;
Chris@10 808 T36 = T30 + T35;
Chris@10 809 }
Chris@10 810 {
Chris@10 811 E TO, T3H, T2w, T2L, T1q, T3L, T2C, T2T, TV, T3I, T2x, T2O, T1b, T3K, T2B;
Chris@10 812 E T2Q;
Chris@10 813 {
Chris@10 814 E TI, T2u, TN, T2v;
Chris@10 815 {
Chris@10 816 E TG, TH, TK, TM;
Chris@10 817 TG = ri[WS(rs, 4)];
Chris@10 818 TH = ii[WS(rs, 4)];
Chris@10 819 TI = FMA(Tk, TG, To * TH);
Chris@10 820 T2u = FNMS(To, TG, Tk * TH);
Chris@10 821 TK = ri[WS(rs, 14)];
Chris@10 822 TM = ii[WS(rs, 14)];
Chris@10 823 TN = FMA(TJ, TK, TL * TM);
Chris@10 824 T2v = FNMS(TL, TK, TJ * TM);
Chris@10 825 }
Chris@10 826 TO = TI + TN;
Chris@10 827 T3H = T2u + T2v;
Chris@10 828 T2w = T2u - T2v;
Chris@10 829 T2L = TI - TN;
Chris@10 830 }
Chris@10 831 {
Chris@10 832 E T1e, T2R, T1p, T2S;
Chris@10 833 {
Chris@10 834 E T1c, T1d, T1k, T1o;
Chris@10 835 T1c = ri[WS(rs, 1)];
Chris@10 836 T1d = ii[WS(rs, 1)];
Chris@10 837 T1e = FMA(T2, T1c, T5 * T1d);
Chris@10 838 T2R = FNMS(T5, T1c, T2 * T1d);
Chris@10 839 T1k = ri[WS(rs, 11)];
Chris@10 840 T1o = ii[WS(rs, 11)];
Chris@10 841 T1p = FMA(T1j, T1k, T1n * T1o);
Chris@10 842 T2S = FNMS(T1n, T1k, T1j * T1o);
Chris@10 843 }
Chris@10 844 T1q = T1e + T1p;
Chris@10 845 T3L = T2R + T2S;
Chris@10 846 T2C = T1e - T1p;
Chris@10 847 T2T = T2R - T2S;
Chris@10 848 }
Chris@10 849 {
Chris@10 850 E TR, T2M, TU, T2N;
Chris@10 851 {
Chris@10 852 E TP, TQ, TS, TT;
Chris@10 853 TP = ri[WS(rs, 9)];
Chris@10 854 TQ = ii[WS(rs, 9)];
Chris@10 855 TR = FMA(T3, TP, T6 * TQ);
Chris@10 856 T2M = FNMS(T6, TP, T3 * TQ);
Chris@10 857 TS = ri[WS(rs, 19)];
Chris@10 858 TT = ii[WS(rs, 19)];
Chris@10 859 TU = FMA(Tx, TS, Ty * TT);
Chris@10 860 T2N = FNMS(Ty, TS, Tx * TT);
Chris@10 861 }
Chris@10 862 TV = TR + TU;
Chris@10 863 T3I = T2M + T2N;
Chris@10 864 T2x = TR - TU;
Chris@10 865 T2O = T2M - T2N;
Chris@10 866 }
Chris@10 867 {
Chris@10 868 E T11, T2z, T1a, T2A;
Chris@10 869 {
Chris@10 870 E TY, T10, T15, T19;
Chris@10 871 TY = ri[WS(rs, 16)];
Chris@10 872 T10 = ii[WS(rs, 16)];
Chris@10 873 T11 = FMA(TX, TY, TZ * T10);
Chris@10 874 T2z = FNMS(TZ, TY, TX * T10);
Chris@10 875 T15 = ri[WS(rs, 6)];
Chris@10 876 T19 = ii[WS(rs, 6)];
Chris@10 877 T1a = FMA(T14, T15, T18 * T19);
Chris@10 878 T2A = FNMS(T18, T15, T14 * T19);
Chris@10 879 }
Chris@10 880 T1b = T11 + T1a;
Chris@10 881 T3K = T2z + T2A;
Chris@10 882 T2B = T2z - T2A;
Chris@10 883 T2Q = T11 - T1a;
Chris@10 884 }
Chris@10 885 TW = TO - TV;
Chris@10 886 T1r = T1b - T1q;
Chris@10 887 T1s = TW + T1r;
Chris@10 888 T3J = T3H - T3I;
Chris@10 889 T3M = T3K - T3L;
Chris@10 890 T4n = T3J + T3M;
Chris@10 891 T3U = T3H + T3I;
Chris@10 892 T3V = T3K + T3L;
Chris@10 893 T43 = T3U + T3V;
Chris@10 894 T2c = TO + TV;
Chris@10 895 T2d = T1b + T1q;
Chris@10 896 T2e = T2c + T2d;
Chris@10 897 T2y = T2w + T2x;
Chris@10 898 T2D = T2B + T2C;
Chris@10 899 T4K = T2y + T2D;
Chris@10 900 T3d = T2w - T2x;
Chris@10 901 T3e = T2B - T2C;
Chris@10 902 T4v = T3d + T3e;
Chris@10 903 T3k = T2L + T2O;
Chris@10 904 T3l = T2Q + T2T;
Chris@10 905 T3m = T3k + T3l;
Chris@10 906 T2P = T2L - T2O;
Chris@10 907 T2U = T2Q - T2T;
Chris@10 908 T2V = T2P + T2U;
Chris@10 909 }
Chris@10 910 {
Chris@10 911 E T3y, T2a, T3x, T3O, T3Q, T3G, T3N, T3P, T3z;
Chris@10 912 T3y = KP559016994 * (T1s - T29);
Chris@10 913 T2a = T1s + T29;
Chris@10 914 T3x = FNMS(KP250000000, T2a, TF);
Chris@10 915 T3G = T3C - T3F;
Chris@10 916 T3N = T3J - T3M;
Chris@10 917 T3O = FNMS(KP587785252, T3N, KP951056516 * T3G);
Chris@10 918 T3Q = FMA(KP951056516, T3N, KP587785252 * T3G);
Chris@10 919 ri[WS(rs, 10)] = TF + T2a;
Chris@10 920 T3P = T3y + T3x;
Chris@10 921 ri[WS(rs, 14)] = T3P - T3Q;
Chris@10 922 ri[WS(rs, 6)] = T3P + T3Q;
Chris@10 923 T3z = T3x - T3y;
Chris@10 924 ri[WS(rs, 2)] = T3z - T3O;
Chris@10 925 ri[WS(rs, 18)] = T3z + T3O;
Chris@10 926 }
Chris@10 927 {
Chris@10 928 E T4r, T4p, T4q, T4l, T4u, T4j, T4k, T4t, T4s;
Chris@10 929 T4r = KP559016994 * (T4n - T4o);
Chris@10 930 T4p = T4n + T4o;
Chris@10 931 T4q = FNMS(KP250000000, T4p, T4m);
Chris@10 932 T4j = T1N - T28;
Chris@10 933 T4k = TW - T1r;
Chris@10 934 T4l = FNMS(KP587785252, T4k, KP951056516 * T4j);
Chris@10 935 T4u = FMA(KP951056516, T4k, KP587785252 * T4j);
Chris@10 936 ii[WS(rs, 10)] = T4p + T4m;
Chris@10 937 T4t = T4r + T4q;
Chris@10 938 ii[WS(rs, 6)] = T4t - T4u;
Chris@10 939 ii[WS(rs, 14)] = T4u + T4t;
Chris@10 940 T4s = T4q - T4r;
Chris@10 941 ii[WS(rs, 2)] = T4l + T4s;
Chris@10 942 ii[WS(rs, 18)] = T4s - T4l;
Chris@10 943 }
Chris@10 944 {
Chris@10 945 E T3R, T2i, T3S, T40, T42, T3W, T3Z, T41, T3T;
Chris@10 946 T3R = KP559016994 * (T2e - T2h);
Chris@10 947 T2i = T2e + T2h;
Chris@10 948 T3S = FNMS(KP250000000, T2i, T2b);
Chris@10 949 T3W = T3U - T3V;
Chris@10 950 T3Z = T3X - T3Y;
Chris@10 951 T40 = FMA(KP951056516, T3W, KP587785252 * T3Z);
Chris@10 952 T42 = FNMS(KP587785252, T3W, KP951056516 * T3Z);
Chris@10 953 ri[0] = T2b + T2i;
Chris@10 954 T41 = T3S - T3R;
Chris@10 955 ri[WS(rs, 12)] = T41 - T42;
Chris@10 956 ri[WS(rs, 8)] = T41 + T42;
Chris@10 957 T3T = T3R + T3S;
Chris@10 958 ri[WS(rs, 4)] = T3T - T40;
Chris@10 959 ri[WS(rs, 16)] = T3T + T40;
Chris@10 960 }
Chris@10 961 {
Chris@10 962 E T4e, T45, T4f, T4d, T4i, T4b, T4c, T4h, T4g;
Chris@10 963 T4e = KP559016994 * (T43 - T44);
Chris@10 964 T45 = T43 + T44;
Chris@10 965 T4f = FNMS(KP250000000, T45, T4a);
Chris@10 966 T4b = T2c - T2d;
Chris@10 967 T4c = T2f - T2g;
Chris@10 968 T4d = FMA(KP951056516, T4b, KP587785252 * T4c);
Chris@10 969 T4i = FNMS(KP587785252, T4b, KP951056516 * T4c);
Chris@10 970 ii[0] = T45 + T4a;
Chris@10 971 T4h = T4f - T4e;
Chris@10 972 ii[WS(rs, 8)] = T4h - T4i;
Chris@10 973 ii[WS(rs, 12)] = T4i + T4h;
Chris@10 974 T4g = T4e + T4f;
Chris@10 975 ii[WS(rs, 4)] = T4d + T4g;
Chris@10 976 ii[WS(rs, 16)] = T4g - T4d;
Chris@10 977 }
Chris@10 978 {
Chris@10 979 E T39, T37, T38, T2F, T3b, T2t, T2E, T3c, T3a;
Chris@10 980 T39 = KP559016994 * (T2V - T36);
Chris@10 981 T37 = T2V + T36;
Chris@10 982 T38 = FNMS(KP250000000, T37, T2K);
Chris@10 983 T2t = T2n - T2s;
Chris@10 984 T2E = T2y - T2D;
Chris@10 985 T2F = FNMS(KP587785252, T2E, KP951056516 * T2t);
Chris@10 986 T3b = FMA(KP951056516, T2E, KP587785252 * T2t);
Chris@10 987 ri[WS(rs, 15)] = T2K + T37;
Chris@10 988 T3c = T39 + T38;
Chris@10 989 ri[WS(rs, 11)] = T3b + T3c;
Chris@10 990 ri[WS(rs, 19)] = T3c - T3b;
Chris@10 991 T3a = T38 - T39;
Chris@10 992 ri[WS(rs, 3)] = T2F + T3a;
Chris@10 993 ri[WS(rs, 7)] = T3a - T2F;
Chris@10 994 }
Chris@10 995 {
Chris@10 996 E T4O, T4M, T4N, T4S, T4U, T4Q, T4R, T4T, T4P;
Chris@10 997 T4O = KP559016994 * (T4K - T4L);
Chris@10 998 T4M = T4K + T4L;
Chris@10 999 T4N = FNMS(KP250000000, T4M, T4J);
Chris@10 1000 T4Q = T30 - T35;
Chris@10 1001 T4R = T2P - T2U;
Chris@10 1002 T4S = FNMS(KP587785252, T4R, KP951056516 * T4Q);
Chris@10 1003 T4U = FMA(KP951056516, T4R, KP587785252 * T4Q);
Chris@10 1004 ii[WS(rs, 15)] = T4M + T4J;
Chris@10 1005 T4T = T4O + T4N;
Chris@10 1006 ii[WS(rs, 11)] = T4T - T4U;
Chris@10 1007 ii[WS(rs, 19)] = T4U + T4T;
Chris@10 1008 T4P = T4N - T4O;
Chris@10 1009 ii[WS(rs, 3)] = T4P - T4S;
Chris@10 1010 ii[WS(rs, 7)] = T4S + T4P;
Chris@10 1011 }
Chris@10 1012 {
Chris@10 1013 E T3q, T3s, T3t, T3j, T3v, T3f, T3i, T3w, T3u;
Chris@10 1014 T3q = KP559016994 * (T3m - T3p);
Chris@10 1015 T3s = T3m + T3p;
Chris@10 1016 T3t = FNMS(KP250000000, T3s, T3r);
Chris@10 1017 T3f = T3d - T3e;
Chris@10 1018 T3i = T3g - T3h;
Chris@10 1019 T3j = FMA(KP951056516, T3f, KP587785252 * T3i);
Chris@10 1020 T3v = FNMS(KP587785252, T3f, KP951056516 * T3i);
Chris@10 1021 ri[WS(rs, 5)] = T3r + T3s;
Chris@10 1022 T3w = T3t - T3q;
Chris@10 1023 ri[WS(rs, 13)] = T3v + T3w;
Chris@10 1024 ri[WS(rs, 17)] = T3w - T3v;
Chris@10 1025 T3u = T3q + T3t;
Chris@10 1026 ri[WS(rs, 1)] = T3j + T3u;
Chris@10 1027 ri[WS(rs, 9)] = T3u - T3j;
Chris@10 1028 }
Chris@10 1029 {
Chris@10 1030 E T4x, T4B, T4C, T4G, T4I, T4E, T4F, T4H, T4D;
Chris@10 1031 T4x = KP559016994 * (T4v - T4w);
Chris@10 1032 T4B = T4v + T4w;
Chris@10 1033 T4C = FNMS(KP250000000, T4B, T4A);
Chris@10 1034 T4E = T3k - T3l;
Chris@10 1035 T4F = T3n - T3o;
Chris@10 1036 T4G = FMA(KP951056516, T4E, KP587785252 * T4F);
Chris@10 1037 T4I = FNMS(KP587785252, T4E, KP951056516 * T4F);
Chris@10 1038 ii[WS(rs, 5)] = T4B + T4A;
Chris@10 1039 T4H = T4C - T4x;
Chris@10 1040 ii[WS(rs, 13)] = T4H - T4I;
Chris@10 1041 ii[WS(rs, 17)] = T4I + T4H;
Chris@10 1042 T4D = T4x + T4C;
Chris@10 1043 ii[WS(rs, 1)] = T4D - T4G;
Chris@10 1044 ii[WS(rs, 9)] = T4G + T4D;
Chris@10 1045 }
Chris@10 1046 }
Chris@10 1047 }
Chris@10 1048 }
Chris@10 1049 }
Chris@10 1050
/*
 * Twiddle instruction table for the t2_20 codelet; it is referenced by the
 * descriptor `desc` defined right after it.
 *
 * The table holds four TW_CEXP entries whose last field is 1, 3, 9 and 19,
 * followed by one TW_NEXT entry that ends the list.
 *
 * NOTE(review): the generator command line at the top of the file uses
 * -twiddle-log3 for n = 20, so presumably only these four complex twiddle
 * factors are stored and the kernel derives the rest from them -- confirm
 * against the tw_instr / TW_CEXP / TW_NEXT definitions in the project
 * headers, which are not visible here.
 */
Chris@10 1051 static const tw_instr twinstr[] = {
Chris@10 1052 {TW_CEXP, 0, 1},
Chris@10 1053 {TW_CEXP, 0, 3},
Chris@10 1054 {TW_CEXP, 0, 9},
Chris@10 1055 {TW_CEXP, 0, 19},
Chris@10 1056 {TW_NEXT, 1, 0}
Chris@10 1057 };
Chris@10 1058
/*
 * Descriptor passed to the registration call in codelet_t2_20.
 *
 * The positional initializer supplies, in order: the integer 20, the name
 * string "t2_20", the twinstr table, &GENUS, the aggregate {204, 92, 72, 0},
 * and three trailing zeros.
 *
 * NOTE(review): 20 is presumably the transform radix (the generator was run
 * with -n 20) and the four-number aggregate presumably the operation counts
 * (adds, multiplies, fused multiply/adds, other) -- confirm against the
 * ct_desc declaration, which is not visible in this file.
 */
Chris@10 1059 static const ct_desc desc = { 20, "t2_20", twinstr, &GENUS, {204, 92, 72, 0}, 0, 0, 0 };
Chris@10 1060
/*
 * Registration entry point for this codelet.
 *
 * Forwards the planner `p`, the t2_20 kernel function and the address of its
 * descriptor `desc` to X(kdft_dit_register). Returns nothing.
 *
 * NOTE(review): X() is a project macro, presumably a symbol-name prefixing
 * helper -- confirm in the project headers.
 */
Chris@10 1061 void X(codelet_t2_20) (planner *p) {
Chris@10 1062 X(kdft_dit_register) (p, t2_20, &desc);
Chris@10 1063 }
Chris@10 1064 #endif /* HAVE_FMA */