annotate src/fftw-3.3.3/dft/scalar/codelets/t1_20.c @ 83:ae30d91d2ffe

Replace these with versions built using an older toolset (so as to avoid ABI incompatibilities when linking on Ubuntu 14.04 for packaging purposes)
author Chris Cannam
date Fri, 07 Feb 2020 11:51:13 +0000
parents 37bf6b4a2645
children
rev   line source
Chris@10 1 /*
Chris@10 2 * Copyright (c) 2003, 2007-11 Matteo Frigo
Chris@10 3 * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
Chris@10 4 *
Chris@10 5 * This program is free software; you can redistribute it and/or modify
Chris@10 6 * it under the terms of the GNU General Public License as published by
Chris@10 7 * the Free Software Foundation; either version 2 of the License, or
Chris@10 8 * (at your option) any later version.
Chris@10 9 *
Chris@10 10 * This program is distributed in the hope that it will be useful,
Chris@10 11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
Chris@10 12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
Chris@10 13 * GNU General Public License for more details.
Chris@10 14 *
Chris@10 15 * You should have received a copy of the GNU General Public License
Chris@10 16 * along with this program; if not, write to the Free Software
Chris@10 17 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
Chris@10 18 *
Chris@10 19 */
Chris@10 20
Chris@10 21 /* This file was automatically generated --- DO NOT EDIT */
Chris@10 22 /* Generated on Sun Nov 25 07:35:53 EST 2012 */
Chris@10 23
Chris@10 24 #include "codelet-dft.h"
Chris@10 25
Chris@10 26 #ifdef HAVE_FMA
Chris@10 27
Chris@10 28 /* Generated by: ../../../genfft/gen_twiddle.native -fma -reorder-insns -schedule-for-pipeline -compact -variables 4 -pipeline-latency 4 -n 20 -name t1_20 -include t.h */
Chris@10 29
Chris@10 30 /*
Chris@10 31 * This function contains 246 FP additions, 148 FP multiplications,
Chris@10 32 * (or, 136 additions, 38 multiplications, 110 fused multiply/add),
Chris@10 33 * 97 stack variables, 4 constants, and 80 memory accesses
Chris@10 34 */
Chris@10 35 #include "t.h"
Chris@10 36
Chris@10 37 static void t1_20(R *ri, R *ii, const R *W, stride rs, INT mb, INT me, INT ms) /* HAVE_FMA build of the size-20 twiddle codelet (generator: -n 20); each loop pass reads 20 ri/ii pairs, applies W, and writes 20 pairs back to the same slots. NOTE(review): ri/ii are presumably the real/imaginary arrays -- confirm */
Chris@10 38 {
Chris@10 39 DK(KP951056516, +0.951056516295153572116439333379382143405698634); /* numerically sin(2*pi/5) */
Chris@10 40 DK(KP559016994, +0.559016994374947424102293417182819058860154590); /* numerically sqrt(5)/4 */
Chris@10 41 DK(KP250000000, +0.250000000000000000000000000000000000000000000); /* 1/4 */
Chris@10 42 DK(KP618033988, +0.618033988749894848204586834365638117720309180); /* numerically (sqrt(5)-1)/2 */
Chris@10 43 {
Chris@10 44 INT m; /* loop counter: runs from mb up to, but not including, me */
Chris@10 45 for (m = mb, W = W + (mb * 38); m < me; m = m + 1, ri = ri + ms, ii = ii + ms, W = W + 38, MAKE_VOLATILE_STRIDE(40, rs)) { /* W starts at offset mb*38; every pass advances ri/ii by ms and W by 38 (the body reads W[0]..W[37], i.e. 19 pairs) */
Chris@10 46 E T4P, T4Y, T50, T4U, T4S, T4T, T4Z, T4V;
Chris@10 47 {
Chris@10 48 E T4N, T4r, T8, T2i, T4n, T2n, T4O, Tl, T2v, T3v, T40, T4b, TN, T2b, T3F;
Chris@10 49 E T3i, T2R, T3z, T3W, T4f, T27, T2f, T3J, T3a, T2K, T3y, T3T, T4e, T1G, T2e;
Chris@10 50 E T3I, T33, T2C, T3w, T43, T4c, T1e, T2c, T3G, T3p;
Chris@10 51 { /* loads inputs 0, 10, 5, 15, 4, 19, 14, 9 and applies their W pairs */
Chris@10 52 E T1, T4q, T3, T6, T2, T5;
Chris@10 53 T1 = ri[0]; /* input 0 is used as loaded; no W entry is applied to it */
Chris@10 54 T4q = ii[0];
Chris@10 55 T3 = ri[WS(rs, 10)]; /* WS is defined outside this file; presumably the offset of element k for stride rs -- confirm */
Chris@10 56 T6 = ii[WS(rs, 10)];
Chris@10 57 T2 = W[18]; /* input k (k = 1..19) is combined with the pair W[2k-2], W[2k-1]; here k = 10 */
Chris@10 58 T5 = W[19];
Chris@10 59 {
Chris@10 60 E Ta, Td, Tg, T2j, Tb, Tj, Tf, Tc, Ti;
Chris@10 61 {
Chris@10 62 E T4o, T4, T9, T4p, T7;
Chris@10 63 Ta = ri[WS(rs, 5)];
Chris@10 64 Td = ii[WS(rs, 5)];
Chris@10 65 T4o = T2 * T6;
Chris@10 66 T4 = T2 * T3;
Chris@10 67 T9 = W[8];
Chris@10 68 Tg = ri[WS(rs, 15)];
Chris@10 69 T4p = FNMS(T5, T3, T4o); /* FMA/FNMS come from the included headers; presumably FMA(a,b,c) = a*b + c and FNMS(a,b,c) = c - a*b -- confirm */
Chris@10 70 T7 = FMA(T5, T6, T4);
Chris@10 71 T2j = T9 * Td;
Chris@10 72 Tb = T9 * Ta;
Chris@10 73 T4N = T4q - T4p;
Chris@10 74 T4r = T4p + T4q;
Chris@10 75 T8 = T1 + T7;
Chris@10 76 T2i = T1 - T7;
Chris@10 77 Tj = ii[WS(rs, 15)];
Chris@10 78 Tf = W[28];
Chris@10 79 }
Chris@10 80 Tc = W[9];
Chris@10 81 Ti = W[29];
Chris@10 82 {
Chris@10 83 E T3d, Ts, T2t, TL, TB, TE, TD, T3f, Ty, T2q, TC;
Chris@10 84 {
Chris@10 85 E TH, TK, TJ, T2s, TI;
Chris@10 86 {
Chris@10 87 E To, Tr, Tp, T3c, Tq, TG;
Chris@10 88 {
Chris@10 89 E T2k, Te, T2m, Tk, T2l, Th, Tn;
Chris@10 90 To = ri[WS(rs, 4)];
Chris@10 91 T2l = Tf * Tj;
Chris@10 92 Th = Tf * Tg;
Chris@10 93 T2k = FNMS(Tc, Ta, T2j);
Chris@10 94 Te = FMA(Tc, Td, Tb);
Chris@10 95 T2m = FNMS(Ti, Tg, T2l);
Chris@10 96 Tk = FMA(Ti, Tj, Th);
Chris@10 97 Tr = ii[WS(rs, 4)];
Chris@10 98 Tn = W[6];
Chris@10 99 T4n = T2k + T2m;
Chris@10 100 T2n = T2k - T2m;
Chris@10 101 T4O = Te - Tk;
Chris@10 102 Tl = Te + Tk;
Chris@10 103 Tp = Tn * To;
Chris@10 104 T3c = Tn * Tr;
Chris@10 105 }
Chris@10 106 Tq = W[7];
Chris@10 107 TH = ri[WS(rs, 19)];
Chris@10 108 TK = ii[WS(rs, 19)];
Chris@10 109 TG = W[36];
Chris@10 110 T3d = FNMS(Tq, To, T3c);
Chris@10 111 Ts = FMA(Tq, Tr, Tp);
Chris@10 112 TJ = W[37];
Chris@10 113 T2s = TG * TK;
Chris@10 114 TI = TG * TH;
Chris@10 115 }
Chris@10 116 {
Chris@10 117 E Tu, Tx, Tt, Tw, T3e, Tv, TA;
Chris@10 118 Tu = ri[WS(rs, 14)];
Chris@10 119 Tx = ii[WS(rs, 14)];
Chris@10 120 T2t = FNMS(TJ, TH, T2s);
Chris@10 121 TL = FMA(TJ, TK, TI);
Chris@10 122 Tt = W[26];
Chris@10 123 Tw = W[27];
Chris@10 124 TB = ri[WS(rs, 9)];
Chris@10 125 TE = ii[WS(rs, 9)];
Chris@10 126 T3e = Tt * Tx;
Chris@10 127 Tv = Tt * Tu;
Chris@10 128 TA = W[16];
Chris@10 129 TD = W[17];
Chris@10 130 T3f = FNMS(Tw, Tu, T3e);
Chris@10 131 Ty = FMA(Tw, Tx, Tv);
Chris@10 132 T2q = TA * TE;
Chris@10 133 TC = TA * TB;
Chris@10 134 }
Chris@10 135 }
Chris@10 136 {
Chris@10 137 E T3g, T3Y, Tz, T2p, T2r, TF;
Chris@10 138 T3g = T3d - T3f;
Chris@10 139 T3Y = T3d + T3f;
Chris@10 140 Tz = Ts + Ty;
Chris@10 141 T2p = Ts - Ty;
Chris@10 142 T2r = FNMS(TD, TB, T2q);
Chris@10 143 TF = FMA(TD, TE, TC);
Chris@10 144 {
Chris@10 145 E T3Z, T2u, T3h, TM;
Chris@10 146 T3Z = T2r + T2t;
Chris@10 147 T2u = T2r - T2t;
Chris@10 148 T3h = TF - TL;
Chris@10 149 TM = TF + TL;
Chris@10 150 T2v = T2p - T2u;
Chris@10 151 T3v = T2p + T2u;
Chris@10 152 T40 = T3Y - T3Z;
Chris@10 153 T4b = T3Y + T3Z;
Chris@10 154 TN = Tz - TM;
Chris@10 155 T2b = Tz + TM;
Chris@10 156 T3F = T3g - T3h;
Chris@10 157 T3i = T3g + T3h;
Chris@10 158 }
Chris@10 159 }
Chris@10 160 }
Chris@10 161 }
Chris@10 162 }
Chris@10 163 { /* loads inputs 12, 7, 2, 17 and applies their W pairs */
Chris@10 164 E T35, T1M, T2P, T25, T1V, T1Y, T1X, T37, T1S, T2M, T1W;
Chris@10 165 {
Chris@10 166 E T21, T24, T23, T2O, T22;
Chris@10 167 {
Chris@10 168 E T1I, T1L, T1H, T1K, T34, T1J, T20;
Chris@10 169 T1I = ri[WS(rs, 12)];
Chris@10 170 T1L = ii[WS(rs, 12)];
Chris@10 171 T1H = W[22];
Chris@10 172 T1K = W[23];
Chris@10 173 T21 = ri[WS(rs, 7)];
Chris@10 174 T24 = ii[WS(rs, 7)];
Chris@10 175 T34 = T1H * T1L;
Chris@10 176 T1J = T1H * T1I;
Chris@10 177 T20 = W[12];
Chris@10 178 T23 = W[13];
Chris@10 179 T35 = FNMS(T1K, T1I, T34);
Chris@10 180 T1M = FMA(T1K, T1L, T1J);
Chris@10 181 T2O = T20 * T24;
Chris@10 182 T22 = T20 * T21;
Chris@10 183 }
Chris@10 184 {
Chris@10 185 E T1O, T1R, T1N, T1Q, T36, T1P, T1U;
Chris@10 186 T1O = ri[WS(rs, 2)];
Chris@10 187 T1R = ii[WS(rs, 2)];
Chris@10 188 T2P = FNMS(T23, T21, T2O);
Chris@10 189 T25 = FMA(T23, T24, T22);
Chris@10 190 T1N = W[2];
Chris@10 191 T1Q = W[3];
Chris@10 192 T1V = ri[WS(rs, 17)];
Chris@10 193 T1Y = ii[WS(rs, 17)];
Chris@10 194 T36 = T1N * T1R;
Chris@10 195 T1P = T1N * T1O;
Chris@10 196 T1U = W[32];
Chris@10 197 T1X = W[33];
Chris@10 198 T37 = FNMS(T1Q, T1O, T36);
Chris@10 199 T1S = FMA(T1Q, T1R, T1P);
Chris@10 200 T2M = T1U * T1Y;
Chris@10 201 T1W = T1U * T1V;
Chris@10 202 }
Chris@10 203 }
Chris@10 204 {
Chris@10 205 E T38, T3U, T1T, T2L, T2N, T1Z;
Chris@10 206 T38 = T35 - T37;
Chris@10 207 T3U = T35 + T37;
Chris@10 208 T1T = T1M + T1S;
Chris@10 209 T2L = T1M - T1S;
Chris@10 210 T2N = FNMS(T1X, T1V, T2M);
Chris@10 211 T1Z = FMA(T1X, T1Y, T1W);
Chris@10 212 {
Chris@10 213 E T3V, T2Q, T39, T26;
Chris@10 214 T3V = T2N + T2P;
Chris@10 215 T2Q = T2N - T2P;
Chris@10 216 T39 = T1Z - T25;
Chris@10 217 T26 = T1Z + T25;
Chris@10 218 T2R = T2L - T2Q;
Chris@10 219 T3z = T2L + T2Q;
Chris@10 220 T3W = T3U - T3V;
Chris@10 221 T4f = T3U + T3V;
Chris@10 222 T27 = T1T - T26;
Chris@10 223 T2f = T1T + T26;
Chris@10 224 T3J = T38 - T39;
Chris@10 225 T3a = T38 + T39;
Chris@10 226 }
Chris@10 227 }
Chris@10 228 }
Chris@10 229 { /* loads inputs 8, 3, 18, 13 and applies their W pairs */
Chris@10 230 E T2Y, T1l, T2I, T1E, T1u, T1x, T1w, T30, T1r, T2F, T1v;
Chris@10 231 {
Chris@10 232 E T1A, T1D, T1C, T2H, T1B;
Chris@10 233 {
Chris@10 234 E T1h, T1k, T1g, T1j, T2X, T1i, T1z;
Chris@10 235 T1h = ri[WS(rs, 8)];
Chris@10 236 T1k = ii[WS(rs, 8)];
Chris@10 237 T1g = W[14];
Chris@10 238 T1j = W[15];
Chris@10 239 T1A = ri[WS(rs, 3)];
Chris@10 240 T1D = ii[WS(rs, 3)];
Chris@10 241 T2X = T1g * T1k;
Chris@10 242 T1i = T1g * T1h;
Chris@10 243 T1z = W[4];
Chris@10 244 T1C = W[5];
Chris@10 245 T2Y = FNMS(T1j, T1h, T2X);
Chris@10 246 T1l = FMA(T1j, T1k, T1i);
Chris@10 247 T2H = T1z * T1D;
Chris@10 248 T1B = T1z * T1A;
Chris@10 249 }
Chris@10 250 {
Chris@10 251 E T1n, T1q, T1m, T1p, T2Z, T1o, T1t;
Chris@10 252 T1n = ri[WS(rs, 18)];
Chris@10 253 T1q = ii[WS(rs, 18)];
Chris@10 254 T2I = FNMS(T1C, T1A, T2H);
Chris@10 255 T1E = FMA(T1C, T1D, T1B);
Chris@10 256 T1m = W[34];
Chris@10 257 T1p = W[35];
Chris@10 258 T1u = ri[WS(rs, 13)];
Chris@10 259 T1x = ii[WS(rs, 13)];
Chris@10 260 T2Z = T1m * T1q;
Chris@10 261 T1o = T1m * T1n;
Chris@10 262 T1t = W[24];
Chris@10 263 T1w = W[25];
Chris@10 264 T30 = FNMS(T1p, T1n, T2Z);
Chris@10 265 T1r = FMA(T1p, T1q, T1o);
Chris@10 266 T2F = T1t * T1x;
Chris@10 267 T1v = T1t * T1u;
Chris@10 268 }
Chris@10 269 }
Chris@10 270 {
Chris@10 271 E T31, T3R, T1s, T2E, T2G, T1y;
Chris@10 272 T31 = T2Y - T30;
Chris@10 273 T3R = T2Y + T30;
Chris@10 274 T1s = T1l + T1r;
Chris@10 275 T2E = T1l - T1r;
Chris@10 276 T2G = FNMS(T1w, T1u, T2F);
Chris@10 277 T1y = FMA(T1w, T1x, T1v);
Chris@10 278 {
Chris@10 279 E T3S, T2J, T32, T1F;
Chris@10 280 T3S = T2G + T2I;
Chris@10 281 T2J = T2G - T2I;
Chris@10 282 T32 = T1y - T1E;
Chris@10 283 T1F = T1y + T1E;
Chris@10 284 T2K = T2E - T2J;
Chris@10 285 T3y = T2E + T2J;
Chris@10 286 T3T = T3R - T3S;
Chris@10 287 T4e = T3R + T3S;
Chris@10 288 T1G = T1s - T1F;
Chris@10 289 T2e = T1s + T1F;
Chris@10 290 T3I = T31 - T32;
Chris@10 291 T33 = T31 + T32;
Chris@10 292 }
Chris@10 293 }
Chris@10 294 }
Chris@10 295 { /* loads inputs 16, 11, 6, 1 and applies their W pairs */
Chris@10 296 E T3k, TT, T2A, T1c, T12, T15, T14, T3m, TZ, T2x, T13;
Chris@10 297 {
Chris@10 298 E T18, T1b, T1a, T2z, T19;
Chris@10 299 {
Chris@10 300 E TP, TS, TO, TR, T3j, TQ, T17;
Chris@10 301 TP = ri[WS(rs, 16)];
Chris@10 302 TS = ii[WS(rs, 16)];
Chris@10 303 TO = W[30];
Chris@10 304 TR = W[31];
Chris@10 305 T18 = ri[WS(rs, 11)];
Chris@10 306 T1b = ii[WS(rs, 11)];
Chris@10 307 T3j = TO * TS;
Chris@10 308 TQ = TO * TP;
Chris@10 309 T17 = W[20];
Chris@10 310 T1a = W[21];
Chris@10 311 T3k = FNMS(TR, TP, T3j);
Chris@10 312 TT = FMA(TR, TS, TQ);
Chris@10 313 T2z = T17 * T1b;
Chris@10 314 T19 = T17 * T18;
Chris@10 315 }
Chris@10 316 {
Chris@10 317 E TV, TY, TU, TX, T3l, TW, T11;
Chris@10 318 TV = ri[WS(rs, 6)];
Chris@10 319 TY = ii[WS(rs, 6)];
Chris@10 320 T2A = FNMS(T1a, T18, T2z);
Chris@10 321 T1c = FMA(T1a, T1b, T19);
Chris@10 322 TU = W[10];
Chris@10 323 TX = W[11];
Chris@10 324 T12 = ri[WS(rs, 1)];
Chris@10 325 T15 = ii[WS(rs, 1)];
Chris@10 326 T3l = TU * TY;
Chris@10 327 TW = TU * TV;
Chris@10 328 T11 = W[0];
Chris@10 329 T14 = W[1];
Chris@10 330 T3m = FNMS(TX, TV, T3l);
Chris@10 331 TZ = FMA(TX, TY, TW);
Chris@10 332 T2x = T11 * T15;
Chris@10 333 T13 = T11 * T12;
Chris@10 334 }
Chris@10 335 }
Chris@10 336 {
Chris@10 337 E T3n, T41, T10, T2w, T2y, T16;
Chris@10 338 T3n = T3k - T3m;
Chris@10 339 T41 = T3k + T3m;
Chris@10 340 T10 = TT + TZ;
Chris@10 341 T2w = TT - TZ;
Chris@10 342 T2y = FNMS(T14, T12, T2x);
Chris@10 343 T16 = FMA(T14, T15, T13);
Chris@10 344 {
Chris@10 345 E T42, T2B, T3o, T1d;
Chris@10 346 T42 = T2y + T2A;
Chris@10 347 T2B = T2y - T2A;
Chris@10 348 T3o = T16 - T1c;
Chris@10 349 T1d = T16 + T1c;
Chris@10 350 T2C = T2w - T2B;
Chris@10 351 T3w = T2w + T2B;
Chris@10 352 T43 = T41 - T42;
Chris@10 353 T4c = T41 + T42;
Chris@10 354 T1e = T10 - T1d;
Chris@10 355 T2c = T10 + T1d;
Chris@10 356 T3G = T3n - T3o;
Chris@10 357 T3p = T3n + T3o;
Chris@10 358 }
Chris@10 359 }
Chris@10 360 }
Chris@10 361 { /* combines the gathered values; stores every ri[] output and the ii[] outputs other than 1, 5, 9, 13, 17 */
Chris@10 362 E T4s, T4k, T4l, T4h, T4j, T49, T4y, T4A, T48;
Chris@10 363 {
Chris@10 364 E T4D, T4C, T2a, T47, T45, T4B, T4M, T4K, T46, T3Q;
Chris@10 365 {
Chris@10 366 E Tm, T1f, T4J, T4I, T28, T3X, T44, T29, T3P, T3O;
Chris@10 367 T4D = T3T + T3W;
Chris@10 368 T3X = T3T - T3W;
Chris@10 369 T44 = T40 - T43;
Chris@10 370 T4C = T40 + T43;
Chris@10 371 T2a = T8 + Tl;
Chris@10 372 Tm = T8 - Tl;
Chris@10 373 T1f = TN + T1e;
Chris@10 374 T4J = TN - T1e;
Chris@10 375 T4I = T1G - T27;
Chris@10 376 T28 = T1G + T27;
Chris@10 377 T47 = FMA(KP618033988, T3X, T44);
Chris@10 378 T45 = FNMS(KP618033988, T44, T3X);
Chris@10 379 T29 = T1f + T28;
Chris@10 380 T3P = T1f - T28;
Chris@10 381 T4B = T4r - T4n;
Chris@10 382 T4s = T4n + T4r;
Chris@10 383 ri[WS(rs, 10)] = Tm + T29; /* first store of the pass; every ri/ii load appears earlier in the body, so overwriting the input slots from here on is safe */
Chris@10 384 T3O = FNMS(KP250000000, T29, Tm);
Chris@10 385 T4M = FMA(KP618033988, T4I, T4J);
Chris@10 386 T4K = FNMS(KP618033988, T4J, T4I);
Chris@10 387 T46 = FMA(KP559016994, T3P, T3O);
Chris@10 388 T3Q = FNMS(KP559016994, T3P, T3O);
Chris@10 389 }
Chris@10 390 {
Chris@10 391 E T2d, T4w, T4x, T2g, T2h;
Chris@10 392 {
Chris@10 393 E T4d, T4G, T4F, T4g, T4E, T4L, T4H;
Chris@10 394 T4k = T4b + T4c;
Chris@10 395 T4d = T4b - T4c;
Chris@10 396 T4G = T4C - T4D;
Chris@10 397 T4E = T4C + T4D;
Chris@10 398 ri[WS(rs, 18)] = FMA(KP951056516, T45, T3Q);
Chris@10 399 ri[WS(rs, 2)] = FNMS(KP951056516, T45, T3Q);
Chris@10 400 ri[WS(rs, 6)] = FMA(KP951056516, T47, T46);
Chris@10 401 ri[WS(rs, 14)] = FNMS(KP951056516, T47, T46);
Chris@10 402 ii[WS(rs, 10)] = T4E + T4B;
Chris@10 403 T4F = FNMS(KP250000000, T4E, T4B);
Chris@10 404 T4g = T4e - T4f;
Chris@10 405 T4l = T4e + T4f;
Chris@10 406 T2d = T2b + T2c;
Chris@10 407 T4w = T2b - T2c;
Chris@10 408 T4L = FMA(KP559016994, T4G, T4F);
Chris@10 409 T4H = FNMS(KP559016994, T4G, T4F);
Chris@10 410 T4h = FMA(KP618033988, T4g, T4d);
Chris@10 411 T4j = FNMS(KP618033988, T4d, T4g);
Chris@10 412 ii[WS(rs, 18)] = FNMS(KP951056516, T4K, T4H);
Chris@10 413 ii[WS(rs, 2)] = FMA(KP951056516, T4K, T4H);
Chris@10 414 ii[WS(rs, 14)] = FMA(KP951056516, T4M, T4L);
Chris@10 415 ii[WS(rs, 6)] = FNMS(KP951056516, T4M, T4L);
Chris@10 416 T4x = T2e - T2f;
Chris@10 417 T2g = T2e + T2f;
Chris@10 418 }
Chris@10 419 T2h = T2d + T2g;
Chris@10 420 T49 = T2d - T2g;
Chris@10 421 T4y = FMA(KP618033988, T4x, T4w);
Chris@10 422 T4A = FNMS(KP618033988, T4w, T4x);
Chris@10 423 ri[0] = T2a + T2h;
Chris@10 424 T48 = FNMS(KP250000000, T2h, T2a);
Chris@10 425 }
Chris@10 426 }
Chris@10 427 {
Chris@10 428 E T3u, T51, T5a, T5c, T56, T54;
Chris@10 429 {
Chris@10 430 E T53, T52, T3t, T3r, T2o, T59, T58, T2T, T2V, T4u, T4t, T2U, T3s, T2W;
Chris@10 431 {
Chris@10 432 E T3b, T3q, T4i, T4a, T4m;
Chris@10 433 T53 = T33 + T3a;
Chris@10 434 T3b = T33 - T3a;
Chris@10 435 T3q = T3i - T3p;
Chris@10 436 T52 = T3i + T3p;
Chris@10 437 T4i = FNMS(KP559016994, T49, T48);
Chris@10 438 T4a = FMA(KP559016994, T49, T48);
Chris@10 439 T4m = T4k + T4l;
Chris@10 440 T4u = T4k - T4l;
Chris@10 441 ri[WS(rs, 16)] = FMA(KP951056516, T4h, T4a);
Chris@10 442 ri[WS(rs, 4)] = FNMS(KP951056516, T4h, T4a);
Chris@10 443 ri[WS(rs, 8)] = FMA(KP951056516, T4j, T4i);
Chris@10 444 ri[WS(rs, 12)] = FNMS(KP951056516, T4j, T4i);
Chris@10 445 ii[0] = T4m + T4s;
Chris@10 446 T4t = FNMS(KP250000000, T4m, T4s);
Chris@10 447 T3t = FMA(KP618033988, T3b, T3q);
Chris@10 448 T3r = FNMS(KP618033988, T3q, T3b);
Chris@10 449 }
Chris@10 450 T3u = T2i + T2n;
Chris@10 451 T2o = T2i - T2n;
Chris@10 452 {
Chris@10 453 E T4v, T4z, T2D, T2S;
Chris@10 454 T4v = FMA(KP559016994, T4u, T4t);
Chris@10 455 T4z = FNMS(KP559016994, T4u, T4t);
Chris@10 456 T2D = T2v + T2C;
Chris@10 457 T59 = T2v - T2C;
Chris@10 458 T58 = T2K - T2R;
Chris@10 459 T2S = T2K + T2R;
Chris@10 460 ii[WS(rs, 16)] = FNMS(KP951056516, T4y, T4v);
Chris@10 461 ii[WS(rs, 4)] = FMA(KP951056516, T4y, T4v);
Chris@10 462 ii[WS(rs, 12)] = FMA(KP951056516, T4A, T4z);
Chris@10 463 ii[WS(rs, 8)] = FNMS(KP951056516, T4A, T4z);
Chris@10 464 T2T = T2D + T2S;
Chris@10 465 T2V = T2D - T2S;
Chris@10 466 }
Chris@10 467 ri[WS(rs, 15)] = T2o + T2T;
Chris@10 468 T2U = FNMS(KP250000000, T2T, T2o);
Chris@10 469 T51 = T4O + T4N;
Chris@10 470 T4P = T4N - T4O;
Chris@10 471 T5a = FNMS(KP618033988, T59, T58);
Chris@10 472 T5c = FMA(KP618033988, T58, T59);
Chris@10 473 T3s = FMA(KP559016994, T2V, T2U);
Chris@10 474 T2W = FNMS(KP559016994, T2V, T2U);
Chris@10 475 ri[WS(rs, 7)] = FNMS(KP951056516, T3r, T2W);
Chris@10 476 ri[WS(rs, 3)] = FMA(KP951056516, T3r, T2W);
Chris@10 477 ri[WS(rs, 19)] = FNMS(KP951056516, T3t, T3s);
Chris@10 478 ri[WS(rs, 11)] = FMA(KP951056516, T3t, T3s);
Chris@10 479 T56 = T52 - T53;
Chris@10 480 T54 = T52 + T53;
Chris@10 481 }
Chris@10 482 {
Chris@10 483 E T4Q, T4R, T3N, T3L, T4W, T4X, T3B, T3D, T3H, T3K, T55, T3C, T3M, T3E;
Chris@10 484 T4Q = T3F + T3G;
Chris@10 485 T3H = T3F - T3G;
Chris@10 486 T3K = T3I - T3J;
Chris@10 487 T4R = T3I + T3J;
Chris@10 488 ii[WS(rs, 15)] = T54 + T51;
Chris@10 489 T55 = FNMS(KP250000000, T54, T51);
Chris@10 490 T3N = FNMS(KP618033988, T3H, T3K);
Chris@10 491 T3L = FMA(KP618033988, T3K, T3H);
Chris@10 492 {
Chris@10 493 E T57, T5b, T3x, T3A;
Chris@10 494 T57 = FNMS(KP559016994, T56, T55);
Chris@10 495 T5b = FMA(KP559016994, T56, T55);
Chris@10 496 T3x = T3v + T3w;
Chris@10 497 T4W = T3v - T3w;
Chris@10 498 T4X = T3y - T3z;
Chris@10 499 T3A = T3y + T3z;
Chris@10 500 ii[WS(rs, 7)] = FMA(KP951056516, T5a, T57);
Chris@10 501 ii[WS(rs, 3)] = FNMS(KP951056516, T5a, T57);
Chris@10 502 ii[WS(rs, 19)] = FMA(KP951056516, T5c, T5b);
Chris@10 503 ii[WS(rs, 11)] = FNMS(KP951056516, T5c, T5b);
Chris@10 504 T3B = T3x + T3A;
Chris@10 505 T3D = T3x - T3A;
Chris@10 506 }
Chris@10 507 ri[WS(rs, 5)] = T3u + T3B;
Chris@10 508 T3C = FNMS(KP250000000, T3B, T3u);
Chris@10 509 T4Y = FMA(KP618033988, T4X, T4W);
Chris@10 510 T50 = FNMS(KP618033988, T4W, T4X);
Chris@10 511 T3M = FNMS(KP559016994, T3D, T3C);
Chris@10 512 T3E = FMA(KP559016994, T3D, T3C);
Chris@10 513 ri[WS(rs, 9)] = FNMS(KP951056516, T3L, T3E);
Chris@10 514 ri[WS(rs, 1)] = FMA(KP951056516, T3L, T3E);
Chris@10 515 ri[WS(rs, 17)] = FNMS(KP951056516, T3N, T3M);
Chris@10 516 ri[WS(rs, 13)] = FMA(KP951056516, T3N, T3M);
Chris@10 517 T4U = T4Q - T4R;
Chris@10 518 T4S = T4Q + T4R;
Chris@10 519 }
Chris@10 520 }
Chris@10 521 }
Chris@10 522 }
Chris@10 523 ii[WS(rs, 5)] = T4S + T4P; /* T4P, T4S, T4U, T4Y, T50 were declared at loop-body scope so the last five ii[] stores (5, 9, 1, 17, 13) can follow the inner block */
Chris@10 524 T4T = FNMS(KP250000000, T4S, T4P);
Chris@10 525 T4Z = FNMS(KP559016994, T4U, T4T);
Chris@10 526 T4V = FMA(KP559016994, T4U, T4T);
Chris@10 527 ii[WS(rs, 9)] = FMA(KP951056516, T4Y, T4V);
Chris@10 528 ii[WS(rs, 1)] = FNMS(KP951056516, T4Y, T4V);
Chris@10 529 ii[WS(rs, 17)] = FMA(KP951056516, T50, T4Z);
Chris@10 530 ii[WS(rs, 13)] = FNMS(KP951056516, T50, T4Z);
Chris@10 531 }
Chris@10 532 }
Chris@10 533 }
Chris@10 534
Chris@10 535 static const tw_instr twinstr[] = { /* twiddle-instruction table; it reaches the planner through desc below. Entry semantics are defined outside this file */
Chris@10 536 {TW_FULL, 0, 20}, /* NOTE(review): presumably requests the full twiddle set for radix 20 -- confirm against the tw_instr definition */
Chris@10 537 {TW_NEXT, 1, 0} /* NOTE(review): presumably the end-of-list marker -- confirm */
Chris@10 538 };
Chris@10 539
Chris@10 540 static const ct_desc desc = { 20, "t1_20", twinstr, &GENUS, {136, 38, 110, 0}, 0, 0, 0 }; /* descriptor passed at registration: size 20, codelet name, twiddle table; {136, 38, 110, 0} repeats the add / mul / fused-multiply-add counts quoted in the comment above the FMA t1_20. NOTE(review): meaning of the trailing zero fields is not visible here */
Chris@10 541
Chris@10 542 void X(codelet_t1_20) (planner *p) { /* public entry point of this file: hands the FMA t1_20 kernel and its descriptor to planner p */
Chris@10 543 X(kdft_dit_register) (p, t1_20, &desc); /* X(...) is a macro from the included headers; presumably it applies the library's symbol prefix -- confirm */
Chris@10 544 }
Chris@10 545 #else /* HAVE_FMA */
Chris@10 546
Chris@10 547 /* Generated by: ../../../genfft/gen_twiddle.native -compact -variables 4 -pipeline-latency 4 -n 20 -name t1_20 -include t.h */
Chris@10 548
Chris@10 549 /*
Chris@10 550 * This function contains 246 FP additions, 124 FP multiplications,
Chris@10 551 * (or, 184 additions, 62 multiplications, 62 fused multiply/add),
Chris@10 552 * 85 stack variables, 4 constants, and 80 memory accesses
Chris@10 553 */
Chris@10 554 #include "t.h"
Chris@10 555
Chris@10 556 static void t1_20(R *ri, R *ii, const R *W, stride rs, INT mb, INT me, INT ms) /* non-FMA build of the size-20 twiddle codelet (generator: -n 20); each loop pass reads 20 ri/ii pairs, applies W, and writes 20 pairs back to the same slots. NOTE(review): ri/ii are presumably the real/imaginary arrays -- confirm */
Chris@10 557 {
Chris@10 558 DK(KP587785252, +0.587785252292473129168705954639072768597652438); /* numerically sin(pi/5) */
Chris@10 559 DK(KP951056516, +0.951056516295153572116439333379382143405698634); /* numerically sin(2*pi/5) */
Chris@10 560 DK(KP250000000, +0.250000000000000000000000000000000000000000000); /* 1/4 */
Chris@10 561 DK(KP559016994, +0.559016994374947424102293417182819058860154590); /* numerically sqrt(5)/4 */
Chris@10 562 {
Chris@10 563 INT m; /* loop counter: runs from mb up to, but not including, me */
Chris@10 564 for (m = mb, W = W + (mb * 38); m < me; m = m + 1, ri = ri + ms, ii = ii + ms, W = W + 38, MAKE_VOLATILE_STRIDE(40, rs)) { /* W starts at offset mb*38; every pass advances ri/ii by ms and W by 38 (the body reads W[0]..W[37], i.e. 19 pairs) */
Chris@10 565 E Tj, T1R, T4g, T4p, T2q, T37, T3Q, T42, T1r, T1O, T1P, T3i, T3l, T44, T3D;
Chris@10 566 E T3E, T3K, T1V, T1W, T1X, T23, T28, T4r, T2W, T2X, T4c, T33, T34, T35, T2G;
Chris@10 567 E T2L, T2M, TG, T13, T14, T3p, T3s, T43, T3A, T3B, T3J, T1S, T1T, T1U, T2e;
Chris@10 568 E T2j, T4q, T2T, T2U, T4b, T30, T31, T32, T2v, T2A, T2B;
Chris@10 569 { /* loads inputs 0, 10, 5, 15, applies the W pairs to 10, 5, 15, and forms their sums and differences */
Chris@10 570 E T1, T3O, T6, T3N, Tc, T2n, Th, T2o;
Chris@10 571 T1 = ri[0]; /* input 0 is used as loaded; no W entry is applied to it */
Chris@10 572 T3O = ii[0];
Chris@10 573 {
Chris@10 574 E T3, T5, T2, T4;
Chris@10 575 T3 = ri[WS(rs, 10)]; /* WS is defined outside this file; presumably the offset of element k for stride rs -- confirm */
Chris@10 576 T5 = ii[WS(rs, 10)];
Chris@10 577 T2 = W[18]; /* input k (k = 1..19) is combined with the pair W[2k-2], W[2k-1]; here k = 10 */
Chris@10 578 T4 = W[19];
Chris@10 579 T6 = FMA(T2, T3, T4 * T5); /* FMA/FNMS come from the included headers; presumably FMA(a,b,c) = a*b + c and FNMS(a,b,c) = c - a*b -- confirm */
Chris@10 580 T3N = FNMS(T4, T3, T2 * T5);
Chris@10 581 }
Chris@10 582 {
Chris@10 583 E T9, Tb, T8, Ta;
Chris@10 584 T9 = ri[WS(rs, 5)];
Chris@10 585 Tb = ii[WS(rs, 5)];
Chris@10 586 T8 = W[8];
Chris@10 587 Ta = W[9];
Chris@10 588 Tc = FMA(T8, T9, Ta * Tb);
Chris@10 589 T2n = FNMS(Ta, T9, T8 * Tb);
Chris@10 590 }
Chris@10 591 {
Chris@10 592 E Te, Tg, Td, Tf;
Chris@10 593 Te = ri[WS(rs, 15)];
Chris@10 594 Tg = ii[WS(rs, 15)];
Chris@10 595 Td = W[28];
Chris@10 596 Tf = W[29];
Chris@10 597 Th = FMA(Td, Te, Tf * Tg);
Chris@10 598 T2o = FNMS(Tf, Te, Td * Tg);
Chris@10 599 }
Chris@10 600 {
Chris@10 601 E T7, Ti, T4e, T4f;
Chris@10 602 T7 = T1 + T6;
Chris@10 603 Ti = Tc + Th;
Chris@10 604 Tj = T7 - Ti;
Chris@10 605 T1R = T7 + Ti;
Chris@10 606 T4e = T3O - T3N;
Chris@10 607 T4f = Tc - Th;
Chris@10 608 T4g = T4e - T4f;
Chris@10 609 T4p = T4f + T4e;
Chris@10 610 }
Chris@10 611 {
Chris@10 612 E T2m, T2p, T3M, T3P;
Chris@10 613 T2m = T1 - T6;
Chris@10 614 T2p = T2n - T2o;
Chris@10 615 T2q = T2m - T2p;
Chris@10 616 T37 = T2m + T2p;
Chris@10 617 T3M = T2n + T2o;
Chris@10 618 T3P = T3N + T3O;
Chris@10 619 T3Q = T3M + T3P;
Chris@10 620 T42 = T3P - T3M;
Chris@10 621 }
Chris@10 622 }
Chris@10 623 { /* loads inputs 8, 18, 17, 7, 13, 3, 12, 2, applies their W pairs, and forms the sums and differences used by the store blocks */
Chris@10 624 E T1f, T3g, T21, T2C, T1N, T3k, T27, T2K, T1q, T3h, T22, T2F, T1C, T3j, T26;
Chris@10 625 E T2H;
Chris@10 626 {
Chris@10 627 E T19, T1Z, T1e, T20;
Chris@10 628 {
Chris@10 629 E T16, T18, T15, T17;
Chris@10 630 T16 = ri[WS(rs, 8)];
Chris@10 631 T18 = ii[WS(rs, 8)];
Chris@10 632 T15 = W[14];
Chris@10 633 T17 = W[15];
Chris@10 634 T19 = FMA(T15, T16, T17 * T18);
Chris@10 635 T1Z = FNMS(T17, T16, T15 * T18);
Chris@10 636 }
Chris@10 637 {
Chris@10 638 E T1b, T1d, T1a, T1c;
Chris@10 639 T1b = ri[WS(rs, 18)];
Chris@10 640 T1d = ii[WS(rs, 18)];
Chris@10 641 T1a = W[34];
Chris@10 642 T1c = W[35];
Chris@10 643 T1e = FMA(T1a, T1b, T1c * T1d);
Chris@10 644 T20 = FNMS(T1c, T1b, T1a * T1d);
Chris@10 645 }
Chris@10 646 T1f = T19 + T1e;
Chris@10 647 T3g = T1Z + T20;
Chris@10 648 T21 = T1Z - T20;
Chris@10 649 T2C = T19 - T1e;
Chris@10 650 }
Chris@10 651 {
Chris@10 652 E T1H, T2I, T1M, T2J;
Chris@10 653 {
Chris@10 654 E T1E, T1G, T1D, T1F;
Chris@10 655 T1E = ri[WS(rs, 17)];
Chris@10 656 T1G = ii[WS(rs, 17)];
Chris@10 657 T1D = W[32];
Chris@10 658 T1F = W[33];
Chris@10 659 T1H = FMA(T1D, T1E, T1F * T1G);
Chris@10 660 T2I = FNMS(T1F, T1E, T1D * T1G);
Chris@10 661 }
Chris@10 662 {
Chris@10 663 E T1J, T1L, T1I, T1K;
Chris@10 664 T1J = ri[WS(rs, 7)];
Chris@10 665 T1L = ii[WS(rs, 7)];
Chris@10 666 T1I = W[12];
Chris@10 667 T1K = W[13];
Chris@10 668 T1M = FMA(T1I, T1J, T1K * T1L);
Chris@10 669 T2J = FNMS(T1K, T1J, T1I * T1L);
Chris@10 670 }
Chris@10 671 T1N = T1H + T1M;
Chris@10 672 T3k = T2I + T2J;
Chris@10 673 T27 = T1H - T1M;
Chris@10 674 T2K = T2I - T2J;
Chris@10 675 }
Chris@10 676 {
Chris@10 677 E T1k, T2D, T1p, T2E;
Chris@10 678 {
Chris@10 679 E T1h, T1j, T1g, T1i;
Chris@10 680 T1h = ri[WS(rs, 13)];
Chris@10 681 T1j = ii[WS(rs, 13)];
Chris@10 682 T1g = W[24];
Chris@10 683 T1i = W[25];
Chris@10 684 T1k = FMA(T1g, T1h, T1i * T1j);
Chris@10 685 T2D = FNMS(T1i, T1h, T1g * T1j);
Chris@10 686 }
Chris@10 687 {
Chris@10 688 E T1m, T1o, T1l, T1n;
Chris@10 689 T1m = ri[WS(rs, 3)];
Chris@10 690 T1o = ii[WS(rs, 3)];
Chris@10 691 T1l = W[4];
Chris@10 692 T1n = W[5];
Chris@10 693 T1p = FMA(T1l, T1m, T1n * T1o);
Chris@10 694 T2E = FNMS(T1n, T1m, T1l * T1o);
Chris@10 695 }
Chris@10 696 T1q = T1k + T1p;
Chris@10 697 T3h = T2D + T2E;
Chris@10 698 T22 = T1k - T1p;
Chris@10 699 T2F = T2D - T2E;
Chris@10 700 }
Chris@10 701 {
Chris@10 702 E T1w, T24, T1B, T25;
Chris@10 703 {
Chris@10 704 E T1t, T1v, T1s, T1u;
Chris@10 705 T1t = ri[WS(rs, 12)];
Chris@10 706 T1v = ii[WS(rs, 12)];
Chris@10 707 T1s = W[22];
Chris@10 708 T1u = W[23];
Chris@10 709 T1w = FMA(T1s, T1t, T1u * T1v);
Chris@10 710 T24 = FNMS(T1u, T1t, T1s * T1v);
Chris@10 711 }
Chris@10 712 {
Chris@10 713 E T1y, T1A, T1x, T1z;
Chris@10 714 T1y = ri[WS(rs, 2)];
Chris@10 715 T1A = ii[WS(rs, 2)];
Chris@10 716 T1x = W[2];
Chris@10 717 T1z = W[3];
Chris@10 718 T1B = FMA(T1x, T1y, T1z * T1A);
Chris@10 719 T25 = FNMS(T1z, T1y, T1x * T1A);
Chris@10 720 }
Chris@10 721 T1C = T1w + T1B;
Chris@10 722 T3j = T24 + T25;
Chris@10 723 T26 = T24 - T25;
Chris@10 724 T2H = T1w - T1B;
Chris@10 725 }
Chris@10 726 T1r = T1f - T1q;
Chris@10 727 T1O = T1C - T1N;
Chris@10 728 T1P = T1r + T1O;
Chris@10 729 T3i = T3g - T3h;
Chris@10 730 T3l = T3j - T3k;
Chris@10 731 T44 = T3i + T3l;
Chris@10 732 T3D = T3g + T3h;
Chris@10 733 T3E = T3j + T3k;
Chris@10 734 T3K = T3D + T3E;
Chris@10 735 T1V = T1f + T1q;
Chris@10 736 T1W = T1C + T1N;
Chris@10 737 T1X = T1V + T1W;
Chris@10 738 T23 = T21 + T22;
Chris@10 739 T28 = T26 + T27;
Chris@10 740 T4r = T23 + T28;
Chris@10 741 T2W = T21 - T22;
Chris@10 742 T2X = T26 - T27;
Chris@10 743 T4c = T2W + T2X;
Chris@10 744 T33 = T2C + T2F;
Chris@10 745 T34 = T2H + T2K;
Chris@10 746 T35 = T33 + T34;
Chris@10 747 T2G = T2C - T2F;
Chris@10 748 T2L = T2H - T2K;
Chris@10 749 T2M = T2G + T2L;
Chris@10 750 }
Chris@10 751 { /* loads inputs 4, 14, 1, 11, 9, 19, 16, 6, applies their W pairs, and forms the sums and differences used by the store blocks */
Chris@10 752 E Tu, T3n, T2c, T2r, T12, T3r, T2i, T2z, TF, T3o, T2d, T2u, TR, T3q, T2h;
Chris@10 753 E T2w;
Chris@10 754 {
Chris@10 755 E To, T2a, Tt, T2b;
Chris@10 756 {
Chris@10 757 E Tl, Tn, Tk, Tm;
Chris@10 758 Tl = ri[WS(rs, 4)];
Chris@10 759 Tn = ii[WS(rs, 4)];
Chris@10 760 Tk = W[6];
Chris@10 761 Tm = W[7];
Chris@10 762 To = FMA(Tk, Tl, Tm * Tn);
Chris@10 763 T2a = FNMS(Tm, Tl, Tk * Tn);
Chris@10 764 }
Chris@10 765 {
Chris@10 766 E Tq, Ts, Tp, Tr;
Chris@10 767 Tq = ri[WS(rs, 14)];
Chris@10 768 Ts = ii[WS(rs, 14)];
Chris@10 769 Tp = W[26];
Chris@10 770 Tr = W[27];
Chris@10 771 Tt = FMA(Tp, Tq, Tr * Ts);
Chris@10 772 T2b = FNMS(Tr, Tq, Tp * Ts);
Chris@10 773 }
Chris@10 774 Tu = To + Tt;
Chris@10 775 T3n = T2a + T2b;
Chris@10 776 T2c = T2a - T2b;
Chris@10 777 T2r = To - Tt;
Chris@10 778 }
Chris@10 779 {
Chris@10 780 E TW, T2x, T11, T2y;
Chris@10 781 {
Chris@10 782 E TT, TV, TS, TU;
Chris@10 783 TT = ri[WS(rs, 1)];
Chris@10 784 TV = ii[WS(rs, 1)];
Chris@10 785 TS = W[0];
Chris@10 786 TU = W[1];
Chris@10 787 TW = FMA(TS, TT, TU * TV);
Chris@10 788 T2x = FNMS(TU, TT, TS * TV);
Chris@10 789 }
Chris@10 790 {
Chris@10 791 E TY, T10, TX, TZ;
Chris@10 792 TY = ri[WS(rs, 11)];
Chris@10 793 T10 = ii[WS(rs, 11)];
Chris@10 794 TX = W[20];
Chris@10 795 TZ = W[21];
Chris@10 796 T11 = FMA(TX, TY, TZ * T10);
Chris@10 797 T2y = FNMS(TZ, TY, TX * T10);
Chris@10 798 }
Chris@10 799 T12 = TW + T11;
Chris@10 800 T3r = T2x + T2y;
Chris@10 801 T2i = TW - T11;
Chris@10 802 T2z = T2x - T2y;
Chris@10 803 }
Chris@10 804 {
Chris@10 805 E Tz, T2s, TE, T2t;
Chris@10 806 {
Chris@10 807 E Tw, Ty, Tv, Tx;
Chris@10 808 Tw = ri[WS(rs, 9)];
Chris@10 809 Ty = ii[WS(rs, 9)];
Chris@10 810 Tv = W[16];
Chris@10 811 Tx = W[17];
Chris@10 812 Tz = FMA(Tv, Tw, Tx * Ty);
Chris@10 813 T2s = FNMS(Tx, Tw, Tv * Ty);
Chris@10 814 }
Chris@10 815 {
Chris@10 816 E TB, TD, TA, TC;
Chris@10 817 TB = ri[WS(rs, 19)];
Chris@10 818 TD = ii[WS(rs, 19)];
Chris@10 819 TA = W[36];
Chris@10 820 TC = W[37];
Chris@10 821 TE = FMA(TA, TB, TC * TD);
Chris@10 822 T2t = FNMS(TC, TB, TA * TD);
Chris@10 823 }
Chris@10 824 TF = Tz + TE;
Chris@10 825 T3o = T2s + T2t;
Chris@10 826 T2d = Tz - TE;
Chris@10 827 T2u = T2s - T2t;
Chris@10 828 }
Chris@10 829 {
Chris@10 830 E TL, T2f, TQ, T2g;
Chris@10 831 {
Chris@10 832 E TI, TK, TH, TJ;
Chris@10 833 TI = ri[WS(rs, 16)];
Chris@10 834 TK = ii[WS(rs, 16)];
Chris@10 835 TH = W[30];
Chris@10 836 TJ = W[31];
Chris@10 837 TL = FMA(TH, TI, TJ * TK);
Chris@10 838 T2f = FNMS(TJ, TI, TH * TK);
Chris@10 839 }
Chris@10 840 {
Chris@10 841 E TN, TP, TM, TO;
Chris@10 842 TN = ri[WS(rs, 6)];
Chris@10 843 TP = ii[WS(rs, 6)];
Chris@10 844 TM = W[10];
Chris@10 845 TO = W[11];
Chris@10 846 TQ = FMA(TM, TN, TO * TP);
Chris@10 847 T2g = FNMS(TO, TN, TM * TP);
Chris@10 848 }
Chris@10 849 TR = TL + TQ;
Chris@10 850 T3q = T2f + T2g;
Chris@10 851 T2h = T2f - T2g;
Chris@10 852 T2w = TL - TQ;
Chris@10 853 }
Chris@10 854 TG = Tu - TF;
Chris@10 855 T13 = TR - T12;
Chris@10 856 T14 = TG + T13;
Chris@10 857 T3p = T3n - T3o;
Chris@10 858 T3s = T3q - T3r;
Chris@10 859 T43 = T3p + T3s;
Chris@10 860 T3A = T3n + T3o;
Chris@10 861 T3B = T3q + T3r;
Chris@10 862 T3J = T3A + T3B;
Chris@10 863 T1S = Tu + TF;
Chris@10 864 T1T = TR + T12;
Chris@10 865 T1U = T1S + T1T;
Chris@10 866 T2e = T2c + T2d;
Chris@10 867 T2j = T2h + T2i;
Chris@10 868 T4q = T2e + T2j;
Chris@10 869 T2T = T2c - T2d;
Chris@10 870 T2U = T2h - T2i;
Chris@10 871 T4b = T2T + T2U;
Chris@10 872 T30 = T2r + T2u;
Chris@10 873 T31 = T2w + T2z;
Chris@10 874 T32 = T30 + T31;
Chris@10 875 T2v = T2r - T2u;
Chris@10 876 T2A = T2w - T2z;
Chris@10 877 T2B = T2v + T2A;
Chris@10 878 }
Chris@10 879 { /* stores ri[] for outputs 10, 14, 6, 2, 18 */
Chris@10 880 E T3e, T1Q, T3d, T3u, T3w, T3m, T3t, T3v, T3f;
Chris@10 881 T3e = KP559016994 * (T14 - T1P);
Chris@10 882 T1Q = T14 + T1P;
Chris@10 883 T3d = FNMS(KP250000000, T1Q, Tj);
Chris@10 884 T3m = T3i - T3l;
Chris@10 885 T3t = T3p - T3s;
Chris@10 886 T3u = FNMS(KP587785252, T3t, KP951056516 * T3m);
Chris@10 887 T3w = FMA(KP951056516, T3t, KP587785252 * T3m);
Chris@10 888 ri[WS(rs, 10)] = Tj + T1Q; /* first store of the pass; every ri/ii load appears earlier in the body, so overwriting the input slots from here on is safe */
Chris@10 889 T3v = T3e + T3d;
Chris@10 890 ri[WS(rs, 14)] = T3v - T3w;
Chris@10 891 ri[WS(rs, 6)] = T3v + T3w;
Chris@10 892 T3f = T3d - T3e;
Chris@10 893 ri[WS(rs, 2)] = T3f - T3u;
Chris@10 894 ri[WS(rs, 18)] = T3f + T3u;
Chris@10 895 }
Chris@10 896 { /* stores ii[] for outputs 10, 6, 14, 2, 18 */
Chris@10 897 E T47, T45, T46, T41, T4a, T3Z, T40, T49, T48;
Chris@10 898 T47 = KP559016994 * (T43 - T44);
Chris@10 899 T45 = T43 + T44;
Chris@10 900 T46 = FNMS(KP250000000, T45, T42);
Chris@10 901 T3Z = T1r - T1O;
Chris@10 902 T40 = TG - T13;
Chris@10 903 T41 = FNMS(KP587785252, T40, KP951056516 * T3Z);
Chris@10 904 T4a = FMA(KP951056516, T40, KP587785252 * T3Z);
Chris@10 905 ii[WS(rs, 10)] = T45 + T42;
Chris@10 906 T49 = T47 + T46;
Chris@10 907 ii[WS(rs, 6)] = T49 - T4a;
Chris@10 908 ii[WS(rs, 14)] = T4a + T49;
Chris@10 909 T48 = T46 - T47;
Chris@10 910 ii[WS(rs, 2)] = T41 + T48;
Chris@10 911 ii[WS(rs, 18)] = T48 - T41;
Chris@10 912 }
Chris@10 913 { /* stores ri[] for outputs 0, 12, 8, 4, 16 */
Chris@10 914 E T3x, T1Y, T3y, T3G, T3I, T3C, T3F, T3H, T3z;
Chris@10 915 T3x = KP559016994 * (T1U - T1X);
Chris@10 916 T1Y = T1U + T1X;
Chris@10 917 T3y = FNMS(KP250000000, T1Y, T1R);
Chris@10 918 T3C = T3A - T3B;
Chris@10 919 T3F = T3D - T3E;
Chris@10 920 T3G = FMA(KP951056516, T3C, KP587785252 * T3F);
Chris@10 921 T3I = FNMS(KP587785252, T3C, KP951056516 * T3F);
Chris@10 922 ri[0] = T1R + T1Y;
Chris@10 923 T3H = T3y - T3x;
Chris@10 924 ri[WS(rs, 12)] = T3H - T3I;
Chris@10 925 ri[WS(rs, 8)] = T3H + T3I;
Chris@10 926 T3z = T3x + T3y;
Chris@10 927 ri[WS(rs, 4)] = T3z - T3G;
Chris@10 928 ri[WS(rs, 16)] = T3z + T3G;
Chris@10 929 }
Chris@10 930 { /* stores ii[] for outputs 0, 8, 12, 4, 16 */
Chris@10 931 E T3U, T3L, T3V, T3T, T3Y, T3R, T3S, T3X, T3W;
Chris@10 932 T3U = KP559016994 * (T3J - T3K);
Chris@10 933 T3L = T3J + T3K;
Chris@10 934 T3V = FNMS(KP250000000, T3L, T3Q);
Chris@10 935 T3R = T1S - T1T;
Chris@10 936 T3S = T1V - T1W;
Chris@10 937 T3T = FMA(KP951056516, T3R, KP587785252 * T3S);
Chris@10 938 T3Y = FNMS(KP587785252, T3R, KP951056516 * T3S);
Chris@10 939 ii[0] = T3L + T3Q;
Chris@10 940 T3X = T3V - T3U;
Chris@10 941 ii[WS(rs, 8)] = T3X - T3Y;
Chris@10 942 ii[WS(rs, 12)] = T3Y + T3X;
Chris@10 943 T3W = T3U + T3V;
Chris@10 944 ii[WS(rs, 4)] = T3T + T3W;
Chris@10 945 ii[WS(rs, 16)] = T3W - T3T;
Chris@10 946 }
Chris@10 947 { /* stores ri[] for outputs 15, 11, 19, 3, 7 */
Chris@10 948 E T2P, T2N, T2O, T2l, T2R, T29, T2k, T2S, T2Q;
Chris@10 949 T2P = KP559016994 * (T2B - T2M);
Chris@10 950 T2N = T2B + T2M;
Chris@10 951 T2O = FNMS(KP250000000, T2N, T2q);
Chris@10 952 T29 = T23 - T28;
Chris@10 953 T2k = T2e - T2j;
Chris@10 954 T2l = FNMS(KP587785252, T2k, KP951056516 * T29);
Chris@10 955 T2R = FMA(KP951056516, T2k, KP587785252 * T29);
Chris@10 956 ri[WS(rs, 15)] = T2q + T2N;
Chris@10 957 T2S = T2P + T2O;
Chris@10 958 ri[WS(rs, 11)] = T2R + T2S;
Chris@10 959 ri[WS(rs, 19)] = T2S - T2R;
Chris@10 960 T2Q = T2O - T2P;
Chris@10 961 ri[WS(rs, 3)] = T2l + T2Q;
Chris@10 962 ri[WS(rs, 7)] = T2Q - T2l;
Chris@10 963 }
Chris@10 964 { /* stores ii[] for outputs 15, 11, 19, 3, 7 */
Chris@10 965 E T4u, T4s, T4t, T4y, T4A, T4w, T4x, T4z, T4v;
Chris@10 966 T4u = KP559016994 * (T4q - T4r);
Chris@10 967 T4s = T4q + T4r;
Chris@10 968 T4t = FNMS(KP250000000, T4s, T4p);
Chris@10 969 T4w = T2G - T2L;
Chris@10 970 T4x = T2v - T2A;
Chris@10 971 T4y = FNMS(KP587785252, T4x, KP951056516 * T4w);
Chris@10 972 T4A = FMA(KP951056516, T4x, KP587785252 * T4w);
Chris@10 973 ii[WS(rs, 15)] = T4s + T4p;
Chris@10 974 T4z = T4u + T4t;
Chris@10 975 ii[WS(rs, 11)] = T4z - T4A;
Chris@10 976 ii[WS(rs, 19)] = T4A + T4z;
Chris@10 977 T4v = T4t - T4u;
Chris@10 978 ii[WS(rs, 3)] = T4v - T4y;
Chris@10 979 ii[WS(rs, 7)] = T4y + T4v;
Chris@10 980 }
Chris@10 981 { /* stores ri[] for outputs 5, 13, 17, 1, 9 */
Chris@10 982 E T36, T38, T39, T2Z, T3b, T2V, T2Y, T3c, T3a;
Chris@10 983 T36 = KP559016994 * (T32 - T35);
Chris@10 984 T38 = T32 + T35;
Chris@10 985 T39 = FNMS(KP250000000, T38, T37);
Chris@10 986 T2V = T2T - T2U;
Chris@10 987 T2Y = T2W - T2X;
Chris@10 988 T2Z = FMA(KP951056516, T2V, KP587785252 * T2Y);
Chris@10 989 T3b = FNMS(KP587785252, T2V, KP951056516 * T2Y);
Chris@10 990 ri[WS(rs, 5)] = T37 + T38;
Chris@10 991 T3c = T39 - T36;
Chris@10 992 ri[WS(rs, 13)] = T3b + T3c;
Chris@10 993 ri[WS(rs, 17)] = T3c - T3b;
Chris@10 994 T3a = T36 + T39;
Chris@10 995 ri[WS(rs, 1)] = T2Z + T3a;
Chris@10 996 ri[WS(rs, 9)] = T3a - T2Z;
Chris@10 997 }
Chris@10 998 { /* stores ii[] for outputs 5, 13, 17, 1, 9 */
Chris@10 999 E T4d, T4h, T4i, T4m, T4o, T4k, T4l, T4n, T4j;
Chris@10 1000 T4d = KP559016994 * (T4b - T4c);
Chris@10 1001 T4h = T4b + T4c;
Chris@10 1002 T4i = FNMS(KP250000000, T4h, T4g);
Chris@10 1003 T4k = T30 - T31;
Chris@10 1004 T4l = T33 - T34;
Chris@10 1005 T4m = FMA(KP951056516, T4k, KP587785252 * T4l);
Chris@10 1006 T4o = FNMS(KP587785252, T4k, KP951056516 * T4l);
Chris@10 1007 ii[WS(rs, 5)] = T4h + T4g;
Chris@10 1008 T4n = T4i - T4d;
Chris@10 1009 ii[WS(rs, 13)] = T4n - T4o;
Chris@10 1010 ii[WS(rs, 17)] = T4o + T4n;
Chris@10 1011 T4j = T4d + T4i;
Chris@10 1012 ii[WS(rs, 1)] = T4j - T4m;
Chris@10 1013 ii[WS(rs, 9)] = T4m + T4j;
Chris@10 1014 }
Chris@10 1015 }
Chris@10 1016 }
Chris@10 1017 }
Chris@10 1018
Chris@10 1019 static const tw_instr twinstr[] = { /* twiddle-instruction table; it reaches the planner through desc below. Entry semantics are defined outside this file */
Chris@10 1020 {TW_FULL, 0, 20}, /* NOTE(review): presumably requests the full twiddle set for radix 20 -- confirm against the tw_instr definition */
Chris@10 1021 {TW_NEXT, 1, 0} /* NOTE(review): presumably the end-of-list marker -- confirm */
Chris@10 1022 };
Chris@10 1023
Chris@10 1024 static const ct_desc desc = { 20, "t1_20", twinstr, &GENUS, {184, 62, 62, 0}, 0, 0, 0 }; /* descriptor passed at registration: size 20, codelet name, twiddle table; {184, 62, 62, 0} repeats the add / mul / fused-multiply-add counts quoted in the comment above the non-FMA t1_20. NOTE(review): meaning of the trailing zero fields is not visible here */
Chris@10 1025
Chris@10 1026 void X(codelet_t1_20) (planner *p) { /* public entry point of this file: hands the non-FMA t1_20 kernel and its descriptor to planner p */
Chris@10 1027 X(kdft_dit_register) (p, t1_20, &desc); /* X(...) is a macro from the included headers; presumably it applies the library's symbol prefix -- confirm */
Chris@10 1028 }
Chris@10 1029 #endif /* HAVE_FMA */