annotate src/fftw-3.3.8/dft/scalar/codelets/t1_20.c @ 83:ae30d91d2ffe

Replace these with versions built using an older toolset (so as to avoid ABI incompatibilities when linking on Ubuntu 14.04 for packaging purposes)
author Chris Cannam
date Fri, 07 Feb 2020 11:51:13 +0000
parents d0c2a83c1364
children
rev   line source
Chris@82 1 /*
Chris@82 2 * Copyright (c) 2003, 2007-14 Matteo Frigo
Chris@82 3 * Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
Chris@82 4 *
Chris@82 5 * This program is free software; you can redistribute it and/or modify
Chris@82 6 * it under the terms of the GNU General Public License as published by
Chris@82 7 * the Free Software Foundation; either version 2 of the License, or
Chris@82 8 * (at your option) any later version.
Chris@82 9 *
Chris@82 10 * This program is distributed in the hope that it will be useful,
Chris@82 11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
Chris@82 12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
Chris@82 13 * GNU General Public License for more details.
Chris@82 14 *
Chris@82 15 * You should have received a copy of the GNU General Public License
Chris@82 16 * along with this program; if not, write to the Free Software
Chris@82 17 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
Chris@82 18 *
Chris@82 19 */
Chris@82 20
Chris@82 21 /* This file was automatically generated --- DO NOT EDIT */
Chris@82 22 /* Generated on Thu May 24 08:04:15 EDT 2018 */
Chris@82 23
Chris@82 24 #include "dft/codelet-dft.h"
Chris@82 25
Chris@82 26 #if defined(ARCH_PREFERS_FMA) || defined(ISA_EXTENSION_PREFERS_FMA)
Chris@82 27
Chris@82 28 /* Generated by: ../../../genfft/gen_twiddle.native -fma -compact -variables 4 -pipeline-latency 4 -n 20 -name t1_20 -include dft/scalar/t.h */
Chris@82 29
Chris@82 30 /*
Chris@82 31 * This function contains 246 FP additions, 148 FP multiplications,
Chris@82 32 * (or, 136 additions, 38 multiplications, 110 fused multiply/add),
Chris@82 33 * 61 stack variables, 4 constants, and 80 memory accesses
Chris@82 34 */
Chris@82 35 #include "dft/scalar/t.h"
Chris@82 36
/*
 * t1_20, FMA variant (this branch is compiled when ARCH_PREFERS_FMA or
 * ISA_EXTENSION_PREFERS_FMA is defined).
 *
 * In-place pass over 20 complex points stored as split real (ri) and
 * imaginary (ii) arrays.  Each loop iteration loads the 20 points,
 * multiplies points 1..19 by twiddle values taken from W, combines them
 * and stores 20 results back to the same ri/ii locations.
 *
 * ri, ii : real / imaginary data; point k is addressed as WS(rs, k),
 *          k = 0..19.  Every point is read and then overwritten.
 * W      : twiddle table.  One iteration consumes 38 values: point k
 *          (k = 1..19) uses the pair W[2k-2], W[2k-1]; point 0 is used
 *          unmodified.
 * rs     : stride argument handed to WS() to address the 20 points.
 * mb, me : the loop runs for m = mb .. me-1; W is advanced by mb * 38
 *          before the first iteration.
 * ms     : amount added to ri and ii after each iteration.
 *
 * NOTE(review): E, R, INT, DK, WS, FMA, FNMS and MAKE_VOLATILE_STRIDE are
 * defined outside this file.  Presumably FMA(a, b, c) = a*b + c and
 * FNMS(a, b, c) = c - a*b, in which case each twiddle step computes
 * re' = W[2k-2]*re + W[2k-1]*im and im' = W[2k-2]*im - W[2k-1]*re
 * -- confirm against the FFTW headers.
 */
Chris@82 37 static void t1_20(R *ri, R *ii, const R *W, stride rs, INT mb, INT me, INT ms)
Chris@82 38 {
Chris@82 39 DK(KP951056516, +0.951056516295153572116439333379382143405698634);
Chris@82 40 DK(KP559016994, +0.559016994374947424102293417182819058860154590);
Chris@82 41 DK(KP250000000, +0.250000000000000000000000000000000000000000000);
Chris@82 42 DK(KP618033988, +0.618033988749894848204586834365638117720309180);
Chris@82 43 {
Chris@82 44 INT m;
Chris@82 45 for (m = mb, W = W + (mb * 38); m < me; m = m + 1, ri = ri + ms, ii = ii + ms, W = W + 38, MAKE_VOLATILE_STRIDE(40, rs)) {
Chris@82 46 E T8, T4N, T2i, T4r, Tl, T4O, T2n, T4n, TN, T2b, T40, T4b, T2v, T3v, T3i;
Chris@82 47 E T3F, T27, T2f, T3W, T4f, T2R, T3z, T3a, T3J, T1G, T2e, T3T, T4e, T2K, T3y;
Chris@82 48 E T33, T3I, T1e, T2c, T43, T4c, T2C, T3w, T3p, T3G;
/* Points 0 and 10: point 0 is loaded as is, point 10 is twiddled by W[18], W[19]. */
Chris@82 49 {
Chris@82 50 E T1, T4q, T3, T6, T4, T4o, T2, T7, T4p, T5;
Chris@82 51 T1 = ri[0];
Chris@82 52 T4q = ii[0];
Chris@82 53 T3 = ri[WS(rs, 10)];
Chris@82 54 T6 = ii[WS(rs, 10)];
Chris@82 55 T2 = W[18];
Chris@82 56 T4 = T2 * T3;
Chris@82 57 T4o = T2 * T6;
Chris@82 58 T5 = W[19];
Chris@82 59 T7 = FMA(T5, T6, T4);
Chris@82 60 T4p = FNMS(T5, T3, T4o);
Chris@82 61 T8 = T1 + T7;
Chris@82 62 T4N = T4q - T4p;
Chris@82 63 T2i = T1 - T7;
Chris@82 64 T4r = T4p + T4q;
Chris@82 65 }
/* Points 5 and 15 (twiddles W[8], W[9] and W[28], W[29]). */
Chris@82 66 {
Chris@82 67 E Ta, Td, Tb, T2j, Tg, Tj, Th, T2l, T9, Tf;
Chris@82 68 Ta = ri[WS(rs, 5)];
Chris@82 69 Td = ii[WS(rs, 5)];
Chris@82 70 T9 = W[8];
Chris@82 71 Tb = T9 * Ta;
Chris@82 72 T2j = T9 * Td;
Chris@82 73 Tg = ri[WS(rs, 15)];
Chris@82 74 Tj = ii[WS(rs, 15)];
Chris@82 75 Tf = W[28];
Chris@82 76 Th = Tf * Tg;
Chris@82 77 T2l = Tf * Tj;
Chris@82 78 {
Chris@82 79 E Te, T2k, Tk, T2m, Tc, Ti;
Chris@82 80 Tc = W[9];
Chris@82 81 Te = FMA(Tc, Td, Tb);
Chris@82 82 T2k = FNMS(Tc, Ta, T2j);
Chris@82 83 Ti = W[29];
Chris@82 84 Tk = FMA(Ti, Tj, Th);
Chris@82 85 T2m = FNMS(Ti, Tg, T2l);
Chris@82 86 Tl = Te + Tk;
Chris@82 87 T4O = Te - Tk;
Chris@82 88 T2n = T2k - T2m;
Chris@82 89 T4n = T2k + T2m;
Chris@82 90 }
Chris@82 91 }
/* Points 4, 19, 14 and 9: twiddle each, then form their sums and differences. */
Chris@82 92 {
Chris@82 93 E Ts, T3d, TL, T2t, Ty, T3f, TF, T2r;
Chris@82 94 {
Chris@82 95 E To, Tr, Tp, T3c, Tn, Tq;
Chris@82 96 To = ri[WS(rs, 4)];
Chris@82 97 Tr = ii[WS(rs, 4)];
Chris@82 98 Tn = W[6];
Chris@82 99 Tp = Tn * To;
Chris@82 100 T3c = Tn * Tr;
Chris@82 101 Tq = W[7];
Chris@82 102 Ts = FMA(Tq, Tr, Tp);
Chris@82 103 T3d = FNMS(Tq, To, T3c);
Chris@82 104 }
Chris@82 105 {
Chris@82 106 E TH, TK, TI, T2s, TG, TJ;
Chris@82 107 TH = ri[WS(rs, 19)];
Chris@82 108 TK = ii[WS(rs, 19)];
Chris@82 109 TG = W[36];
Chris@82 110 TI = TG * TH;
Chris@82 111 T2s = TG * TK;
Chris@82 112 TJ = W[37];
Chris@82 113 TL = FMA(TJ, TK, TI);
Chris@82 114 T2t = FNMS(TJ, TH, T2s);
Chris@82 115 }
Chris@82 116 {
Chris@82 117 E Tu, Tx, Tv, T3e, Tt, Tw;
Chris@82 118 Tu = ri[WS(rs, 14)];
Chris@82 119 Tx = ii[WS(rs, 14)];
Chris@82 120 Tt = W[26];
Chris@82 121 Tv = Tt * Tu;
Chris@82 122 T3e = Tt * Tx;
Chris@82 123 Tw = W[27];
Chris@82 124 Ty = FMA(Tw, Tx, Tv);
Chris@82 125 T3f = FNMS(Tw, Tu, T3e);
Chris@82 126 }
Chris@82 127 {
Chris@82 128 E TB, TE, TC, T2q, TA, TD;
Chris@82 129 TB = ri[WS(rs, 9)];
Chris@82 130 TE = ii[WS(rs, 9)];
Chris@82 131 TA = W[16];
Chris@82 132 TC = TA * TB;
Chris@82 133 T2q = TA * TE;
Chris@82 134 TD = W[17];
Chris@82 135 TF = FMA(TD, TE, TC);
Chris@82 136 T2r = FNMS(TD, TB, T2q);
Chris@82 137 }
Chris@82 138 {
Chris@82 139 E Tz, TM, T3Y, T3Z;
Chris@82 140 Tz = Ts + Ty;
Chris@82 141 TM = TF + TL;
Chris@82 142 TN = Tz - TM;
Chris@82 143 T2b = Tz + TM;
Chris@82 144 T3Y = T3d + T3f;
Chris@82 145 T3Z = T2r + T2t;
Chris@82 146 T40 = T3Y - T3Z;
Chris@82 147 T4b = T3Y + T3Z;
Chris@82 148 }
Chris@82 149 {
Chris@82 150 E T2p, T2u, T3g, T3h;
Chris@82 151 T2p = Ts - Ty;
Chris@82 152 T2u = T2r - T2t;
Chris@82 153 T2v = T2p - T2u;
Chris@82 154 T3v = T2p + T2u;
Chris@82 155 T3g = T3d - T3f;
Chris@82 156 T3h = TF - TL;
Chris@82 157 T3i = T3g + T3h;
Chris@82 158 T3F = T3g - T3h;
Chris@82 159 }
Chris@82 160 }
/* Points 12, 7, 2 and 17: twiddle each, then form their sums and differences. */
Chris@82 161 {
Chris@82 162 E T1M, T35, T25, T2P, T1S, T37, T1Z, T2N;
Chris@82 163 {
Chris@82 164 E T1I, T1L, T1J, T34, T1H, T1K;
Chris@82 165 T1I = ri[WS(rs, 12)];
Chris@82 166 T1L = ii[WS(rs, 12)];
Chris@82 167 T1H = W[22];
Chris@82 168 T1J = T1H * T1I;
Chris@82 169 T34 = T1H * T1L;
Chris@82 170 T1K = W[23];
Chris@82 171 T1M = FMA(T1K, T1L, T1J);
Chris@82 172 T35 = FNMS(T1K, T1I, T34);
Chris@82 173 }
Chris@82 174 {
Chris@82 175 E T21, T24, T22, T2O, T20, T23;
Chris@82 176 T21 = ri[WS(rs, 7)];
Chris@82 177 T24 = ii[WS(rs, 7)];
Chris@82 178 T20 = W[12];
Chris@82 179 T22 = T20 * T21;
Chris@82 180 T2O = T20 * T24;
Chris@82 181 T23 = W[13];
Chris@82 182 T25 = FMA(T23, T24, T22);
Chris@82 183 T2P = FNMS(T23, T21, T2O);
Chris@82 184 }
Chris@82 185 {
Chris@82 186 E T1O, T1R, T1P, T36, T1N, T1Q;
Chris@82 187 T1O = ri[WS(rs, 2)];
Chris@82 188 T1R = ii[WS(rs, 2)];
Chris@82 189 T1N = W[2];
Chris@82 190 T1P = T1N * T1O;
Chris@82 191 T36 = T1N * T1R;
Chris@82 192 T1Q = W[3];
Chris@82 193 T1S = FMA(T1Q, T1R, T1P);
Chris@82 194 T37 = FNMS(T1Q, T1O, T36);
Chris@82 195 }
Chris@82 196 {
Chris@82 197 E T1V, T1Y, T1W, T2M, T1U, T1X;
Chris@82 198 T1V = ri[WS(rs, 17)];
Chris@82 199 T1Y = ii[WS(rs, 17)];
Chris@82 200 T1U = W[32];
Chris@82 201 T1W = T1U * T1V;
Chris@82 202 T2M = T1U * T1Y;
Chris@82 203 T1X = W[33];
Chris@82 204 T1Z = FMA(T1X, T1Y, T1W);
Chris@82 205 T2N = FNMS(T1X, T1V, T2M);
Chris@82 206 }
Chris@82 207 {
Chris@82 208 E T1T, T26, T3U, T3V;
Chris@82 209 T1T = T1M + T1S;
Chris@82 210 T26 = T1Z + T25;
Chris@82 211 T27 = T1T - T26;
Chris@82 212 T2f = T1T + T26;
Chris@82 213 T3U = T35 + T37;
Chris@82 214 T3V = T2N + T2P;
Chris@82 215 T3W = T3U - T3V;
Chris@82 216 T4f = T3U + T3V;
Chris@82 217 }
Chris@82 218 {
Chris@82 219 E T2L, T2Q, T38, T39;
Chris@82 220 T2L = T1M - T1S;
Chris@82 221 T2Q = T2N - T2P;
Chris@82 222 T2R = T2L - T2Q;
Chris@82 223 T3z = T2L + T2Q;
Chris@82 224 T38 = T35 - T37;
Chris@82 225 T39 = T1Z - T25;
Chris@82 226 T3a = T38 + T39;
Chris@82 227 T3J = T38 - T39;
Chris@82 228 }
Chris@82 229 }
/* Points 8, 3, 18 and 13: twiddle each, then form their sums and differences. */
Chris@82 230 {
Chris@82 231 E T1l, T2Y, T1E, T2I, T1r, T30, T1y, T2G;
Chris@82 232 {
Chris@82 233 E T1h, T1k, T1i, T2X, T1g, T1j;
Chris@82 234 T1h = ri[WS(rs, 8)];
Chris@82 235 T1k = ii[WS(rs, 8)];
Chris@82 236 T1g = W[14];
Chris@82 237 T1i = T1g * T1h;
Chris@82 238 T2X = T1g * T1k;
Chris@82 239 T1j = W[15];
Chris@82 240 T1l = FMA(T1j, T1k, T1i);
Chris@82 241 T2Y = FNMS(T1j, T1h, T2X);
Chris@82 242 }
Chris@82 243 {
Chris@82 244 E T1A, T1D, T1B, T2H, T1z, T1C;
Chris@82 245 T1A = ri[WS(rs, 3)];
Chris@82 246 T1D = ii[WS(rs, 3)];
Chris@82 247 T1z = W[4];
Chris@82 248 T1B = T1z * T1A;
Chris@82 249 T2H = T1z * T1D;
Chris@82 250 T1C = W[5];
Chris@82 251 T1E = FMA(T1C, T1D, T1B);
Chris@82 252 T2I = FNMS(T1C, T1A, T2H);
Chris@82 253 }
Chris@82 254 {
Chris@82 255 E T1n, T1q, T1o, T2Z, T1m, T1p;
Chris@82 256 T1n = ri[WS(rs, 18)];
Chris@82 257 T1q = ii[WS(rs, 18)];
Chris@82 258 T1m = W[34];
Chris@82 259 T1o = T1m * T1n;
Chris@82 260 T2Z = T1m * T1q;
Chris@82 261 T1p = W[35];
Chris@82 262 T1r = FMA(T1p, T1q, T1o);
Chris@82 263 T30 = FNMS(T1p, T1n, T2Z);
Chris@82 264 }
Chris@82 265 {
Chris@82 266 E T1u, T1x, T1v, T2F, T1t, T1w;
Chris@82 267 T1u = ri[WS(rs, 13)];
Chris@82 268 T1x = ii[WS(rs, 13)];
Chris@82 269 T1t = W[24];
Chris@82 270 T1v = T1t * T1u;
Chris@82 271 T2F = T1t * T1x;
Chris@82 272 T1w = W[25];
Chris@82 273 T1y = FMA(T1w, T1x, T1v);
Chris@82 274 T2G = FNMS(T1w, T1u, T2F);
Chris@82 275 }
Chris@82 276 {
Chris@82 277 E T1s, T1F, T3R, T3S;
Chris@82 278 T1s = T1l + T1r;
Chris@82 279 T1F = T1y + T1E;
Chris@82 280 T1G = T1s - T1F;
Chris@82 281 T2e = T1s + T1F;
Chris@82 282 T3R = T2Y + T30;
Chris@82 283 T3S = T2G + T2I;
Chris@82 284 T3T = T3R - T3S;
Chris@82 285 T4e = T3R + T3S;
Chris@82 286 }
Chris@82 287 {
Chris@82 288 E T2E, T2J, T31, T32;
Chris@82 289 T2E = T1l - T1r;
Chris@82 290 T2J = T2G - T2I;
Chris@82 291 T2K = T2E - T2J;
Chris@82 292 T3y = T2E + T2J;
Chris@82 293 T31 = T2Y - T30;
Chris@82 294 T32 = T1y - T1E;
Chris@82 295 T33 = T31 + T32;
Chris@82 296 T3I = T31 - T32;
Chris@82 297 }
Chris@82 298 }
/* Points 16, 11, 6 and 1: twiddle each, then form their sums and differences. */
Chris@82 299 {
Chris@82 300 E TT, T3k, T1c, T2A, TZ, T3m, T16, T2y;
Chris@82 301 {
Chris@82 302 E TP, TS, TQ, T3j, TO, TR;
Chris@82 303 TP = ri[WS(rs, 16)];
Chris@82 304 TS = ii[WS(rs, 16)];
Chris@82 305 TO = W[30];
Chris@82 306 TQ = TO * TP;
Chris@82 307 T3j = TO * TS;
Chris@82 308 TR = W[31];
Chris@82 309 TT = FMA(TR, TS, TQ);
Chris@82 310 T3k = FNMS(TR, TP, T3j);
Chris@82 311 }
Chris@82 312 {
Chris@82 313 E T18, T1b, T19, T2z, T17, T1a;
Chris@82 314 T18 = ri[WS(rs, 11)];
Chris@82 315 T1b = ii[WS(rs, 11)];
Chris@82 316 T17 = W[20];
Chris@82 317 T19 = T17 * T18;
Chris@82 318 T2z = T17 * T1b;
Chris@82 319 T1a = W[21];
Chris@82 320 T1c = FMA(T1a, T1b, T19);
Chris@82 321 T2A = FNMS(T1a, T18, T2z);
Chris@82 322 }
Chris@82 323 {
Chris@82 324 E TV, TY, TW, T3l, TU, TX;
Chris@82 325 TV = ri[WS(rs, 6)];
Chris@82 326 TY = ii[WS(rs, 6)];
Chris@82 327 TU = W[10];
Chris@82 328 TW = TU * TV;
Chris@82 329 T3l = TU * TY;
Chris@82 330 TX = W[11];
Chris@82 331 TZ = FMA(TX, TY, TW);
Chris@82 332 T3m = FNMS(TX, TV, T3l);
Chris@82 333 }
Chris@82 334 {
Chris@82 335 E T12, T15, T13, T2x, T11, T14;
Chris@82 336 T12 = ri[WS(rs, 1)];
Chris@82 337 T15 = ii[WS(rs, 1)];
Chris@82 338 T11 = W[0];
Chris@82 339 T13 = T11 * T12;
Chris@82 340 T2x = T11 * T15;
Chris@82 341 T14 = W[1];
Chris@82 342 T16 = FMA(T14, T15, T13);
Chris@82 343 T2y = FNMS(T14, T12, T2x);
Chris@82 344 }
Chris@82 345 {
Chris@82 346 E T10, T1d, T41, T42;
Chris@82 347 T10 = TT + TZ;
Chris@82 348 T1d = T16 + T1c;
Chris@82 349 T1e = T10 - T1d;
Chris@82 350 T2c = T10 + T1d;
Chris@82 351 T41 = T3k + T3m;
Chris@82 352 T42 = T2y + T2A;
Chris@82 353 T43 = T41 - T42;
Chris@82 354 T4c = T41 + T42;
Chris@82 355 }
Chris@82 356 {
Chris@82 357 E T2w, T2B, T3n, T3o;
Chris@82 358 T2w = TT - TZ;
Chris@82 359 T2B = T2y - T2A;
Chris@82 360 T2C = T2w - T2B;
Chris@82 361 T3w = T2w + T2B;
Chris@82 362 T3n = T3k - T3m;
Chris@82 363 T3o = T16 - T1c;
Chris@82 364 T3p = T3n + T3o;
Chris@82 365 T3G = T3n - T3o;
Chris@82 366 }
Chris@82 367 }
/* Store real outputs 10, 14, 6, 2 and 18. */
Chris@82 368 {
Chris@82 369 E T45, T47, Tm, T29, T3O, T3P, T46, T3Q;
Chris@82 370 {
Chris@82 371 E T3X, T44, T1f, T28;
Chris@82 372 T3X = T3T - T3W;
Chris@82 373 T44 = T40 - T43;
Chris@82 374 T45 = FNMS(KP618033988, T44, T3X);
Chris@82 375 T47 = FMA(KP618033988, T3X, T44);
Chris@82 376 Tm = T8 - Tl;
Chris@82 377 T1f = TN + T1e;
Chris@82 378 T28 = T1G + T27;
Chris@82 379 T29 = T1f + T28;
Chris@82 380 T3O = FNMS(KP250000000, T29, Tm);
Chris@82 381 T3P = T1f - T28;
Chris@82 382 }
Chris@82 383 ri[WS(rs, 10)] = Tm + T29;
Chris@82 384 T46 = FMA(KP559016994, T3P, T3O);
Chris@82 385 ri[WS(rs, 14)] = FNMS(KP951056516, T47, T46);
Chris@82 386 ri[WS(rs, 6)] = FMA(KP951056516, T47, T46);
Chris@82 387 T3Q = FNMS(KP559016994, T3P, T3O);
Chris@82 388 ri[WS(rs, 2)] = FNMS(KP951056516, T45, T3Q);
Chris@82 389 ri[WS(rs, 18)] = FMA(KP951056516, T45, T3Q);
Chris@82 390 }
/* Store imaginary outputs 10, 6, 14, 2 and 18. */
Chris@82 391 {
Chris@82 392 E T4K, T4M, T4B, T4E, T4F, T4G, T4L, T4H;
Chris@82 393 {
Chris@82 394 E T4I, T4J, T4C, T4D;
Chris@82 395 T4I = T1G - T27;
Chris@82 396 T4J = TN - T1e;
Chris@82 397 T4K = FNMS(KP618033988, T4J, T4I);
Chris@82 398 T4M = FMA(KP618033988, T4I, T4J);
Chris@82 399 T4B = T4r - T4n;
Chris@82 400 T4C = T40 + T43;
Chris@82 401 T4D = T3T + T3W;
Chris@82 402 T4E = T4C + T4D;
Chris@82 403 T4F = FNMS(KP250000000, T4E, T4B);
Chris@82 404 T4G = T4C - T4D;
Chris@82 405 }
Chris@82 406 ii[WS(rs, 10)] = T4E + T4B;
Chris@82 407 T4L = FMA(KP559016994, T4G, T4F);
Chris@82 408 ii[WS(rs, 6)] = FNMS(KP951056516, T4M, T4L);
Chris@82 409 ii[WS(rs, 14)] = FMA(KP951056516, T4M, T4L);
Chris@82 410 T4H = FNMS(KP559016994, T4G, T4F);
Chris@82 411 ii[WS(rs, 2)] = FMA(KP951056516, T4K, T4H);
Chris@82 412 ii[WS(rs, 18)] = FNMS(KP951056516, T4K, T4H);
Chris@82 413 }
/* Store real outputs 0, 12, 8, 4 and 16. */
Chris@82 414 {
Chris@82 415 E T4h, T4j, T2a, T2h, T48, T49, T4i, T4a;
Chris@82 416 {
Chris@82 417 E T4d, T4g, T2d, T2g;
Chris@82 418 T4d = T4b - T4c;
Chris@82 419 T4g = T4e - T4f;
Chris@82 420 T4h = FMA(KP618033988, T4g, T4d);
Chris@82 421 T4j = FNMS(KP618033988, T4d, T4g);
Chris@82 422 T2a = T8 + Tl;
Chris@82 423 T2d = T2b + T2c;
Chris@82 424 T2g = T2e + T2f;
Chris@82 425 T2h = T2d + T2g;
Chris@82 426 T48 = FNMS(KP250000000, T2h, T2a);
Chris@82 427 T49 = T2d - T2g;
Chris@82 428 }
Chris@82 429 ri[0] = T2a + T2h;
Chris@82 430 T4i = FNMS(KP559016994, T49, T48);
Chris@82 431 ri[WS(rs, 12)] = FNMS(KP951056516, T4j, T4i);
Chris@82 432 ri[WS(rs, 8)] = FMA(KP951056516, T4j, T4i);
Chris@82 433 T4a = FMA(KP559016994, T49, T48);
Chris@82 434 ri[WS(rs, 4)] = FNMS(KP951056516, T4h, T4a);
Chris@82 435 ri[WS(rs, 16)] = FMA(KP951056516, T4h, T4a);
Chris@82 436 }
/* Store imaginary outputs 0, 8, 12, 4 and 16. */
Chris@82 437 {
Chris@82 438 E T4y, T4A, T4s, T4m, T4t, T4u, T4z, T4v;
Chris@82 439 {
Chris@82 440 E T4w, T4x, T4k, T4l;
Chris@82 441 T4w = T2b - T2c;
Chris@82 442 T4x = T2e - T2f;
Chris@82 443 T4y = FMA(KP618033988, T4x, T4w);
Chris@82 444 T4A = FNMS(KP618033988, T4w, T4x);
Chris@82 445 T4s = T4n + T4r;
Chris@82 446 T4k = T4b + T4c;
Chris@82 447 T4l = T4e + T4f;
Chris@82 448 T4m = T4k + T4l;
Chris@82 449 T4t = FNMS(KP250000000, T4m, T4s);
Chris@82 450 T4u = T4k - T4l;
Chris@82 451 }
Chris@82 452 ii[0] = T4m + T4s;
Chris@82 453 T4z = FNMS(KP559016994, T4u, T4t);
Chris@82 454 ii[WS(rs, 8)] = FNMS(KP951056516, T4A, T4z);
Chris@82 455 ii[WS(rs, 12)] = FMA(KP951056516, T4A, T4z);
Chris@82 456 T4v = FMA(KP559016994, T4u, T4t);
Chris@82 457 ii[WS(rs, 4)] = FMA(KP951056516, T4y, T4v);
Chris@82 458 ii[WS(rs, 16)] = FNMS(KP951056516, T4y, T4v);
Chris@82 459 }
/* Store real outputs 15, 11, 19, 3 and 7. */
Chris@82 460 {
Chris@82 461 E T3r, T3t, T2o, T2T, T2U, T2V, T3s, T2W;
Chris@82 462 {
Chris@82 463 E T3b, T3q, T2D, T2S;
Chris@82 464 T3b = T33 - T3a;
Chris@82 465 T3q = T3i - T3p;
Chris@82 466 T3r = FNMS(KP618033988, T3q, T3b);
Chris@82 467 T3t = FMA(KP618033988, T3b, T3q);
Chris@82 468 T2o = T2i - T2n;
Chris@82 469 T2D = T2v + T2C;
Chris@82 470 T2S = T2K + T2R;
Chris@82 471 T2T = T2D + T2S;
Chris@82 472 T2U = FNMS(KP250000000, T2T, T2o);
Chris@82 473 T2V = T2D - T2S;
Chris@82 474 }
Chris@82 475 ri[WS(rs, 15)] = T2o + T2T;
Chris@82 476 T3s = FMA(KP559016994, T2V, T2U);
Chris@82 477 ri[WS(rs, 11)] = FMA(KP951056516, T3t, T3s);
Chris@82 478 ri[WS(rs, 19)] = FNMS(KP951056516, T3t, T3s);
Chris@82 479 T2W = FNMS(KP559016994, T2V, T2U);
Chris@82 480 ri[WS(rs, 3)] = FMA(KP951056516, T3r, T2W);
Chris@82 481 ri[WS(rs, 7)] = FNMS(KP951056516, T3r, T2W);
Chris@82 482 }
/* Store imaginary outputs 15, 11, 19, 3 and 7. */
Chris@82 483 {
Chris@82 484 E T5a, T5c, T51, T54, T55, T56, T5b, T57;
Chris@82 485 {
Chris@82 486 E T58, T59, T52, T53;
Chris@82 487 T58 = T2K - T2R;
Chris@82 488 T59 = T2v - T2C;
Chris@82 489 T5a = FNMS(KP618033988, T59, T58);
Chris@82 490 T5c = FMA(KP618033988, T58, T59);
Chris@82 491 T51 = T4O + T4N;
Chris@82 492 T52 = T3i + T3p;
Chris@82 493 T53 = T33 + T3a;
Chris@82 494 T54 = T52 + T53;
Chris@82 495 T55 = FNMS(KP250000000, T54, T51);
Chris@82 496 T56 = T52 - T53;
Chris@82 497 }
Chris@82 498 ii[WS(rs, 15)] = T54 + T51;
Chris@82 499 T5b = FMA(KP559016994, T56, T55);
Chris@82 500 ii[WS(rs, 11)] = FNMS(KP951056516, T5c, T5b);
Chris@82 501 ii[WS(rs, 19)] = FMA(KP951056516, T5c, T5b);
Chris@82 502 T57 = FNMS(KP559016994, T56, T55);
Chris@82 503 ii[WS(rs, 3)] = FNMS(KP951056516, T5a, T57);
Chris@82 504 ii[WS(rs, 7)] = FMA(KP951056516, T5a, T57);
Chris@82 505 }
/* Store real outputs 5, 13, 17, 1 and 9. */
Chris@82 506 {
Chris@82 507 E T3L, T3N, T3u, T3B, T3C, T3D, T3M, T3E;
Chris@82 508 {
Chris@82 509 E T3H, T3K, T3x, T3A;
Chris@82 510 T3H = T3F - T3G;
Chris@82 511 T3K = T3I - T3J;
Chris@82 512 T3L = FMA(KP618033988, T3K, T3H);
Chris@82 513 T3N = FNMS(KP618033988, T3H, T3K);
Chris@82 514 T3u = T2i + T2n;
Chris@82 515 T3x = T3v + T3w;
Chris@82 516 T3A = T3y + T3z;
Chris@82 517 T3B = T3x + T3A;
Chris@82 518 T3C = FNMS(KP250000000, T3B, T3u);
Chris@82 519 T3D = T3x - T3A;
Chris@82 520 }
Chris@82 521 ri[WS(rs, 5)] = T3u + T3B;
Chris@82 522 T3M = FNMS(KP559016994, T3D, T3C);
Chris@82 523 ri[WS(rs, 13)] = FMA(KP951056516, T3N, T3M);
Chris@82 524 ri[WS(rs, 17)] = FNMS(KP951056516, T3N, T3M);
Chris@82 525 T3E = FMA(KP559016994, T3D, T3C);
Chris@82 526 ri[WS(rs, 1)] = FMA(KP951056516, T3L, T3E);
Chris@82 527 ri[WS(rs, 9)] = FNMS(KP951056516, T3L, T3E);
Chris@82 528 }
/* Store imaginary outputs 5, 13, 17, 1 and 9. */
Chris@82 529 {
Chris@82 530 E T4Y, T50, T4P, T4S, T4T, T4U, T4Z, T4V;
Chris@82 531 {
Chris@82 532 E T4W, T4X, T4Q, T4R;
Chris@82 533 T4W = T3v - T3w;
Chris@82 534 T4X = T3y - T3z;
Chris@82 535 T4Y = FMA(KP618033988, T4X, T4W);
Chris@82 536 T50 = FNMS(KP618033988, T4W, T4X);
Chris@82 537 T4P = T4N - T4O;
Chris@82 538 T4Q = T3F + T3G;
Chris@82 539 T4R = T3I + T3J;
Chris@82 540 T4S = T4Q + T4R;
Chris@82 541 T4T = FNMS(KP250000000, T4S, T4P);
Chris@82 542 T4U = T4Q - T4R;
Chris@82 543 }
Chris@82 544 ii[WS(rs, 5)] = T4S + T4P;
Chris@82 545 T4Z = FNMS(KP559016994, T4U, T4T);
Chris@82 546 ii[WS(rs, 13)] = FNMS(KP951056516, T50, T4Z);
Chris@82 547 ii[WS(rs, 17)] = FMA(KP951056516, T50, T4Z);
Chris@82 548 T4V = FMA(KP559016994, T4U, T4T);
Chris@82 549 ii[WS(rs, 1)] = FNMS(KP951056516, T4Y, T4V);
Chris@82 550 ii[WS(rs, 9)] = FMA(KP951056516, T4Y, T4V);
Chris@82 551 }
Chris@82 552 }
Chris@82 553 }
Chris@82 554 }
Chris@82 555
/*
 * Twiddle instruction table for this codelet; it is referenced from the
 * `desc` descriptor.
 * NOTE(review): TW_FULL and TW_NEXT are defined outside this file;
 * presumably {TW_FULL, 0, 20} requests the full twiddle set for size 20
 * and {TW_NEXT, 1, 0} terminates the list -- confirm against the FFTW
 * headers.
 */
Chris@82 556 static const tw_instr twinstr[] = {
Chris@82 557 {TW_FULL, 0, 20},
Chris@82 558 {TW_NEXT, 1, 0}
Chris@82 559 };
Chris@82 560
/*
 * Codelet descriptor for the FMA variant: size 20, name "t1_20", the
 * twinstr table and &GENUS.  The tuple {136, 38, 110, 0} matches the
 * generator's operation-count comment for this variant (136 additions,
 * 38 multiplications, 110 fused multiply/add).
 * NOTE(review): the meaning of the three trailing zeros is defined by
 * ct_desc, which is declared outside this file.
 */
Chris@82 561 static const ct_desc desc = { 20, "t1_20", twinstr, &GENUS, {136, 38, 110, 0}, 0, 0, 0 };
Chris@82 562
/*
 * Registration entry point: hands the t1_20 kernel and its descriptor to
 * planner `p` through X(kdft_dit_register).
 * NOTE(review): X() is a macro defined outside this file, presumably a
 * symbol-prefixing macro -- confirm.
 */
Chris@82 563 void X(codelet_t1_20) (planner *p) {
Chris@82 564 X(kdft_dit_register) (p, t1_20, &desc);
Chris@82 565 }
Chris@82 566 #else
Chris@82 567
Chris@82 568 /* Generated by: ../../../genfft/gen_twiddle.native -compact -variables 4 -pipeline-latency 4 -n 20 -name t1_20 -include dft/scalar/t.h */
Chris@82 569
Chris@82 570 /*
Chris@82 571 * This function contains 246 FP additions, 124 FP multiplications,
Chris@82 572 * (or, 184 additions, 62 multiplications, 62 fused multiply/add),
Chris@82 573 * 85 stack variables, 4 constants, and 80 memory accesses
Chris@82 574 */
Chris@82 575 #include "dft/scalar/t.h"
Chris@82 576
/*
 * t1_20, non-FMA variant (this branch is compiled when neither
 * ARCH_PREFERS_FMA nor ISA_EXTENSION_PREFERS_FMA is defined).
 *
 * In-place pass over 20 complex points stored as split real (ri) and
 * imaginary (ii) arrays.  Each loop iteration loads the 20 points,
 * multiplies points 1..19 by twiddle values taken from W, combines them
 * and stores 20 results back to the same ri/ii locations.
 *
 * ri, ii : real / imaginary data; point k is addressed as WS(rs, k),
 *          k = 0..19.  Every point is read and then overwritten.
 * W      : twiddle table.  One iteration consumes 38 values: point k
 *          (k = 1..19) uses the pair W[2k-2], W[2k-1]; point 0 is used
 *          unmodified.
 * rs     : stride argument handed to WS() to address the 20 points.
 * mb, me : the loop runs for m = mb .. me-1; W is advanced by mb * 38
 *          before the first iteration.
 * ms     : amount added to ri and ii after each iteration.
 *
 * NOTE(review): E, R, INT, DK, WS, FMA, FNMS and MAKE_VOLATILE_STRIDE are
 * defined outside this file.  Presumably FMA(a, b, c) = a*b + c and
 * FNMS(a, b, c) = c - a*b, in which case each twiddle step computes
 * re' = W[2k-2]*re + W[2k-1]*im and im' = W[2k-2]*im - W[2k-1]*re
 * -- confirm against the FFTW headers.
 */
Chris@82 577 static void t1_20(R *ri, R *ii, const R *W, stride rs, INT mb, INT me, INT ms)
Chris@82 578 {
Chris@82 579 DK(KP587785252, +0.587785252292473129168705954639072768597652438);
Chris@82 580 DK(KP951056516, +0.951056516295153572116439333379382143405698634);
Chris@82 581 DK(KP250000000, +0.250000000000000000000000000000000000000000000);
Chris@82 582 DK(KP559016994, +0.559016994374947424102293417182819058860154590);
Chris@82 583 {
Chris@82 584 INT m;
Chris@82 585 for (m = mb, W = W + (mb * 38); m < me; m = m + 1, ri = ri + ms, ii = ii + ms, W = W + 38, MAKE_VOLATILE_STRIDE(40, rs)) {
Chris@82 586 E Tj, T1R, T4g, T4p, T2q, T37, T3Q, T42, T1r, T1O, T1P, T3i, T3l, T44, T3D;
Chris@82 587 E T3E, T3K, T1V, T1W, T1X, T23, T28, T4r, T2W, T2X, T4c, T33, T34, T35, T2G;
Chris@82 588 E T2L, T2M, TG, T13, T14, T3p, T3s, T43, T3A, T3B, T3J, T1S, T1T, T1U, T2e;
Chris@82 589 E T2j, T4q, T2T, T2U, T4b, T30, T31, T32, T2v, T2A, T2B;
/* Points 0, 10, 5 and 15: point 0 is loaded as is, the others are twiddled. */
Chris@82 590 {
Chris@82 591 E T1, T3O, T6, T3N, Tc, T2n, Th, T2o;
Chris@82 592 T1 = ri[0];
Chris@82 593 T3O = ii[0];
Chris@82 594 {
Chris@82 595 E T3, T5, T2, T4;
Chris@82 596 T3 = ri[WS(rs, 10)];
Chris@82 597 T5 = ii[WS(rs, 10)];
Chris@82 598 T2 = W[18];
Chris@82 599 T4 = W[19];
Chris@82 600 T6 = FMA(T2, T3, T4 * T5);
Chris@82 601 T3N = FNMS(T4, T3, T2 * T5);
Chris@82 602 }
Chris@82 603 {
Chris@82 604 E T9, Tb, T8, Ta;
Chris@82 605 T9 = ri[WS(rs, 5)];
Chris@82 606 Tb = ii[WS(rs, 5)];
Chris@82 607 T8 = W[8];
Chris@82 608 Ta = W[9];
Chris@82 609 Tc = FMA(T8, T9, Ta * Tb);
Chris@82 610 T2n = FNMS(Ta, T9, T8 * Tb);
Chris@82 611 }
Chris@82 612 {
Chris@82 613 E Te, Tg, Td, Tf;
Chris@82 614 Te = ri[WS(rs, 15)];
Chris@82 615 Tg = ii[WS(rs, 15)];
Chris@82 616 Td = W[28];
Chris@82 617 Tf = W[29];
Chris@82 618 Th = FMA(Td, Te, Tf * Tg);
Chris@82 619 T2o = FNMS(Tf, Te, Td * Tg);
Chris@82 620 }
Chris@82 621 {
Chris@82 622 E T7, Ti, T4e, T4f;
Chris@82 623 T7 = T1 + T6;
Chris@82 624 Ti = Tc + Th;
Chris@82 625 Tj = T7 - Ti;
Chris@82 626 T1R = T7 + Ti;
Chris@82 627 T4e = T3O - T3N;
Chris@82 628 T4f = Tc - Th;
Chris@82 629 T4g = T4e - T4f;
Chris@82 630 T4p = T4f + T4e;
Chris@82 631 }
Chris@82 632 {
Chris@82 633 E T2m, T2p, T3M, T3P;
Chris@82 634 T2m = T1 - T6;
Chris@82 635 T2p = T2n - T2o;
Chris@82 636 T2q = T2m - T2p;
Chris@82 637 T37 = T2m + T2p;
Chris@82 638 T3M = T2n + T2o;
Chris@82 639 T3P = T3N + T3O;
Chris@82 640 T3Q = T3M + T3P;
Chris@82 641 T42 = T3P - T3M;
Chris@82 642 }
Chris@82 643 }
/* Points 8, 18, 17, 7, 13, 3, 12 and 2: twiddle each, then form sums and differences. */
Chris@82 644 {
Chris@82 645 E T1f, T3g, T21, T2C, T1N, T3k, T27, T2K, T1q, T3h, T22, T2F, T1C, T3j, T26;
Chris@82 646 E T2H;
Chris@82 647 {
Chris@82 648 E T19, T1Z, T1e, T20;
Chris@82 649 {
Chris@82 650 E T16, T18, T15, T17;
Chris@82 651 T16 = ri[WS(rs, 8)];
Chris@82 652 T18 = ii[WS(rs, 8)];
Chris@82 653 T15 = W[14];
Chris@82 654 T17 = W[15];
Chris@82 655 T19 = FMA(T15, T16, T17 * T18);
Chris@82 656 T1Z = FNMS(T17, T16, T15 * T18);
Chris@82 657 }
Chris@82 658 {
Chris@82 659 E T1b, T1d, T1a, T1c;
Chris@82 660 T1b = ri[WS(rs, 18)];
Chris@82 661 T1d = ii[WS(rs, 18)];
Chris@82 662 T1a = W[34];
Chris@82 663 T1c = W[35];
Chris@82 664 T1e = FMA(T1a, T1b, T1c * T1d);
Chris@82 665 T20 = FNMS(T1c, T1b, T1a * T1d);
Chris@82 666 }
Chris@82 667 T1f = T19 + T1e;
Chris@82 668 T3g = T1Z + T20;
Chris@82 669 T21 = T1Z - T20;
Chris@82 670 T2C = T19 - T1e;
Chris@82 671 }
Chris@82 672 {
Chris@82 673 E T1H, T2I, T1M, T2J;
Chris@82 674 {
Chris@82 675 E T1E, T1G, T1D, T1F;
Chris@82 676 T1E = ri[WS(rs, 17)];
Chris@82 677 T1G = ii[WS(rs, 17)];
Chris@82 678 T1D = W[32];
Chris@82 679 T1F = W[33];
Chris@82 680 T1H = FMA(T1D, T1E, T1F * T1G);
Chris@82 681 T2I = FNMS(T1F, T1E, T1D * T1G);
Chris@82 682 }
Chris@82 683 {
Chris@82 684 E T1J, T1L, T1I, T1K;
Chris@82 685 T1J = ri[WS(rs, 7)];
Chris@82 686 T1L = ii[WS(rs, 7)];
Chris@82 687 T1I = W[12];
Chris@82 688 T1K = W[13];
Chris@82 689 T1M = FMA(T1I, T1J, T1K * T1L);
Chris@82 690 T2J = FNMS(T1K, T1J, T1I * T1L);
Chris@82 691 }
Chris@82 692 T1N = T1H + T1M;
Chris@82 693 T3k = T2I + T2J;
Chris@82 694 T27 = T1H - T1M;
Chris@82 695 T2K = T2I - T2J;
Chris@82 696 }
Chris@82 697 {
Chris@82 698 E T1k, T2D, T1p, T2E;
Chris@82 699 {
Chris@82 700 E T1h, T1j, T1g, T1i;
Chris@82 701 T1h = ri[WS(rs, 13)];
Chris@82 702 T1j = ii[WS(rs, 13)];
Chris@82 703 T1g = W[24];
Chris@82 704 T1i = W[25];
Chris@82 705 T1k = FMA(T1g, T1h, T1i * T1j);
Chris@82 706 T2D = FNMS(T1i, T1h, T1g * T1j);
Chris@82 707 }
Chris@82 708 {
Chris@82 709 E T1m, T1o, T1l, T1n;
Chris@82 710 T1m = ri[WS(rs, 3)];
Chris@82 711 T1o = ii[WS(rs, 3)];
Chris@82 712 T1l = W[4];
Chris@82 713 T1n = W[5];
Chris@82 714 T1p = FMA(T1l, T1m, T1n * T1o);
Chris@82 715 T2E = FNMS(T1n, T1m, T1l * T1o);
Chris@82 716 }
Chris@82 717 T1q = T1k + T1p;
Chris@82 718 T3h = T2D + T2E;
Chris@82 719 T22 = T1k - T1p;
Chris@82 720 T2F = T2D - T2E;
Chris@82 721 }
Chris@82 722 {
Chris@82 723 E T1w, T24, T1B, T25;
Chris@82 724 {
Chris@82 725 E T1t, T1v, T1s, T1u;
Chris@82 726 T1t = ri[WS(rs, 12)];
Chris@82 727 T1v = ii[WS(rs, 12)];
Chris@82 728 T1s = W[22];
Chris@82 729 T1u = W[23];
Chris@82 730 T1w = FMA(T1s, T1t, T1u * T1v);
Chris@82 731 T24 = FNMS(T1u, T1t, T1s * T1v);
Chris@82 732 }
Chris@82 733 {
Chris@82 734 E T1y, T1A, T1x, T1z;
Chris@82 735 T1y = ri[WS(rs, 2)];
Chris@82 736 T1A = ii[WS(rs, 2)];
Chris@82 737 T1x = W[2];
Chris@82 738 T1z = W[3];
Chris@82 739 T1B = FMA(T1x, T1y, T1z * T1A);
Chris@82 740 T25 = FNMS(T1z, T1y, T1x * T1A);
Chris@82 741 }
Chris@82 742 T1C = T1w + T1B;
Chris@82 743 T3j = T24 + T25;
Chris@82 744 T26 = T24 - T25;
Chris@82 745 T2H = T1w - T1B;
Chris@82 746 }
Chris@82 747 T1r = T1f - T1q;
Chris@82 748 T1O = T1C - T1N;
Chris@82 749 T1P = T1r + T1O;
Chris@82 750 T3i = T3g - T3h;
Chris@82 751 T3l = T3j - T3k;
Chris@82 752 T44 = T3i + T3l;
Chris@82 753 T3D = T3g + T3h;
Chris@82 754 T3E = T3j + T3k;
Chris@82 755 T3K = T3D + T3E;
Chris@82 756 T1V = T1f + T1q;
Chris@82 757 T1W = T1C + T1N;
Chris@82 758 T1X = T1V + T1W;
Chris@82 759 T23 = T21 + T22;
Chris@82 760 T28 = T26 + T27;
Chris@82 761 T4r = T23 + T28;
Chris@82 762 T2W = T21 - T22;
Chris@82 763 T2X = T26 - T27;
Chris@82 764 T4c = T2W + T2X;
Chris@82 765 T33 = T2C + T2F;
Chris@82 766 T34 = T2H + T2K;
Chris@82 767 T35 = T33 + T34;
Chris@82 768 T2G = T2C - T2F;
Chris@82 769 T2L = T2H - T2K;
Chris@82 770 T2M = T2G + T2L;
Chris@82 771 }
/* Points 4, 14, 1, 11, 9, 19, 16 and 6: twiddle each, then form sums and differences. */
Chris@82 772 {
Chris@82 773 E Tu, T3n, T2c, T2r, T12, T3r, T2i, T2z, TF, T3o, T2d, T2u, TR, T3q, T2h;
Chris@82 774 E T2w;
Chris@82 775 {
Chris@82 776 E To, T2a, Tt, T2b;
Chris@82 777 {
Chris@82 778 E Tl, Tn, Tk, Tm;
Chris@82 779 Tl = ri[WS(rs, 4)];
Chris@82 780 Tn = ii[WS(rs, 4)];
Chris@82 781 Tk = W[6];
Chris@82 782 Tm = W[7];
Chris@82 783 To = FMA(Tk, Tl, Tm * Tn);
Chris@82 784 T2a = FNMS(Tm, Tl, Tk * Tn);
Chris@82 785 }
Chris@82 786 {
Chris@82 787 E Tq, Ts, Tp, Tr;
Chris@82 788 Tq = ri[WS(rs, 14)];
Chris@82 789 Ts = ii[WS(rs, 14)];
Chris@82 790 Tp = W[26];
Chris@82 791 Tr = W[27];
Chris@82 792 Tt = FMA(Tp, Tq, Tr * Ts);
Chris@82 793 T2b = FNMS(Tr, Tq, Tp * Ts);
Chris@82 794 }
Chris@82 795 Tu = To + Tt;
Chris@82 796 T3n = T2a + T2b;
Chris@82 797 T2c = T2a - T2b;
Chris@82 798 T2r = To - Tt;
Chris@82 799 }
Chris@82 800 {
Chris@82 801 E TW, T2x, T11, T2y;
Chris@82 802 {
Chris@82 803 E TT, TV, TS, TU;
Chris@82 804 TT = ri[WS(rs, 1)];
Chris@82 805 TV = ii[WS(rs, 1)];
Chris@82 806 TS = W[0];
Chris@82 807 TU = W[1];
Chris@82 808 TW = FMA(TS, TT, TU * TV);
Chris@82 809 T2x = FNMS(TU, TT, TS * TV);
Chris@82 810 }
Chris@82 811 {
Chris@82 812 E TY, T10, TX, TZ;
Chris@82 813 TY = ri[WS(rs, 11)];
Chris@82 814 T10 = ii[WS(rs, 11)];
Chris@82 815 TX = W[20];
Chris@82 816 TZ = W[21];
Chris@82 817 T11 = FMA(TX, TY, TZ * T10);
Chris@82 818 T2y = FNMS(TZ, TY, TX * T10);
Chris@82 819 }
Chris@82 820 T12 = TW + T11;
Chris@82 821 T3r = T2x + T2y;
Chris@82 822 T2i = TW - T11;
Chris@82 823 T2z = T2x - T2y;
Chris@82 824 }
Chris@82 825 {
Chris@82 826 E Tz, T2s, TE, T2t;
Chris@82 827 {
Chris@82 828 E Tw, Ty, Tv, Tx;
Chris@82 829 Tw = ri[WS(rs, 9)];
Chris@82 830 Ty = ii[WS(rs, 9)];
Chris@82 831 Tv = W[16];
Chris@82 832 Tx = W[17];
Chris@82 833 Tz = FMA(Tv, Tw, Tx * Ty);
Chris@82 834 T2s = FNMS(Tx, Tw, Tv * Ty);
Chris@82 835 }
Chris@82 836 {
Chris@82 837 E TB, TD, TA, TC;
Chris@82 838 TB = ri[WS(rs, 19)];
Chris@82 839 TD = ii[WS(rs, 19)];
Chris@82 840 TA = W[36];
Chris@82 841 TC = W[37];
Chris@82 842 TE = FMA(TA, TB, TC * TD);
Chris@82 843 T2t = FNMS(TC, TB, TA * TD);
Chris@82 844 }
Chris@82 845 TF = Tz + TE;
Chris@82 846 T3o = T2s + T2t;
Chris@82 847 T2d = Tz - TE;
Chris@82 848 T2u = T2s - T2t;
Chris@82 849 }
Chris@82 850 {
Chris@82 851 E TL, T2f, TQ, T2g;
Chris@82 852 {
Chris@82 853 E TI, TK, TH, TJ;
Chris@82 854 TI = ri[WS(rs, 16)];
Chris@82 855 TK = ii[WS(rs, 16)];
Chris@82 856 TH = W[30];
Chris@82 857 TJ = W[31];
Chris@82 858 TL = FMA(TH, TI, TJ * TK);
Chris@82 859 T2f = FNMS(TJ, TI, TH * TK);
Chris@82 860 }
Chris@82 861 {
Chris@82 862 E TN, TP, TM, TO;
Chris@82 863 TN = ri[WS(rs, 6)];
Chris@82 864 TP = ii[WS(rs, 6)];
Chris@82 865 TM = W[10];
Chris@82 866 TO = W[11];
Chris@82 867 TQ = FMA(TM, TN, TO * TP);
Chris@82 868 T2g = FNMS(TO, TN, TM * TP);
Chris@82 869 }
Chris@82 870 TR = TL + TQ;
Chris@82 871 T3q = T2f + T2g;
Chris@82 872 T2h = T2f - T2g;
Chris@82 873 T2w = TL - TQ;
Chris@82 874 }
Chris@82 875 TG = Tu - TF;
Chris@82 876 T13 = TR - T12;
Chris@82 877 T14 = TG + T13;
Chris@82 878 T3p = T3n - T3o;
Chris@82 879 T3s = T3q - T3r;
Chris@82 880 T43 = T3p + T3s;
Chris@82 881 T3A = T3n + T3o;
Chris@82 882 T3B = T3q + T3r;
Chris@82 883 T3J = T3A + T3B;
Chris@82 884 T1S = Tu + TF;
Chris@82 885 T1T = TR + T12;
Chris@82 886 T1U = T1S + T1T;
Chris@82 887 T2e = T2c + T2d;
Chris@82 888 T2j = T2h + T2i;
Chris@82 889 T4q = T2e + T2j;
Chris@82 890 T2T = T2c - T2d;
Chris@82 891 T2U = T2h - T2i;
Chris@82 892 T4b = T2T + T2U;
Chris@82 893 T30 = T2r + T2u;
Chris@82 894 T31 = T2w + T2z;
Chris@82 895 T32 = T30 + T31;
Chris@82 896 T2v = T2r - T2u;
Chris@82 897 T2A = T2w - T2z;
Chris@82 898 T2B = T2v + T2A;
Chris@82 899 }
/* Store real outputs 10, 14, 6, 2 and 18. */
Chris@82 900 {
Chris@82 901 E T3e, T1Q, T3d, T3u, T3w, T3m, T3t, T3v, T3f;
Chris@82 902 T3e = KP559016994 * (T14 - T1P);
Chris@82 903 T1Q = T14 + T1P;
Chris@82 904 T3d = FNMS(KP250000000, T1Q, Tj);
Chris@82 905 T3m = T3i - T3l;
Chris@82 906 T3t = T3p - T3s;
Chris@82 907 T3u = FNMS(KP587785252, T3t, KP951056516 * T3m);
Chris@82 908 T3w = FMA(KP951056516, T3t, KP587785252 * T3m);
Chris@82 909 ri[WS(rs, 10)] = Tj + T1Q;
Chris@82 910 T3v = T3e + T3d;
Chris@82 911 ri[WS(rs, 14)] = T3v - T3w;
Chris@82 912 ri[WS(rs, 6)] = T3v + T3w;
Chris@82 913 T3f = T3d - T3e;
Chris@82 914 ri[WS(rs, 2)] = T3f - T3u;
Chris@82 915 ri[WS(rs, 18)] = T3f + T3u;
Chris@82 916 }
/* Store imaginary outputs 10, 6, 14, 2 and 18. */
Chris@82 917 {
Chris@82 918 E T47, T45, T46, T41, T4a, T3Z, T40, T49, T48;
Chris@82 919 T47 = KP559016994 * (T43 - T44);
Chris@82 920 T45 = T43 + T44;
Chris@82 921 T46 = FNMS(KP250000000, T45, T42);
Chris@82 922 T3Z = T1r - T1O;
Chris@82 923 T40 = TG - T13;
Chris@82 924 T41 = FNMS(KP587785252, T40, KP951056516 * T3Z);
Chris@82 925 T4a = FMA(KP951056516, T40, KP587785252 * T3Z);
Chris@82 926 ii[WS(rs, 10)] = T45 + T42;
Chris@82 927 T49 = T47 + T46;
Chris@82 928 ii[WS(rs, 6)] = T49 - T4a;
Chris@82 929 ii[WS(rs, 14)] = T4a + T49;
Chris@82 930 T48 = T46 - T47;
Chris@82 931 ii[WS(rs, 2)] = T41 + T48;
Chris@82 932 ii[WS(rs, 18)] = T48 - T41;
Chris@82 933 }
/* Store real outputs 0, 12, 8, 4 and 16. */
Chris@82 934 {
Chris@82 935 E T3x, T1Y, T3y, T3G, T3I, T3C, T3F, T3H, T3z;
Chris@82 936 T3x = KP559016994 * (T1U - T1X);
Chris@82 937 T1Y = T1U + T1X;
Chris@82 938 T3y = FNMS(KP250000000, T1Y, T1R);
Chris@82 939 T3C = T3A - T3B;
Chris@82 940 T3F = T3D - T3E;
Chris@82 941 T3G = FMA(KP951056516, T3C, KP587785252 * T3F);
Chris@82 942 T3I = FNMS(KP587785252, T3C, KP951056516 * T3F);
Chris@82 943 ri[0] = T1R + T1Y;
Chris@82 944 T3H = T3y - T3x;
Chris@82 945 ri[WS(rs, 12)] = T3H - T3I;
Chris@82 946 ri[WS(rs, 8)] = T3H + T3I;
Chris@82 947 T3z = T3x + T3y;
Chris@82 948 ri[WS(rs, 4)] = T3z - T3G;
Chris@82 949 ri[WS(rs, 16)] = T3z + T3G;
Chris@82 950 }
/* Store imaginary outputs 0, 8, 12, 4 and 16. */
Chris@82 951 {
Chris@82 952 E T3U, T3L, T3V, T3T, T3Y, T3R, T3S, T3X, T3W;
Chris@82 953 T3U = KP559016994 * (T3J - T3K);
Chris@82 954 T3L = T3J + T3K;
Chris@82 955 T3V = FNMS(KP250000000, T3L, T3Q);
Chris@82 956 T3R = T1S - T1T;
Chris@82 957 T3S = T1V - T1W;
Chris@82 958 T3T = FMA(KP951056516, T3R, KP587785252 * T3S);
Chris@82 959 T3Y = FNMS(KP587785252, T3R, KP951056516 * T3S);
Chris@82 960 ii[0] = T3L + T3Q;
Chris@82 961 T3X = T3V - T3U;
Chris@82 962 ii[WS(rs, 8)] = T3X - T3Y;
Chris@82 963 ii[WS(rs, 12)] = T3Y + T3X;
Chris@82 964 T3W = T3U + T3V;
Chris@82 965 ii[WS(rs, 4)] = T3T + T3W;
Chris@82 966 ii[WS(rs, 16)] = T3W - T3T;
Chris@82 967 }
/* Store real outputs 15, 11, 19, 3 and 7. */
Chris@82 968 {
Chris@82 969 E T2P, T2N, T2O, T2l, T2R, T29, T2k, T2S, T2Q;
Chris@82 970 T2P = KP559016994 * (T2B - T2M);
Chris@82 971 T2N = T2B + T2M;
Chris@82 972 T2O = FNMS(KP250000000, T2N, T2q);
Chris@82 973 T29 = T23 - T28;
Chris@82 974 T2k = T2e - T2j;
Chris@82 975 T2l = FNMS(KP587785252, T2k, KP951056516 * T29);
Chris@82 976 T2R = FMA(KP951056516, T2k, KP587785252 * T29);
Chris@82 977 ri[WS(rs, 15)] = T2q + T2N;
Chris@82 978 T2S = T2P + T2O;
Chris@82 979 ri[WS(rs, 11)] = T2R + T2S;
Chris@82 980 ri[WS(rs, 19)] = T2S - T2R;
Chris@82 981 T2Q = T2O - T2P;
Chris@82 982 ri[WS(rs, 3)] = T2l + T2Q;
Chris@82 983 ri[WS(rs, 7)] = T2Q - T2l;
Chris@82 984 }
/* Store imaginary outputs 15, 11, 19, 3 and 7. */
Chris@82 985 {
Chris@82 986 E T4u, T4s, T4t, T4y, T4A, T4w, T4x, T4z, T4v;
Chris@82 987 T4u = KP559016994 * (T4q - T4r);
Chris@82 988 T4s = T4q + T4r;
Chris@82 989 T4t = FNMS(KP250000000, T4s, T4p);
Chris@82 990 T4w = T2G - T2L;
Chris@82 991 T4x = T2v - T2A;
Chris@82 992 T4y = FNMS(KP587785252, T4x, KP951056516 * T4w);
Chris@82 993 T4A = FMA(KP951056516, T4x, KP587785252 * T4w);
Chris@82 994 ii[WS(rs, 15)] = T4s + T4p;
Chris@82 995 T4z = T4u + T4t;
Chris@82 996 ii[WS(rs, 11)] = T4z - T4A;
Chris@82 997 ii[WS(rs, 19)] = T4A + T4z;
Chris@82 998 T4v = T4t - T4u;
Chris@82 999 ii[WS(rs, 3)] = T4v - T4y;
Chris@82 1000 ii[WS(rs, 7)] = T4y + T4v;
Chris@82 1001 }
/* Store real outputs 5, 13, 17, 1 and 9. */
Chris@82 1002 {
Chris@82 1003 E T36, T38, T39, T2Z, T3b, T2V, T2Y, T3c, T3a;
Chris@82 1004 T36 = KP559016994 * (T32 - T35);
Chris@82 1005 T38 = T32 + T35;
Chris@82 1006 T39 = FNMS(KP250000000, T38, T37);
Chris@82 1007 T2V = T2T - T2U;
Chris@82 1008 T2Y = T2W - T2X;
Chris@82 1009 T2Z = FMA(KP951056516, T2V, KP587785252 * T2Y);
Chris@82 1010 T3b = FNMS(KP587785252, T2V, KP951056516 * T2Y);
Chris@82 1011 ri[WS(rs, 5)] = T37 + T38;
Chris@82 1012 T3c = T39 - T36;
Chris@82 1013 ri[WS(rs, 13)] = T3b + T3c;
Chris@82 1014 ri[WS(rs, 17)] = T3c - T3b;
Chris@82 1015 T3a = T36 + T39;
Chris@82 1016 ri[WS(rs, 1)] = T2Z + T3a;
Chris@82 1017 ri[WS(rs, 9)] = T3a - T2Z;
Chris@82 1018 }
/* Store imaginary outputs 5, 13, 17, 1 and 9. */
Chris@82 1019 {
Chris@82 1020 E T4d, T4h, T4i, T4m, T4o, T4k, T4l, T4n, T4j;
Chris@82 1021 T4d = KP559016994 * (T4b - T4c);
Chris@82 1022 T4h = T4b + T4c;
Chris@82 1023 T4i = FNMS(KP250000000, T4h, T4g);
Chris@82 1024 T4k = T30 - T31;
Chris@82 1025 T4l = T33 - T34;
Chris@82 1026 T4m = FMA(KP951056516, T4k, KP587785252 * T4l);
Chris@82 1027 T4o = FNMS(KP587785252, T4k, KP951056516 * T4l);
Chris@82 1028 ii[WS(rs, 5)] = T4h + T4g;
Chris@82 1029 T4n = T4i - T4d;
Chris@82 1030 ii[WS(rs, 13)] = T4n - T4o;
Chris@82 1031 ii[WS(rs, 17)] = T4o + T4n;
Chris@82 1032 T4j = T4d + T4i;
Chris@82 1033 ii[WS(rs, 1)] = T4j - T4m;
Chris@82 1034 ii[WS(rs, 9)] = T4m + T4j;
Chris@82 1035 }
Chris@82 1036 }
Chris@82 1037 }
Chris@82 1038 }
Chris@82 1039
/*
 * Twiddle instruction table for this codelet; it is referenced from the
 * `desc` descriptor.
 * NOTE(review): TW_FULL and TW_NEXT are defined outside this file;
 * presumably {TW_FULL, 0, 20} requests the full twiddle set for size 20
 * and {TW_NEXT, 1, 0} terminates the list -- confirm against the FFTW
 * headers.
 */
Chris@82 1040 static const tw_instr twinstr[] = {
Chris@82 1041 {TW_FULL, 0, 20},
Chris@82 1042 {TW_NEXT, 1, 0}
Chris@82 1043 };
Chris@82 1044
/*
 * Codelet descriptor for the non-FMA variant: size 20, name "t1_20", the
 * twinstr table and &GENUS.  The tuple {184, 62, 62, 0} matches the
 * generator's operation-count comment for this variant (184 additions,
 * 62 multiplications, 62 fused multiply/add).
 * NOTE(review): the meaning of the three trailing zeros is defined by
 * ct_desc, which is declared outside this file.
 */
Chris@82 1045 static const ct_desc desc = { 20, "t1_20", twinstr, &GENUS, {184, 62, 62, 0}, 0, 0, 0 };
Chris@82 1046
/*
 * Registration entry point: hands the t1_20 kernel and its descriptor to
 * planner `p` through X(kdft_dit_register).
 * NOTE(review): X() is a macro defined outside this file, presumably a
 * symbol-prefixing macro -- confirm.
 */
Chris@82 1047 void X(codelet_t1_20) (planner *p) {
Chris@82 1048 X(kdft_dit_register) (p, t1_20, &desc);
Chris@82 1049 }
Chris@82 1050 #endif