annotate src/fftw-3.3.3/dft/scalar/codelets/t2_32.c @ 83:ae30d91d2ffe

Replace these with versions built using an older toolset (so as to avoid ABI incompatibilities when linking on Ubuntu 14.04 for packaging purposes)
author Chris Cannam
date Fri, 07 Feb 2020 11:51:13 +0000
parents 37bf6b4a2645
children
rev   line source
Chris@10 1 /*
Chris@10 2 * Copyright (c) 2003, 2007-11 Matteo Frigo
Chris@10 3 * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
Chris@10 4 *
Chris@10 5 * This program is free software; you can redistribute it and/or modify
Chris@10 6 * it under the terms of the GNU General Public License as published by
Chris@10 7 * the Free Software Foundation; either version 2 of the License, or
Chris@10 8 * (at your option) any later version.
Chris@10 9 *
Chris@10 10 * This program is distributed in the hope that it will be useful,
Chris@10 11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
Chris@10 12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
Chris@10 13 * GNU General Public License for more details.
Chris@10 14 *
Chris@10 15 * You should have received a copy of the GNU General Public License
Chris@10 16 * along with this program; if not, write to the Free Software
Chris@10 17 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
Chris@10 18 *
Chris@10 19 */
Chris@10 20
Chris@10 21 /* This file was automatically generated --- DO NOT EDIT */
Chris@10 22 /* Generated on Sun Nov 25 07:36:00 EST 2012 */
Chris@10 23
Chris@10 24 #include "codelet-dft.h"
Chris@10 25
Chris@10 26 #ifdef HAVE_FMA
Chris@10 27
Chris@10 28 /* Generated by: ../../../genfft/gen_twiddle.native -fma -reorder-insns -schedule-for-pipeline -compact -variables 4 -pipeline-latency 4 -twiddle-log3 -precompute-twiddles -n 32 -name t2_32 -include t.h */
Chris@10 29
Chris@10 30 /*
Chris@10 31 * This function contains 488 FP additions, 350 FP multiplications,
Chris@10 32 * (or, 236 additions, 98 multiplications, 252 fused multiply/add),
Chris@10 33 * 181 stack variables, 7 constants, and 128 memory accesses
Chris@10 34 */
Chris@10 35 #include "t.h"
Chris@10 36
/*
 * t2_32: machine-generated size-32 twiddle codelet (FMA variant; this branch
 * is compiled only when HAVE_FMA is defined).
 *
 * ri, ii : arrays read and then overwritten in place at offsets
 *          WS(rs, 0) .. WS(rs, 31).
 *          NOTE(review): presumably the real and imaginary parts of the
 *          data -- confirm against codelet-dft.h.
 * W      : twiddle table; eight values (W[0]..W[7]) are read per iteration.
 * rs     : stride handed to WS() to address the 32 elements.
 * mb, me : the loop runs for m in [mb, me).
 * ms     : amount added to ri and ii after each iteration.
 */
Chris@10 37 static void t2_32(R *ri, R *ii, const R *W, stride rs, INT mb, INT me, INT ms)
Chris@10 38 {
/*
 * Constants; numerically these are cos(pi/16), cos(3pi/16), tan(3pi/16),
 * tan(pi/16), cos(pi/8), tan(pi/8) and sqrt(1/2), in that order.
 */
Chris@10 39 DK(KP980785280, +0.980785280403230449126182236134239036973933731);
Chris@10 40 DK(KP831469612, +0.831469612302545237078788377617905756738560812);
Chris@10 41 DK(KP668178637, +0.668178637919298919997757686523080761552472251);
Chris@10 42 DK(KP198912367, +0.198912367379658006911597622644676228597850501);
Chris@10 43 DK(KP923879532, +0.923879532511286756128183189396788286822416626);
Chris@10 44 DK(KP414213562, +0.414213562373095048801688724209698078569671875);
Chris@10 45 DK(KP707106781, +0.707106781186547524400844362104849039284835938);
Chris@10 46 {
Chris@10 47 INT m;
/*
 * W is first advanced by mb * 8 (eight values per iteration), then stepped
 * by 8 on every pass while ri and ii are stepped by ms.
 */
Chris@10 48 for (m = mb, W = W + (mb * 8); m < me; m = m + 1, ri = ri + ms, ii = ii + ms, W = W + 8, MAKE_VOLATILE_STRIDE(64, rs)) {
/* T9A and T9z are declared at loop level because they feed the final two stores at the bottom of the loop body. */
Chris@10 49 E T9A, T9z;
Chris@10 50 {
Chris@10 51 E T2, T8, T3, T6, Te, Tr, T18, T4, Ta, Tz, T1n, T10, Ti, T5, Tc;
/* Load the eight stored twiddle values and start forming their pairwise products. */
Chris@10 52 T2 = W[0];
Chris@10 53 T8 = W[4];
Chris@10 54 T3 = W[2];
Chris@10 55 T6 = W[3];
Chris@10 56 Te = W[6];
Chris@10 57 Tr = T2 * T8;
Chris@10 58 T18 = T3 * T8;
Chris@10 59 T4 = T2 * T3;
Chris@10 60 Ta = T2 * T6;
Chris@10 61 Tz = T3 * Te;
Chris@10 62 T1n = T8 * Te;
Chris@10 63 T10 = T2 * Te;
Chris@10 64 Ti = W[7];
Chris@10 65 T5 = W[1];
Chris@10 66 Tc = W[5];
Chris@10 67 {
Chris@10 68 E T34, T31, T2X, T2T, Tq, T46, T8H, T97, TH, T98, T4b, T8D, TZ, T7f, T4j;
Chris@10 69 E T6t, T1g, T7g, T4q, T6u, T4z, T6x, T1J, T7m, T7l, T8d, T6y, T4G, T2k, T7o;
Chris@10 70 E T7r, T8e, T6A, T4O, T6B, T4V, T6P, T5E, T7L, T3G, T6M, T61, T8n, T7I, T6I;
Chris@10 71 E T55, T7A, T2N, T6F, T5s, T8i, T7x, T5L, T62, T43, T7J, T5S, T63, T7O, T8o;
Chris@10 72 E T2U, T2R, T2V, T57, T3a, T5h, T2Y, T32, T35;
Chris@10 73 {
Chris@10 74 E T1K, T23, T1N, T26, T2b, T1U, T3C, T3j, T3z, T3f, T1R, T29, TR, Th, T2J;
Chris@10 75 E T2F, Td, TP, T3r, T3n, T2w, T2s, T3Q, T3M, T1Z, T1V, T2g, T2c;
Chris@10 76 {
Chris@10 77 E T11, T1C, TM, Tb, TJ, T7, T1o, T19, T1w, T1F, T15, T1s, T1d, T1z, TW;
Chris@10 78 E TS, Ty, T48, TG, T4a;
Chris@10 79 {
Chris@10 80 E T1, TA, Ts, TE, Tw, Tn, Tj, T8G, Tk, To, T14;
Chris@10 81 T1 = ri[0];
/*
 * The remaining twiddle factors are derived from products of the loaded
 * values rather than read from W (the generator was run with -twiddle-log3).
 */
Chris@10 82 TA = FMA(T6, Ti, Tz);
Chris@10 83 T1K = FNMS(T6, Ti, Tz);
Chris@10 84 T14 = T2 * Ti;
Chris@10 85 {
Chris@10 86 E T1r, TD, T1c, Tv;
Chris@10 87 T1r = T8 * Ti;
Chris@10 88 TD = T3 * Ti;
Chris@10 89 T11 = FNMS(T5, Ti, T10);
Chris@10 90 T1C = FMA(T5, Ti, T10);
Chris@10 91 TM = FMA(T5, T3, Ta);
Chris@10 92 Tb = FNMS(T5, T3, Ta);
Chris@10 93 TJ = FNMS(T5, T6, T4);
Chris@10 94 T7 = FMA(T5, T6, T4);
Chris@10 95 T1o = FMA(Tc, Ti, T1n);
Chris@10 96 T23 = FMA(T6, Tc, T18);
Chris@10 97 T19 = FNMS(T6, Tc, T18);
Chris@10 98 T1w = FNMS(T5, Tc, Tr);
Chris@10 99 Ts = FMA(T5, Tc, Tr);
Chris@10 100 T1c = T3 * Tc;
Chris@10 101 Tv = T2 * Tc;
Chris@10 102 T1F = FNMS(T5, Te, T14);
Chris@10 103 T15 = FMA(T5, Te, T14);
Chris@10 104 T1s = FNMS(Tc, Te, T1r);
Chris@10 105 T1N = FMA(T6, Te, TD);
Chris@10 106 TE = FNMS(T6, Te, TD);
Chris@10 107 {
Chris@10 108 E T1T, T3i, T3e, T1Q;
Chris@10 109 T1T = TJ * Tc;
Chris@10 110 T3i = TJ * Ti;
Chris@10 111 T3e = TJ * Te;
Chris@10 112 T1Q = TJ * T8;
Chris@10 113 {
Chris@10 114 E Tg, T2I, T2E, T9;
Chris@10 115 Tg = T7 * Tc;
Chris@10 116 T2I = T7 * Ti;
Chris@10 117 T2E = T7 * Te;
Chris@10 118 T9 = T7 * T8;
Chris@10 119 {
Chris@10 120 E T3q, T3m, T2v, T2r;
Chris@10 121 T3q = T19 * Ti;
Chris@10 122 T3m = T19 * Te;
Chris@10 123 T2v = T1w * Ti;
Chris@10 124 T2r = T1w * Te;
Chris@10 125 {
Chris@10 126 E T2W, T2S, T3P, T3L;
Chris@10 127 T2W = T23 * Ti;
Chris@10 128 T2S = T23 * Te;
Chris@10 129 T3P = Ts * Ti;
Chris@10 130 T3L = Ts * Te;
Chris@10 131 T26 = FNMS(T6, T8, T1c);
Chris@10 132 T1d = FMA(T6, T8, T1c);
Chris@10 133 T1z = FMA(T5, T8, Tv);
Chris@10 134 Tw = FNMS(T5, T8, Tv);
Chris@10 135 T2b = FNMS(TM, T8, T1T);
Chris@10 136 T1U = FMA(TM, T8, T1T);
Chris@10 137 T3C = FNMS(TM, Te, T3i);
Chris@10 138 T3j = FMA(TM, Te, T3i);
Chris@10 139 T3z = FMA(TM, Ti, T3e);
Chris@10 140 T3f = FNMS(TM, Ti, T3e);
Chris@10 141 T1R = FNMS(TM, Tc, T1Q);
Chris@10 142 T29 = FMA(TM, Tc, T1Q);
Chris@10 143 TR = FNMS(Tb, T8, Tg);
Chris@10 144 Th = FMA(Tb, T8, Tg);
Chris@10 145 T34 = FMA(Tb, Te, T2I);
Chris@10 146 T2J = FNMS(Tb, Te, T2I);
Chris@10 147 T31 = FNMS(Tb, Ti, T2E);
Chris@10 148 T2F = FMA(Tb, Ti, T2E);
Chris@10 149 Td = FNMS(Tb, Tc, T9);
Chris@10 150 TP = FMA(Tb, Tc, T9);
Chris@10 151 T2X = FNMS(T26, Te, T2W);
Chris@10 152 T2T = FMA(T26, Ti, T2S);
Chris@10 153 T3r = FNMS(T1d, Te, T3q);
Chris@10 154 T3n = FMA(T1d, Ti, T3m);
Chris@10 155 T2w = FNMS(T1z, Te, T2v);
Chris@10 156 T2s = FMA(T1z, Ti, T2r);
Chris@10 157 T3Q = FNMS(Tw, Te, T3P);
Chris@10 158 T3M = FMA(Tw, Ti, T3L);
Chris@10 159 {
Chris@10 160 E T1Y, T1S, T2f, T2a;
Chris@10 161 T1Y = T1R * Ti;
Chris@10 162 T1S = T1R * Te;
Chris@10 163 T2f = T29 * Ti;
Chris@10 164 T2a = T29 * Te;
Chris@10 165 {
Chris@10 166 E Tm, Tf, TV, TQ;
Chris@10 167 Tm = Td * Ti;
Chris@10 168 Tf = Td * Te;
Chris@10 169 TV = TP * Ti;
Chris@10 170 TQ = TP * Te;
Chris@10 171 T1Z = FNMS(T1U, Te, T1Y);
Chris@10 172 T1V = FMA(T1U, Ti, T1S);
Chris@10 173 T2g = FNMS(T2b, Te, T2f);
Chris@10 174 T2c = FMA(T2b, Ti, T2a);
Chris@10 175 Tn = FNMS(Th, Te, Tm);
Chris@10 176 Tj = FMA(Th, Ti, Tf);
Chris@10 177 TW = FNMS(TR, Te, TV);
Chris@10 178 TS = FMA(TR, Ti, TQ);
Chris@10 179 T8G = ii[0];
Chris@10 180 }
Chris@10 181 }
Chris@10 182 }
Chris@10 183 }
Chris@10 184 }
Chris@10 185 }
Chris@10 186 }
/*
 * From here on each input pair is combined with its twiddle factor through
 * an FMA/FNMS pair as it is loaded; element 0 (T1, T8G) is used unscaled.
 */
Chris@10 187 Tk = ri[WS(rs, 16)];
Chris@10 188 To = ii[WS(rs, 16)];
Chris@10 189 {
Chris@10 190 E Tt, Tx, Tu, T47, TB, TF, TC, T49;
Chris@10 191 {
Chris@10 192 E Tl, T8E, Tp, T8F;
Chris@10 193 Tt = ri[WS(rs, 8)];
Chris@10 194 Tx = ii[WS(rs, 8)];
Chris@10 195 Tl = Tj * Tk;
Chris@10 196 T8E = Tj * To;
Chris@10 197 Tu = Ts * Tt;
Chris@10 198 T47 = Ts * Tx;
Chris@10 199 Tp = FMA(Tn, To, Tl);
Chris@10 200 T8F = FNMS(Tn, Tk, T8E);
Chris@10 201 TB = ri[WS(rs, 24)];
Chris@10 202 TF = ii[WS(rs, 24)];
Chris@10 203 Tq = T1 + Tp;
Chris@10 204 T46 = T1 - Tp;
Chris@10 205 T8H = T8F + T8G;
Chris@10 206 T97 = T8G - T8F;
Chris@10 207 TC = TA * TB;
Chris@10 208 T49 = TA * TF;
Chris@10 209 }
Chris@10 210 Ty = FMA(Tw, Tx, Tu);
Chris@10 211 T48 = FNMS(Tw, Tt, T47);
Chris@10 212 TG = FMA(TE, TF, TC);
Chris@10 213 T4a = FNMS(TE, TB, T49);
Chris@10 214 }
Chris@10 215 }
Chris@10 216 {
Chris@10 217 E TT, TX, TO, T4f, TU, T4g;
Chris@10 218 {
Chris@10 219 E TK, TN, TL, T4e;
Chris@10 220 TK = ri[WS(rs, 4)];
Chris@10 221 TN = ii[WS(rs, 4)];
Chris@10 222 TH = Ty + TG;
Chris@10 223 T98 = Ty - TG;
Chris@10 224 T4b = T48 - T4a;
Chris@10 225 T8D = T48 + T4a;
Chris@10 226 TL = TJ * TK;
Chris@10 227 T4e = TJ * TN;
Chris@10 228 TT = ri[WS(rs, 20)];
Chris@10 229 TX = ii[WS(rs, 20)];
Chris@10 230 TO = FMA(TM, TN, TL);
Chris@10 231 T4f = FNMS(TM, TK, T4e);
Chris@10 232 TU = TS * TT;
Chris@10 233 T4g = TS * TX;
Chris@10 234 }
Chris@10 235 {
Chris@10 236 E T17, T4m, T1a, T1e, T4d, T4i;
Chris@10 237 {
Chris@10 238 E T12, T16, TY, T4h, T13, T4l;
Chris@10 239 T12 = ri[WS(rs, 28)];
Chris@10 240 T16 = ii[WS(rs, 28)];
Chris@10 241 TY = FMA(TW, TX, TU);
Chris@10 242 T4h = FNMS(TW, TT, T4g);
Chris@10 243 T13 = T11 * T12;
Chris@10 244 T4l = T11 * T16;
Chris@10 245 TZ = TO + TY;
Chris@10 246 T4d = TO - TY;
Chris@10 247 T7f = T4f + T4h;
Chris@10 248 T4i = T4f - T4h;
Chris@10 249 T17 = FMA(T15, T16, T13);
Chris@10 250 T4m = FNMS(T15, T12, T4l);
Chris@10 251 }
Chris@10 252 T4j = T4d + T4i;
Chris@10 253 T6t = T4i - T4d;
Chris@10 254 T1a = ri[WS(rs, 12)];
Chris@10 255 T1e = ii[WS(rs, 12)];
Chris@10 256 {
Chris@10 257 E T1m, T4u, T1H, T4E, T1x, T1A, T1u, T4w, T1y, T4B;
Chris@10 258 {
Chris@10 259 E T1D, T1G, T1E, T4D;
Chris@10 260 {
Chris@10 261 E T1f, T4o, T4k, T4p;
Chris@10 262 {
Chris@10 263 E T1j, T1l, T1b, T4n, T1k, T4t;
Chris@10 264 T1j = ri[WS(rs, 2)];
Chris@10 265 T1l = ii[WS(rs, 2)];
Chris@10 266 T1b = T19 * T1a;
Chris@10 267 T4n = T19 * T1e;
Chris@10 268 T1k = T7 * T1j;
Chris@10 269 T4t = T7 * T1l;
Chris@10 270 T1f = FMA(T1d, T1e, T1b);
Chris@10 271 T4o = FNMS(T1d, T1a, T4n);
Chris@10 272 T1m = FMA(Tb, T1l, T1k);
Chris@10 273 T4u = FNMS(Tb, T1j, T4t);
Chris@10 274 }
Chris@10 275 T1g = T17 + T1f;
Chris@10 276 T4k = T17 - T1f;
Chris@10 277 T7g = T4m + T4o;
Chris@10 278 T4p = T4m - T4o;
Chris@10 279 T1D = ri[WS(rs, 26)];
Chris@10 280 T1G = ii[WS(rs, 26)];
Chris@10 281 T4q = T4k - T4p;
Chris@10 282 T6u = T4k + T4p;
Chris@10 283 T1E = T1C * T1D;
Chris@10 284 T4D = T1C * T1G;
Chris@10 285 }
Chris@10 286 {
Chris@10 287 E T1p, T1t, T1q, T4v;
Chris@10 288 T1p = ri[WS(rs, 18)];
Chris@10 289 T1t = ii[WS(rs, 18)];
Chris@10 290 T1H = FMA(T1F, T1G, T1E);
Chris@10 291 T4E = FNMS(T1F, T1D, T4D);
Chris@10 292 T1q = T1o * T1p;
Chris@10 293 T4v = T1o * T1t;
Chris@10 294 T1x = ri[WS(rs, 10)];
Chris@10 295 T1A = ii[WS(rs, 10)];
Chris@10 296 T1u = FMA(T1s, T1t, T1q);
Chris@10 297 T4w = FNMS(T1s, T1p, T4v);
Chris@10 298 T1y = T1w * T1x;
Chris@10 299 T4B = T1w * T1A;
Chris@10 300 }
Chris@10 301 }
Chris@10 302 {
Chris@10 303 E T4A, T1v, T7j, T4x, T1B, T4C;
Chris@10 304 T4A = T1m - T1u;
Chris@10 305 T1v = T1m + T1u;
Chris@10 306 T7j = T4u + T4w;
Chris@10 307 T4x = T4u - T4w;
Chris@10 308 T1B = FMA(T1z, T1A, T1y);
Chris@10 309 T4C = FNMS(T1z, T1x, T4B);
Chris@10 310 {
Chris@10 311 E T1I, T4y, T4F, T7k;
Chris@10 312 T1I = T1B + T1H;
Chris@10 313 T4y = T1B - T1H;
Chris@10 314 T4F = T4C - T4E;
Chris@10 315 T7k = T4C + T4E;
Chris@10 316 T4z = T4x - T4y;
Chris@10 317 T6x = T4x + T4y;
Chris@10 318 T1J = T1v + T1I;
Chris@10 319 T7m = T1v - T1I;
Chris@10 320 T7l = T7j - T7k;
Chris@10 321 T8d = T7j + T7k;
Chris@10 322 T6y = T4A - T4F;
Chris@10 323 T4G = T4A + T4F;
Chris@10 324 }
Chris@10 325 }
Chris@10 326 }
Chris@10 327 }
Chris@10 328 }
Chris@10 329 }
Chris@10 330 {
Chris@10 331 E T5Z, T3u, T5V, T5C, T7G, T5D, T3F, T5X, T4P, T4U;
Chris@10 332 {
Chris@10 333 E T1P, T4J, T2i, T4T, T21, T4L, T28, T4R;
Chris@10 334 {
Chris@10 335 E T1L, T1O, T1W, T20;
Chris@10 336 T1L = ri[WS(rs, 30)];
Chris@10 337 T1O = ii[WS(rs, 30)];
Chris@10 338 {
Chris@10 339 E T2d, T2h, T1M, T4I, T2e, T4S;
Chris@10 340 T2d = ri[WS(rs, 22)];
Chris@10 341 T2h = ii[WS(rs, 22)];
Chris@10 342 T1M = T1K * T1L;
Chris@10 343 T4I = T1K * T1O;
Chris@10 344 T2e = T2c * T2d;
Chris@10 345 T4S = T2c * T2h;
Chris@10 346 T1P = FMA(T1N, T1O, T1M);
Chris@10 347 T4J = FNMS(T1N, T1L, T4I);
Chris@10 348 T2i = FMA(T2g, T2h, T2e);
Chris@10 349 T4T = FNMS(T2g, T2d, T4S);
Chris@10 350 }
Chris@10 351 T1W = ri[WS(rs, 14)];
Chris@10 352 T20 = ii[WS(rs, 14)];
Chris@10 353 {
Chris@10 354 E T24, T27, T1X, T4K, T25, T4Q;
Chris@10 355 T24 = ri[WS(rs, 6)];
Chris@10 356 T27 = ii[WS(rs, 6)];
Chris@10 357 T1X = T1V * T1W;
Chris@10 358 T4K = T1V * T20;
Chris@10 359 T25 = T23 * T24;
Chris@10 360 T4Q = T23 * T27;
Chris@10 361 T21 = FMA(T1Z, T20, T1X);
Chris@10 362 T4L = FNMS(T1Z, T1W, T4K);
Chris@10 363 T28 = FMA(T26, T27, T25);
Chris@10 364 T4R = FNMS(T26, T24, T4Q);
Chris@10 365 }
Chris@10 366 }
Chris@10 367 {
Chris@10 368 E T22, T7p, T4M, T4N, T2j, T7q;
Chris@10 369 T4P = T1P - T21;
Chris@10 370 T22 = T1P + T21;
Chris@10 371 T7p = T4J + T4L;
Chris@10 372 T4M = T4J - T4L;
Chris@10 373 T4N = T28 - T2i;
Chris@10 374 T2j = T28 + T2i;
Chris@10 375 T7q = T4R + T4T;
Chris@10 376 T4U = T4R - T4T;
Chris@10 377 T2k = T22 + T2j;
Chris@10 378 T7o = T22 - T2j;
Chris@10 379 T7r = T7p - T7q;
Chris@10 380 T8e = T7p + T7q;
Chris@10 381 T6A = T4M + T4N;
Chris@10 382 T4O = T4M - T4N;
Chris@10 383 }
Chris@10 384 }
Chris@10 385 {
Chris@10 386 E T3l, T5z, T3E, T3v, T3t, T3w, T3x, T5B, T3A, T3B, T3D, T3y, T5W;
Chris@10 387 {
Chris@10 388 E T3g, T3k, T3h, T5y;
Chris@10 389 T3g = ri[WS(rs, 31)];
Chris@10 390 T3k = ii[WS(rs, 31)];
Chris@10 391 T3A = ri[WS(rs, 23)];
Chris@10 392 T6B = T4P - T4U;
Chris@10 393 T4V = T4P + T4U;
Chris@10 394 T3h = T3f * T3g;
Chris@10 395 T5y = T3f * T3k;
Chris@10 396 T3B = T3z * T3A;
Chris@10 397 T3D = ii[WS(rs, 23)];
Chris@10 398 T3l = FMA(T3j, T3k, T3h);
Chris@10 399 T5z = FNMS(T3j, T3g, T5y);
Chris@10 400 }
Chris@10 401 {
Chris@10 402 E T3o, T5Y, T3s, T3p, T5A;
Chris@10 403 T3o = ri[WS(rs, 15)];
Chris@10 404 T3E = FMA(T3C, T3D, T3B);
Chris@10 405 T5Y = T3z * T3D;
Chris@10 406 T3s = ii[WS(rs, 15)];
Chris@10 407 T3p = T3n * T3o;
Chris@10 408 T3v = ri[WS(rs, 7)];
Chris@10 409 T5Z = FNMS(T3C, T3A, T5Y);
Chris@10 410 T5A = T3n * T3s;
Chris@10 411 T3t = FMA(T3r, T3s, T3p);
Chris@10 412 T3w = TP * T3v;
Chris@10 413 T3x = ii[WS(rs, 7)];
Chris@10 414 T5B = FNMS(T3r, T3o, T5A);
Chris@10 415 }
Chris@10 416 T3u = T3l + T3t;
Chris@10 417 T5V = T3l - T3t;
Chris@10 418 T3y = FMA(TR, T3x, T3w);
Chris@10 419 T5W = TP * T3x;
Chris@10 420 T5C = T5z - T5B;
Chris@10 421 T7G = T5z + T5B;
Chris@10 422 T5D = T3y - T3E;
Chris@10 423 T3F = T3y + T3E;
Chris@10 424 T5X = FNMS(TR, T3v, T5W);
Chris@10 425 }
Chris@10 426 {
Chris@10 427 E T2L, T5q, T5m, T2z, T7v, T53, T2D, T5o;
Chris@10 428 {
Chris@10 429 E T2q, T50, T2y, T2A, T2C, T52, T2B, T5n;
Chris@10 430 {
Chris@10 431 E T2G, T2K, T2n, T4Z, T2t, T51;
Chris@10 432 {
Chris@10 433 E T2o, T2p, T60, T7H;
Chris@10 434 T2n = ri[WS(rs, 1)];
Chris@10 435 T6P = T5C + T5D;
Chris@10 436 T5E = T5C - T5D;
Chris@10 437 T7L = T3u - T3F;
Chris@10 438 T3G = T3u + T3F;
Chris@10 439 T60 = T5X - T5Z;
Chris@10 440 T7H = T5X + T5Z;
Chris@10 441 T2o = T2 * T2n;
Chris@10 442 T2p = ii[WS(rs, 1)];
Chris@10 443 T6M = T5V - T60;
Chris@10 444 T61 = T5V + T60;
Chris@10 445 T8n = T7G + T7H;
Chris@10 446 T7I = T7G - T7H;
Chris@10 447 T4Z = T2 * T2p;
Chris@10 448 T2q = FMA(T5, T2p, T2o);
Chris@10 449 }
Chris@10 450 T2G = ri[WS(rs, 25)];
Chris@10 451 T2K = ii[WS(rs, 25)];
Chris@10 452 T50 = FNMS(T5, T2n, T4Z);
Chris@10 453 {
Chris@10 454 E T2x, T2u, T2H, T5p;
Chris@10 455 T2t = ri[WS(rs, 17)];
Chris@10 456 T2H = T2F * T2G;
Chris@10 457 T5p = T2F * T2K;
Chris@10 458 T2x = ii[WS(rs, 17)];
Chris@10 459 T2u = T2s * T2t;
Chris@10 460 T2L = FMA(T2J, T2K, T2H);
Chris@10 461 T5q = FNMS(T2J, T2G, T5p);
Chris@10 462 T51 = T2s * T2x;
Chris@10 463 T2y = FMA(T2w, T2x, T2u);
Chris@10 464 }
Chris@10 465 T2A = ri[WS(rs, 9)];
Chris@10 466 T2C = ii[WS(rs, 9)];
Chris@10 467 T52 = FNMS(T2w, T2t, T51);
Chris@10 468 }
Chris@10 469 T5m = T2q - T2y;
Chris@10 470 T2z = T2q + T2y;
Chris@10 471 T2B = T8 * T2A;
Chris@10 472 T5n = T8 * T2C;
Chris@10 473 T7v = T50 + T52;
Chris@10 474 T53 = T50 - T52;
Chris@10 475 T2D = FMA(Tc, T2C, T2B);
Chris@10 476 T5o = FNMS(Tc, T2A, T5n);
Chris@10 477 }
Chris@10 478 {
Chris@10 479 E T3N, T3K, T3O, T5G, T41, T5Q, T3R, T3U, T3W;
Chris@10 480 {
Chris@10 481 E T3H, T3I, T3J, T3Y, T40, T5F, T3Z, T5P;
Chris@10 482 T3H = ri[WS(rs, 3)];
Chris@10 483 {
Chris@10 484 E T54, T2M, T5r, T7w;
Chris@10 485 T54 = T2D - T2L;
Chris@10 486 T2M = T2D + T2L;
Chris@10 487 T5r = T5o - T5q;
Chris@10 488 T7w = T5o + T5q;
Chris@10 489 T6I = T53 + T54;
Chris@10 490 T55 = T53 - T54;
Chris@10 491 T7A = T2z - T2M;
Chris@10 492 T2N = T2z + T2M;
Chris@10 493 T6F = T5m - T5r;
Chris@10 494 T5s = T5m + T5r;
Chris@10 495 T8i = T7v + T7w;
Chris@10 496 T7x = T7v - T7w;
Chris@10 497 T3I = T3 * T3H;
Chris@10 498 }
Chris@10 499 T3J = ii[WS(rs, 3)];
Chris@10 500 T3Y = ri[WS(rs, 11)];
Chris@10 501 T40 = ii[WS(rs, 11)];
Chris@10 502 T3N = ri[WS(rs, 19)];
Chris@10 503 T3K = FMA(T6, T3J, T3I);
Chris@10 504 T5F = T3 * T3J;
Chris@10 505 T3Z = Td * T3Y;
Chris@10 506 T5P = Td * T40;
Chris@10 507 T3O = T3M * T3N;
Chris@10 508 T5G = FNMS(T6, T3H, T5F);
Chris@10 509 T41 = FMA(Th, T40, T3Z);
Chris@10 510 T5Q = FNMS(Th, T3Y, T5P);
Chris@10 511 T3R = ii[WS(rs, 19)];
Chris@10 512 T3U = ri[WS(rs, 27)];
Chris@10 513 T3W = ii[WS(rs, 27)];
Chris@10 514 }
Chris@10 515 {
Chris@10 516 E T2O, T2P, T2Q, T37, T39, T56, T38, T5g;
Chris@10 517 {
Chris@10 518 E T3T, T5K, T5I, T3X, T5O, T7M, T5J;
Chris@10 519 T2O = ri[WS(rs, 5)];
Chris@10 520 {
Chris@10 521 E T3S, T5H, T3V, T5N;
Chris@10 522 T3S = FMA(T3Q, T3R, T3O);
Chris@10 523 T5H = T3M * T3R;
Chris@10 524 T3V = Te * T3U;
Chris@10 525 T5N = Te * T3W;
Chris@10 526 T3T = T3K + T3S;
Chris@10 527 T5K = T3K - T3S;
Chris@10 528 T5I = FNMS(T3Q, T3N, T5H);
Chris@10 529 T3X = FMA(Ti, T3W, T3V);
Chris@10 530 T5O = FNMS(Ti, T3U, T5N);
Chris@10 531 T2P = T29 * T2O;
Chris@10 532 }
Chris@10 533 T7M = T5G + T5I;
Chris@10 534 T5J = T5G - T5I;
Chris@10 535 {
Chris@10 536 E T42, T5M, T7N, T5R;
Chris@10 537 T42 = T3X + T41;
Chris@10 538 T5M = T3X - T41;
Chris@10 539 T7N = T5O + T5Q;
Chris@10 540 T5R = T5O - T5Q;
Chris@10 541 T5L = T5J - T5K;
Chris@10 542 T62 = T5K + T5J;
Chris@10 543 T43 = T3T + T42;
Chris@10 544 T7J = T42 - T3T;
Chris@10 545 T5S = T5M + T5R;
Chris@10 546 T63 = T5M - T5R;
Chris@10 547 T7O = T7M - T7N;
Chris@10 548 T8o = T7M + T7N;
Chris@10 549 T2Q = ii[WS(rs, 5)];
Chris@10 550 }
Chris@10 551 }
Chris@10 552 T37 = ri[WS(rs, 13)];
Chris@10 553 T39 = ii[WS(rs, 13)];
Chris@10 554 T2U = ri[WS(rs, 21)];
Chris@10 555 T2R = FMA(T2b, T2Q, T2P);
Chris@10 556 T56 = T29 * T2Q;
Chris@10 557 T38 = T1R * T37;
Chris@10 558 T5g = T1R * T39;
Chris@10 559 T2V = T2T * T2U;
Chris@10 560 T57 = FNMS(T2b, T2O, T56);
Chris@10 561 T3a = FMA(T1U, T39, T38);
Chris@10 562 T5h = FNMS(T1U, T37, T5g);
Chris@10 563 T2Y = ii[WS(rs, 21)];
Chris@10 564 T32 = ri[WS(rs, 29)];
Chris@10 565 T35 = ii[WS(rs, 29)];
Chris@10 566 }
Chris@10 567 }
Chris@10 568 }
Chris@10 569 }
Chris@10 570 }
Chris@10 571 {
Chris@10 572 E T5c, T5t, T5j, T5u, T88, T90, T8Z, T8b;
Chris@10 573 {
Chris@10 574 E T7e, T8T, T7y, T7D, T7h, T8U, T8S, T8R;
Chris@10 575 {
Chris@10 576 E T8c, T1i, T8A, T8z, T8O, T8J, T8N, T2l, T8L, T45, T8t, T8l, T8u, T8q, T3c;
Chris@10 577 E T8k, T8p, T8w, T2m;
Chris@10 578 {
Chris@10 579 E T8x, T8y, T8j, T8C, T8I;
Chris@10 580 {
Chris@10 581 E TI, T30, T5b, T59, T36, T5f, T1h, T7B, T5a;
Chris@10 582 TI = Tq + TH;
Chris@10 583 T7e = Tq - TH;
Chris@10 584 {
Chris@10 585 E T2Z, T58, T33, T5e;
Chris@10 586 T2Z = FMA(T2X, T2Y, T2V);
Chris@10 587 T58 = T2T * T2Y;
Chris@10 588 T33 = T31 * T32;
Chris@10 589 T5e = T31 * T35;
Chris@10 590 T30 = T2R + T2Z;
Chris@10 591 T5b = T2R - T2Z;
Chris@10 592 T59 = FNMS(T2X, T2U, T58);
Chris@10 593 T36 = FMA(T34, T35, T33);
Chris@10 594 T5f = FNMS(T34, T32, T5e);
Chris@10 595 T1h = TZ + T1g;
Chris@10 596 T8T = T1g - TZ;
Chris@10 597 }
Chris@10 598 T7B = T57 + T59;
Chris@10 599 T5a = T57 - T59;
Chris@10 600 {
Chris@10 601 E T3b, T5d, T7C, T5i;
Chris@10 602 T3b = T36 + T3a;
Chris@10 603 T5d = T36 - T3a;
Chris@10 604 T7C = T5f + T5h;
Chris@10 605 T5i = T5f - T5h;
Chris@10 606 T5c = T5a - T5b;
Chris@10 607 T5t = T5b + T5a;
Chris@10 608 T3c = T30 + T3b;
Chris@10 609 T7y = T3b - T30;
Chris@10 610 T5j = T5d + T5i;
Chris@10 611 T5u = T5d - T5i;
Chris@10 612 T7D = T7B - T7C;
Chris@10 613 T8j = T7B + T7C;
Chris@10 614 T8c = TI - T1h;
Chris@10 615 T1i = TI + T1h;
Chris@10 616 }
Chris@10 617 }
Chris@10 618 T8k = T8i - T8j;
Chris@10 619 T8x = T8i + T8j;
Chris@10 620 T8y = T8n + T8o;
Chris@10 621 T8p = T8n - T8o;
Chris@10 622 T7h = T7f - T7g;
Chris@10 623 T8C = T7f + T7g;
Chris@10 624 T8I = T8D + T8H;
Chris@10 625 T8U = T8H - T8D;
Chris@10 626 T8A = T8x + T8y;
Chris@10 627 T8z = T8x - T8y;
Chris@10 628 T8O = T8I - T8C;
Chris@10 629 T8J = T8C + T8I;
Chris@10 630 }
Chris@10 631 {
Chris@10 632 E T8h, T8m, T3d, T44;
Chris@10 633 T8h = T2N - T3c;
Chris@10 634 T3d = T2N + T3c;
Chris@10 635 T44 = T3G + T43;
Chris@10 636 T8m = T3G - T43;
Chris@10 637 T8N = T2k - T1J;
Chris@10 638 T2l = T1J + T2k;
Chris@10 639 T8L = T44 - T3d;
Chris@10 640 T45 = T3d + T44;
Chris@10 641 T8t = T8k - T8h;
Chris@10 642 T8l = T8h + T8k;
Chris@10 643 T8u = T8m + T8p;
Chris@10 644 T8q = T8m - T8p;
Chris@10 645 }
Chris@10 646 T8w = T1i - T2l;
Chris@10 647 T2m = T1i + T2l;
Chris@10 648 {
Chris@10 649 E T8s, T8P, T8Q, T8v;
Chris@10 650 {
Chris@10 651 E T8r, T8M, T8K, T8g, T8B, T8f;
Chris@10 652 T8S = T8q - T8l;
Chris@10 653 T8r = T8l + T8q;
Chris@10 654 T8B = T8d + T8e;
Chris@10 655 T8f = T8d - T8e;
/* First stores: results are written back into ri/ii, replacing the inputs loaded earlier in this iteration. */
Chris@10 656 ri[0] = T2m + T45;
Chris@10 657 ri[WS(rs, 16)] = T2m - T45;
Chris@10 658 ri[WS(rs, 8)] = T8w + T8z;
Chris@10 659 ri[WS(rs, 24)] = T8w - T8z;
Chris@10 660 T8M = T8J - T8B;
Chris@10 661 T8K = T8B + T8J;
Chris@10 662 T8g = T8c + T8f;
Chris@10 663 T8s = T8c - T8f;
Chris@10 664 T8R = T8O - T8N;
Chris@10 665 T8P = T8N + T8O;
Chris@10 666 ii[WS(rs, 24)] = T8M - T8L;
Chris@10 667 ii[WS(rs, 8)] = T8L + T8M;
Chris@10 668 ii[WS(rs, 16)] = T8K - T8A;
Chris@10 669 ii[0] = T8A + T8K;
Chris@10 670 ri[WS(rs, 4)] = FMA(KP707106781, T8r, T8g);
Chris@10 671 ri[WS(rs, 20)] = FNMS(KP707106781, T8r, T8g);
Chris@10 672 T8Q = T8t + T8u;
Chris@10 673 T8v = T8t - T8u;
Chris@10 674 }
Chris@10 675 ii[WS(rs, 20)] = FNMS(KP707106781, T8Q, T8P);
Chris@10 676 ii[WS(rs, 4)] = FMA(KP707106781, T8Q, T8P);
Chris@10 677 ri[WS(rs, 12)] = FMA(KP707106781, T8v, T8s);
Chris@10 678 ri[WS(rs, 28)] = FNMS(KP707106781, T8v, T8s);
Chris@10 679 }
Chris@10 680 }
Chris@10 681 {
Chris@10 682 E T7P, T7W, T7i, T7K, T8a, T86, T91, T8V, T8W, T7t, T7T, T7F, T92, T7Z, T89;
Chris@10 683 E T83;
Chris@10 684 {
Chris@10 685 E T7X, T7n, T7s, T7Y, T84, T85;
Chris@10 686 T7P = T7L - T7O;
Chris@10 687 T84 = T7L + T7O;
Chris@10 688 ii[WS(rs, 28)] = FNMS(KP707106781, T8S, T8R);
Chris@10 689 ii[WS(rs, 12)] = FMA(KP707106781, T8S, T8R);
Chris@10 690 T7W = T7e + T7h;
Chris@10 691 T7i = T7e - T7h;
Chris@10 692 T85 = T7I + T7J;
Chris@10 693 T7K = T7I - T7J;
Chris@10 694 T7X = T7m + T7l;
Chris@10 695 T7n = T7l - T7m;
Chris@10 696 T8a = FMA(KP414213562, T84, T85);
Chris@10 697 T86 = FNMS(KP414213562, T85, T84);
Chris@10 698 T91 = T8U - T8T;
Chris@10 699 T8V = T8T + T8U;
Chris@10 700 T7s = T7o + T7r;
Chris@10 701 T7Y = T7o - T7r;
Chris@10 702 {
Chris@10 703 E T82, T81, T7z, T7E;
Chris@10 704 T82 = T7x + T7y;
Chris@10 705 T7z = T7x - T7y;
Chris@10 706 T7E = T7A - T7D;
Chris@10 707 T81 = T7A + T7D;
Chris@10 708 T8W = T7n + T7s;
Chris@10 709 T7t = T7n - T7s;
Chris@10 710 T7T = FNMS(KP414213562, T7z, T7E);
Chris@10 711 T7F = FMA(KP414213562, T7E, T7z);
Chris@10 712 T92 = T7Y - T7X;
Chris@10 713 T7Z = T7X + T7Y;
Chris@10 714 T89 = FNMS(KP414213562, T81, T82);
Chris@10 715 T83 = FMA(KP414213562, T82, T81);
Chris@10 716 }
Chris@10 717 }
Chris@10 718 {
Chris@10 719 E T7S, T7u, T93, T95, T7U, T7Q;
Chris@10 720 T7S = FNMS(KP707106781, T7t, T7i);
Chris@10 721 T7u = FMA(KP707106781, T7t, T7i);
Chris@10 722 T93 = FMA(KP707106781, T92, T91);
Chris@10 723 T95 = FNMS(KP707106781, T92, T91);
Chris@10 724 T7U = FMA(KP414213562, T7K, T7P);
Chris@10 725 T7Q = FNMS(KP414213562, T7P, T7K);
Chris@10 726 {
Chris@10 727 E T80, T87, T8X, T8Y;
Chris@10 728 T88 = FNMS(KP707106781, T7Z, T7W);
Chris@10 729 T80 = FMA(KP707106781, T7Z, T7W);
Chris@10 730 {
Chris@10 731 E T7V, T94, T96, T7R;
Chris@10 732 T7V = T7T + T7U;
Chris@10 733 T94 = T7U - T7T;
Chris@10 734 T96 = T7F + T7Q;
Chris@10 735 T7R = T7F - T7Q;
Chris@10 736 ri[WS(rs, 30)] = FMA(KP923879532, T7V, T7S);
Chris@10 737 ri[WS(rs, 14)] = FNMS(KP923879532, T7V, T7S);
Chris@10 738 ii[WS(rs, 22)] = FNMS(KP923879532, T94, T93);
Chris@10 739 ii[WS(rs, 6)] = FMA(KP923879532, T94, T93);
Chris@10 740 ii[WS(rs, 30)] = FMA(KP923879532, T96, T95);
Chris@10 741 ii[WS(rs, 14)] = FNMS(KP923879532, T96, T95);
Chris@10 742 ri[WS(rs, 6)] = FMA(KP923879532, T7R, T7u);
Chris@10 743 ri[WS(rs, 22)] = FNMS(KP923879532, T7R, T7u);
Chris@10 744 T87 = T83 + T86;
Chris@10 745 T90 = T86 - T83;
Chris@10 746 }
Chris@10 747 T8Z = FNMS(KP707106781, T8W, T8V);
Chris@10 748 T8X = FMA(KP707106781, T8W, T8V);
Chris@10 749 T8Y = T89 + T8a;
Chris@10 750 T8b = T89 - T8a;
Chris@10 751 ri[WS(rs, 2)] = FMA(KP923879532, T87, T80);
Chris@10 752 ri[WS(rs, 18)] = FNMS(KP923879532, T87, T80);
Chris@10 753 ii[WS(rs, 18)] = FNMS(KP923879532, T8Y, T8X);
Chris@10 754 ii[WS(rs, 2)] = FMA(KP923879532, T8Y, T8X);
Chris@10 755 }
Chris@10 756 }
Chris@10 757 }
Chris@10 758 }
Chris@10 759 {
Chris@10 760 E T6s, T9o, T9n, T6v, T6N, T6Q, T6G, T6J, T9g, T9f;
Chris@10 761 {
Chris@10 762 E T6c, T4s, T9c, T4X, T9h, T9b, T9i, T6f, T5U, T6l, T64, T5k, T5v;
Chris@10 763 {
Chris@10 764 E T6d, T6e, T99, T9a, T5T;
Chris@10 765 {
Chris@10 766 E T4c, T4r, T4H, T4W;
Chris@10 767 T6s = T46 - T4b;
Chris@10 768 T4c = T46 + T4b;
Chris@10 769 ri[WS(rs, 10)] = FMA(KP923879532, T8b, T88);
Chris@10 770 ri[WS(rs, 26)] = FNMS(KP923879532, T8b, T88);
Chris@10 771 ii[WS(rs, 26)] = FNMS(KP923879532, T90, T8Z);
Chris@10 772 ii[WS(rs, 10)] = FMA(KP923879532, T90, T8Z);
Chris@10 773 T4r = T4j + T4q;
Chris@10 774 T9o = T4q - T4j;
Chris@10 775 T6d = FMA(KP414213562, T4z, T4G);
Chris@10 776 T4H = FNMS(KP414213562, T4G, T4z);
Chris@10 777 T4W = FMA(KP414213562, T4V, T4O);
Chris@10 778 T6e = FNMS(KP414213562, T4O, T4V);
Chris@10 779 T9n = T98 + T97;
Chris@10 780 T99 = T97 - T98;
Chris@10 781 T6c = FMA(KP707106781, T4r, T4c);
Chris@10 782 T4s = FNMS(KP707106781, T4r, T4c);
Chris@10 783 T9c = T4H + T4W;
Chris@10 784 T4X = T4H - T4W;
Chris@10 785 T9a = T6t + T6u;
Chris@10 786 T6v = T6t - T6u;
Chris@10 787 }
Chris@10 788 T6N = T5S - T5L;
Chris@10 789 T5T = T5L + T5S;
Chris@10 790 T9h = FNMS(KP707106781, T9a, T99);
Chris@10 791 T9b = FMA(KP707106781, T9a, T99);
Chris@10 792 T9i = T6e - T6d;
Chris@10 793 T6f = T6d + T6e;
Chris@10 794 T5U = FNMS(KP707106781, T5T, T5E);
Chris@10 795 T6l = FMA(KP707106781, T5T, T5E);
Chris@10 796 T64 = T62 + T63;
Chris@10 797 T6Q = T62 - T63;
Chris@10 798 T6G = T5j - T5c;
Chris@10 799 T5k = T5c + T5j;
Chris@10 800 T5v = T5t + T5u;
Chris@10 801 T6J = T5t - T5u;
Chris@10 802 }
Chris@10 803 {
Chris@10 804 E T6m, T6q, T6j, T6p, T9l, T9m;
Chris@10 805 {
Chris@10 806 E T68, T4Y, T6a, T66, T69, T5x, T9j, T6k, T65, T9k, T6b, T67;
Chris@10 807 T68 = FNMS(KP923879532, T4X, T4s);
Chris@10 808 T4Y = FMA(KP923879532, T4X, T4s);
Chris@10 809 T6k = FMA(KP707106781, T64, T61);
Chris@10 810 T65 = FNMS(KP707106781, T64, T61);
Chris@10 811 {
Chris@10 812 E T6i, T5l, T6h, T5w;
Chris@10 813 T6i = FMA(KP707106781, T5k, T55);
Chris@10 814 T5l = FNMS(KP707106781, T5k, T55);
Chris@10 815 T6h = FMA(KP707106781, T5v, T5s);
Chris@10 816 T5w = FNMS(KP707106781, T5v, T5s);
Chris@10 817 T6m = FNMS(KP198912367, T6l, T6k);
Chris@10 818 T6q = FMA(KP198912367, T6k, T6l);
Chris@10 819 T6a = FMA(KP668178637, T5U, T65);
Chris@10 820 T66 = FNMS(KP668178637, T65, T5U);
Chris@10 821 T6j = FMA(KP198912367, T6i, T6h);
Chris@10 822 T6p = FNMS(KP198912367, T6h, T6i);
Chris@10 823 T69 = FNMS(KP668178637, T5l, T5w);
Chris@10 824 T5x = FMA(KP668178637, T5w, T5l);
Chris@10 825 }
Chris@10 826 T9j = FMA(KP923879532, T9i, T9h);
Chris@10 827 T9l = FNMS(KP923879532, T9i, T9h);
Chris@10 828 T9k = T6a - T69;
Chris@10 829 T6b = T69 + T6a;
Chris@10 830 T9m = T5x + T66;
Chris@10 831 T67 = T5x - T66;
Chris@10 832 ii[WS(rs, 21)] = FNMS(KP831469612, T9k, T9j);
Chris@10 833 ii[WS(rs, 5)] = FMA(KP831469612, T9k, T9j);
Chris@10 834 ri[WS(rs, 5)] = FMA(KP831469612, T67, T4Y);
Chris@10 835 ri[WS(rs, 21)] = FNMS(KP831469612, T67, T4Y);
Chris@10 836 ri[WS(rs, 29)] = FMA(KP831469612, T6b, T68);
Chris@10 837 ri[WS(rs, 13)] = FNMS(KP831469612, T6b, T68);
Chris@10 838 }
Chris@10 839 {
Chris@10 840 E T6o, T9d, T9e, T6r, T6g, T6n;
Chris@10 841 T6o = FNMS(KP923879532, T6f, T6c);
Chris@10 842 T6g = FMA(KP923879532, T6f, T6c);
Chris@10 843 T6n = T6j + T6m;
Chris@10 844 T9g = T6m - T6j;
Chris@10 845 T9f = FNMS(KP923879532, T9c, T9b);
Chris@10 846 T9d = FMA(KP923879532, T9c, T9b);
Chris@10 847 ii[WS(rs, 29)] = FMA(KP831469612, T9m, T9l);
Chris@10 848 ii[WS(rs, 13)] = FNMS(KP831469612, T9m, T9l);
Chris@10 849 ri[WS(rs, 1)] = FMA(KP980785280, T6n, T6g);
Chris@10 850 ri[WS(rs, 17)] = FNMS(KP980785280, T6n, T6g);
Chris@10 851 T9e = T6p + T6q;
Chris@10 852 T6r = T6p - T6q;
Chris@10 853 ii[WS(rs, 17)] = FNMS(KP980785280, T9e, T9d);
Chris@10 854 ii[WS(rs, 1)] = FMA(KP980785280, T9e, T9d);
Chris@10 855 ri[WS(rs, 9)] = FMA(KP980785280, T6r, T6o);
Chris@10 856 ri[WS(rs, 25)] = FNMS(KP980785280, T6r, T6o);
Chris@10 857 }
Chris@10 858 }
Chris@10 859 }
Chris@10 860 {
Chris@10 861 E T6Y, T6w, T9w, T6D, T9v, T9p, T9q, T71, T6H, T74, T78, T7c, T6W, T6S;
Chris@10 862 {
Chris@10 863 E T6Z, T6z, T6C, T70;
Chris@10 864 T6Z = FNMS(KP414213562, T6x, T6y);
Chris@10 865 T6z = FMA(KP414213562, T6y, T6x);
Chris@10 866 ii[WS(rs, 25)] = FNMS(KP980785280, T9g, T9f);
Chris@10 867 ii[WS(rs, 9)] = FMA(KP980785280, T9g, T9f);
Chris@10 868 T6Y = FNMS(KP707106781, T6v, T6s);
Chris@10 869 T6w = FMA(KP707106781, T6v, T6s);
Chris@10 870 T6C = FNMS(KP414213562, T6B, T6A);
Chris@10 871 T70 = FMA(KP414213562, T6A, T6B);
Chris@10 872 T9w = T6z + T6C;
Chris@10 873 T6D = T6z - T6C;
Chris@10 874 T9v = FNMS(KP707106781, T9o, T9n);
Chris@10 875 T9p = FMA(KP707106781, T9o, T9n);
Chris@10 876 {
Chris@10 877 E T77, T6O, T76, T6R;
Chris@10 878 T9q = T70 - T6Z;
Chris@10 879 T71 = T6Z + T70;
Chris@10 880 T77 = FMA(KP707106781, T6N, T6M);
Chris@10 881 T6O = FNMS(KP707106781, T6N, T6M);
Chris@10 882 T76 = FMA(KP707106781, T6Q, T6P);
Chris@10 883 T6R = FNMS(KP707106781, T6Q, T6P);
Chris@10 884 T6H = FNMS(KP707106781, T6G, T6F);
Chris@10 885 T74 = FMA(KP707106781, T6G, T6F);
Chris@10 886 T78 = FNMS(KP198912367, T77, T76);
Chris@10 887 T7c = FMA(KP198912367, T76, T77);
Chris@10 888 T6W = FMA(KP668178637, T6O, T6R);
Chris@10 889 T6S = FNMS(KP668178637, T6R, T6O);
Chris@10 890 }
Chris@10 891 }
Chris@10 892 {
Chris@10 893 E T6U, T6E, T9r, T9t, T73, T6K;
Chris@10 894 T6U = FNMS(KP923879532, T6D, T6w);
Chris@10 895 T6E = FMA(KP923879532, T6D, T6w);
Chris@10 896 T9r = FMA(KP923879532, T9q, T9p);
Chris@10 897 T9t = FNMS(KP923879532, T9q, T9p);
Chris@10 898 T73 = FMA(KP707106781, T6J, T6I);
Chris@10 899 T6K = FNMS(KP707106781, T6J, T6I);
Chris@10 900 {
Chris@10 901 E T7a, T9x, T9y, T7d;
Chris@10 902 {
Chris@10 903 E T72, T7b, T6V, T6L, T79, T75;
Chris@10 904 T7a = FMA(KP923879532, T71, T6Y);
Chris@10 905 T72 = FNMS(KP923879532, T71, T6Y);
Chris@10 906 T75 = FMA(KP198912367, T74, T73);
Chris@10 907 T7b = FNMS(KP198912367, T73, T74);
Chris@10 908 T6V = FNMS(KP668178637, T6H, T6K);
Chris@10 909 T6L = FMA(KP668178637, T6K, T6H);
Chris@10 910 T79 = T75 - T78;
Chris@10 911 T9A = T75 + T78;
Chris@10 912 T9z = FMA(KP923879532, T9w, T9v);
Chris@10 913 T9x = FNMS(KP923879532, T9w, T9v);
Chris@10 914 {
Chris@10 915 E T6X, T9s, T9u, T6T;
Chris@10 916 T6X = T6V - T6W;
Chris@10 917 T9s = T6V + T6W;
Chris@10 918 T9u = T6S - T6L;
Chris@10 919 T6T = T6L + T6S;
Chris@10 920 ri[WS(rs, 7)] = FMA(KP980785280, T79, T72);
Chris@10 921 ri[WS(rs, 23)] = FNMS(KP980785280, T79, T72);
Chris@10 922 ri[WS(rs, 11)] = FMA(KP831469612, T6X, T6U);
Chris@10 923 ri[WS(rs, 27)] = FNMS(KP831469612, T6X, T6U);
Chris@10 924 ii[WS(rs, 19)] = FNMS(KP831469612, T9s, T9r);
Chris@10 925 ii[WS(rs, 3)] = FMA(KP831469612, T9s, T9r);
Chris@10 926 ii[WS(rs, 27)] = FNMS(KP831469612, T9u, T9t);
Chris@10 927 ii[WS(rs, 11)] = FMA(KP831469612, T9u, T9t);
Chris@10 928 ri[WS(rs, 3)] = FMA(KP831469612, T6T, T6E);
Chris@10 929 ri[WS(rs, 19)] = FNMS(KP831469612, T6T, T6E);
Chris@10 930 T9y = T7c - T7b;
Chris@10 931 T7d = T7b + T7c;
Chris@10 932 }
Chris@10 933 }
Chris@10 934 ii[WS(rs, 23)] = FNMS(KP980785280, T9y, T9x);
Chris@10 935 ii[WS(rs, 7)] = FMA(KP980785280, T9y, T9x);
Chris@10 936 ri[WS(rs, 31)] = FMA(KP980785280, T7d, T7a);
Chris@10 937 ri[WS(rs, 15)] = FNMS(KP980785280, T7d, T7a);
Chris@10 938 }
Chris@10 939 }
Chris@10 940 }
Chris@10 941 }
Chris@10 942 }
Chris@10 943 }
Chris@10 944 }
/* Last two stores of the iteration, fed by the loop-level temporaries T9A and T9z. */
Chris@10 945 ii[WS(rs, 31)] = FMA(KP980785280, T9A, T9z);
Chris@10 946 ii[WS(rs, 15)] = FNMS(KP980785280, T9A, T9z);
Chris@10 947 }
Chris@10 948 }
Chris@10 949 }
Chris@10 950
/*
 * Twiddle instruction table referenced by the codelet descriptor below: four
 * TW_CEXP entries whose last field is 1, 3, 9 and 27, closed by a TW_NEXT
 * entry.
 * NOTE(review): the tw_instr field layout is declared outside this file;
 * presumably the last field is the twiddle exponent -- confirm in the headers.
 */
Chris@10 951 static const tw_instr twinstr[] = {
Chris@10 952 {TW_CEXP, 0, 1},
Chris@10 953 {TW_CEXP, 0, 3},
Chris@10 954 {TW_CEXP, 0, 9},
Chris@10 955 {TW_CEXP, 0, 27},
Chris@10 956 {TW_NEXT, 1, 0}
Chris@10 957 };
Chris@10 958
/*
 * Descriptor for the HAVE_FMA build of t2_32: a leading 32, the name string,
 * the twinstr table, &GENUS, and the count tuple {236, 98, 252, 0}, which
 * matches the "236 additions, 98 multiplications, 252 fused multiply/add"
 * figures in the generator's header comment for this variant.
 * NOTE(review): ct_desc field meanings (including the three trailing zeros)
 * come from headers not shown here -- confirm before editing by hand.
 */
Chris@10 959 static const ct_desc desc = { 32, "t2_32", twinstr, &GENUS, {236, 98, 252, 0}, 0, 0, 0 };
Chris@10 960
/*
 * Registration entry point (HAVE_FMA build): hands the t2_32 kernel and its
 * descriptor to the planner p through X(kdft_dit_register).
 */
Chris@10 961 void X(codelet_t2_32) (planner *p) {
Chris@10 962 X(kdft_dit_register) (p, t2_32, &desc);
Chris@10 963 }
Chris@10 964 #else /* HAVE_FMA */
Chris@10 965
Chris@10 966 /* Generated by: ../../../genfft/gen_twiddle.native -compact -variables 4 -pipeline-latency 4 -twiddle-log3 -precompute-twiddles -n 32 -name t2_32 -include t.h */
Chris@10 967
Chris@10 968 /*
Chris@10 969 * This function contains 488 FP additions, 280 FP multiplications,
Chris@10 970 * (or, 376 additions, 168 multiplications, 112 fused multiply/add),
Chris@10 971 * 158 stack variables, 7 constants, and 128 memory accesses
Chris@10 972 */
Chris@10 973 #include "t.h"
Chris@10 974
/*
 * t2_32 (non-FMA variant): one 32-point pass over the split real/imaginary
 * arrays ri[] and ii[], performed in place.
 *
 * ri, ii : real and imaginary data; elements are addressed as
 *          ri[WS(rs, k)] / ii[WS(rs, k)] for k = 0..31.
 * W      : table of multipliers; 8 values are consumed per iteration,
 *          starting at W + mb * 8.
 * rs     : stride handed to WS() to locate element k.
 * mb, me : the loop runs for m = mb, mb + 1, ..., me - 1.
 * ms     : added to ri and ii after every iteration.
 *
 * NOTE(review): the comments below assume FMA(a, b, c) = a * b + c and
 * FNMS(a, b, c) = c - a * b, and that DK() declares a named constant; none of
 * these macros is defined in this file -- confirm against the headers.
 * This code is generator output ("DO NOT EDIT"); statement order is left
 * untouched on purpose.
 */
Chris@10 975 static void t2_32(R *ri, R *ii, const R *W, stride rs, INT mb, INT me, INT ms)
Chris@10 976 {
/* The seven constants are numerically sin/cos of pi/16, 3*pi/16 and pi/8, plus sqrt(1/2). */
Chris@10 977 DK(KP195090322, +0.195090322016128267848284868477022240927691618);
Chris@10 978 DK(KP980785280, +0.980785280403230449126182236134239036973933731);
Chris@10 979 DK(KP555570233, +0.555570233019602224742830813948532874374937191);
Chris@10 980 DK(KP831469612, +0.831469612302545237078788377617905756738560812);
Chris@10 981 DK(KP382683432, +0.382683432365089771728459984030398866761344562);
Chris@10 982 DK(KP923879532, +0.923879532511286756128183189396788286822416626);
Chris@10 983 DK(KP707106781, +0.707106781186547524400844362104849039284835938);
Chris@10 984 {
Chris@10 985 INT m;
/* W is first moved to W + mb * 8; each iteration then steps ri and ii by ms and W by 8. */
Chris@10 986 for (m = mb, W = W + (mb * 8); m < me; m = m + 1, ri = ri + ms, ii = ii + ms, W = W + 8, MAKE_VOLATILE_STRIDE(64, rs)) {
Chris@10 987 E T2, T5, T3, T6, T8, TM, TO, Td, T9, Te, Th, Tl, TD, TH, T1y;
Chris@10 988 E T1H, T15, T1A, T11, T1F, T1n, T1p, T2q, T2I, T2u, T2K, T2V, T3b, T2Z, T3d;
Chris@10 989 E Tu, Ty, T3l, T3n, T1t, T1v, T2f, T2h, T1a, T1e, T32, T34, T1W, T1Y, T2C;
Chris@10 990 E T2E, Tg, TR, Tk, TS, Tm, TV, To, TT, T1M, T21, T1P, T22, T1Q, T25;
Chris@10 991 E T1S, T23;
/*
 * Multiplier set-up: read the 8 table values as four pairs -- (T2, T5),
 * (T3, T6), (T9, Te), (Th, Tl) -- and form sums/differences of their pairwise
 * products. The raw pairs are applied further down to elements 1, 3, 9 and 27
 * (the same numbers listed in twinstr); every other multiplier pair used below
 * is one of the combinations derived here. Element 0 gets no multiplier.
 */
Chris@10 992 {
Chris@10 993 E Ts, T1d, Tx, T18, Tt, T1c, Tw, T19, TB, T14, TG, TZ, TC, T13, TF;
Chris@10 994 E T10;
Chris@10 995 {
Chris@10 996 E T4, Tc, T7, Tb;
Chris@10 997 T2 = W[0];
Chris@10 998 T5 = W[1];
Chris@10 999 T3 = W[2];
Chris@10 1000 T6 = W[3];
Chris@10 1001 T4 = T2 * T3;
Chris@10 1002 Tc = T5 * T3;
Chris@10 1003 T7 = T5 * T6;
Chris@10 1004 Tb = T2 * T6;
Chris@10 1005 T8 = T4 + T7;
Chris@10 1006 TM = T4 - T7;
Chris@10 1007 TO = Tb + Tc;
Chris@10 1008 Td = Tb - Tc;
Chris@10 1009 T9 = W[4];
Chris@10 1010 Ts = T2 * T9;
Chris@10 1011 T1d = T6 * T9;
Chris@10 1012 Tx = T5 * T9;
Chris@10 1013 T18 = T3 * T9;
Chris@10 1014 Te = W[5];
Chris@10 1015 Tt = T5 * Te;
Chris@10 1016 T1c = T3 * Te;
Chris@10 1017 Tw = T2 * Te;
Chris@10 1018 T19 = T6 * Te;
Chris@10 1019 Th = W[6];
Chris@10 1020 TB = T3 * Th;
Chris@10 1021 T14 = T5 * Th;
Chris@10 1022 TG = T6 * Th;
Chris@10 1023 TZ = T2 * Th;
Chris@10 1024 Tl = W[7];
Chris@10 1025 TC = T6 * Tl;
Chris@10 1026 T13 = T2 * Tl;
Chris@10 1027 TF = T3 * Tl;
Chris@10 1028 T10 = T5 * Tl;
Chris@10 1029 }
Chris@10 1030 TD = TB + TC;
Chris@10 1031 TH = TF - TG;
Chris@10 1032 T1y = TZ + T10;
Chris@10 1033 T1H = TF + TG;
Chris@10 1034 T15 = T13 + T14;
Chris@10 1035 T1A = T13 - T14;
Chris@10 1036 T11 = TZ - T10;
Chris@10 1037 T1F = TB - TC;
Chris@10 1038 T1n = FMA(T9, Th, Te * Tl);
Chris@10 1039 T1p = FNMS(Te, Th, T9 * Tl);
Chris@10 1040 {
Chris@10 1041 E T2o, T2p, T2s, T2t;
Chris@10 1042 T2o = T8 * Th;
Chris@10 1043 T2p = Td * Tl;
Chris@10 1044 T2q = T2o + T2p;
Chris@10 1045 T2I = T2o - T2p;
Chris@10 1046 T2s = T8 * Tl;
Chris@10 1047 T2t = Td * Th;
Chris@10 1048 T2u = T2s - T2t;
Chris@10 1049 T2K = T2s + T2t;
Chris@10 1050 }
Chris@10 1051 {
Chris@10 1052 E T2T, T2U, T2X, T2Y;
Chris@10 1053 T2T = TM * Th;
Chris@10 1054 T2U = TO * Tl;
Chris@10 1055 T2V = T2T - T2U;
Chris@10 1056 T3b = T2T + T2U;
Chris@10 1057 T2X = TM * Tl;
Chris@10 1058 T2Y = TO * Th;
Chris@10 1059 T2Z = T2X + T2Y;
Chris@10 1060 T3d = T2X - T2Y;
Chris@10 1061 Tu = Ts + Tt;
Chris@10 1062 Ty = Tw - Tx;
Chris@10 1063 T3l = FMA(Tu, Th, Ty * Tl);
Chris@10 1064 T3n = FNMS(Ty, Th, Tu * Tl);
Chris@10 1065 }
Chris@10 1066 T1t = Ts - Tt;
Chris@10 1067 T1v = Tw + Tx;
Chris@10 1068 T2f = FMA(T1t, Th, T1v * Tl);
Chris@10 1069 T2h = FNMS(T1v, Th, T1t * Tl);
Chris@10 1070 T1a = T18 - T19;
Chris@10 1071 T1e = T1c + T1d;
Chris@10 1072 T32 = FMA(T1a, Th, T1e * Tl);
Chris@10 1073 T34 = FNMS(T1e, Th, T1a * Tl);
Chris@10 1074 T1W = T18 + T19;
Chris@10 1075 T1Y = T1c - T1d;
Chris@10 1076 T2C = FMA(T1W, Th, T1Y * Tl);
Chris@10 1077 T2E = FNMS(T1Y, Th, T1W * Tl);
Chris@10 1078 {
Chris@10 1079 E Ta, Tf, Ti, Tj;
Chris@10 1080 Ta = T8 * T9;
Chris@10 1081 Tf = Td * Te;
Chris@10 1082 Tg = Ta - Tf;
Chris@10 1083 TR = Ta + Tf;
Chris@10 1084 Ti = T8 * Te;
Chris@10 1085 Tj = Td * T9;
Chris@10 1086 Tk = Ti + Tj;
Chris@10 1087 TS = Ti - Tj;
Chris@10 1088 }
Chris@10 1089 Tm = FMA(Tg, Th, Tk * Tl);
Chris@10 1090 TV = FNMS(TS, Th, TR * Tl);
Chris@10 1091 To = FNMS(Tk, Th, Tg * Tl);
Chris@10 1092 TT = FMA(TR, Th, TS * Tl);
Chris@10 1093 {
Chris@10 1094 E T1K, T1L, T1N, T1O;
Chris@10 1095 T1K = TM * T9;
Chris@10 1096 T1L = TO * Te;
Chris@10 1097 T1M = T1K - T1L;
Chris@10 1098 T21 = T1K + T1L;
Chris@10 1099 T1N = TM * Te;
Chris@10 1100 T1O = TO * T9;
Chris@10 1101 T1P = T1N + T1O;
Chris@10 1102 T22 = T1N - T1O;
Chris@10 1103 }
Chris@10 1104 T1Q = FMA(T1M, Th, T1P * Tl);
Chris@10 1105 T25 = FNMS(T22, Th, T21 * Tl);
Chris@10 1106 T1S = FNMS(T1P, Th, T1M * Tl);
Chris@10 1107 T23 = FMA(T21, Th, T22 * Tl);
Chris@10 1108 }
/*
 * Load phase: read the 32 inputs four at a time, apply the multiplier pairs and
 * form the first sums/differences. A pair (a, b) is applied to (re, im) as
 *   re' = a * re + b * im,   im' = a * im - b * re
 * (given the FMA/FNMS assumption stated in the header comment).
 */
Chris@10 1109 {
Chris@10 1110 E TL, T6f, T8c, T8q, T3F, T5t, T7I, T7W, T2y, T6B, T6y, T7j, T4k, T5J, T4B;
Chris@10 1111 E T5G, T3h, T6H, T6O, T7o, T4L, T5N, T52, T5Q, T1i, T7V, T6i, T7D, T3K, T5u;
Chris@10 1112 E T3P, T5v, T1E, T6n, T6m, T7e, T3W, T5y, T41, T5z, T29, T6p, T6s, T7f, T47;
Chris@10 1113 E T5B, T4c, T5C, T2R, T6z, T6E, T7k, T4v, T5H, T4E, T5K, T3y, T6P, T6K, T7p;
Chris@10 1114 E T4W, T5R, T55, T5O;
/* Inputs 0, 16, 8, 24 (element 0 is used as read, without a multiplier). */
Chris@10 1115 {
Chris@10 1116 E T1, T7G, Tq, T7F, TA, T3C, TJ, T3D, Tn, Tp;
Chris@10 1117 T1 = ri[0];
Chris@10 1118 T7G = ii[0];
Chris@10 1119 Tn = ri[WS(rs, 16)];
Chris@10 1120 Tp = ii[WS(rs, 16)];
Chris@10 1121 Tq = FMA(Tm, Tn, To * Tp);
Chris@10 1122 T7F = FNMS(To, Tn, Tm * Tp);
Chris@10 1123 {
Chris@10 1124 E Tv, Tz, TE, TI;
Chris@10 1125 Tv = ri[WS(rs, 8)];
Chris@10 1126 Tz = ii[WS(rs, 8)];
Chris@10 1127 TA = FMA(Tu, Tv, Ty * Tz);
Chris@10 1128 T3C = FNMS(Ty, Tv, Tu * Tz);
Chris@10 1129 TE = ri[WS(rs, 24)];
Chris@10 1130 TI = ii[WS(rs, 24)];
Chris@10 1131 TJ = FMA(TD, TE, TH * TI);
Chris@10 1132 T3D = FNMS(TH, TE, TD * TI);
Chris@10 1133 }
Chris@10 1134 {
Chris@10 1135 E Tr, TK, T8a, T8b;
Chris@10 1136 Tr = T1 + Tq;
Chris@10 1137 TK = TA + TJ;
Chris@10 1138 TL = Tr + TK;
Chris@10 1139 T6f = Tr - TK;
Chris@10 1140 T8a = T7G - T7F;
Chris@10 1141 T8b = TA - TJ;
Chris@10 1142 T8c = T8a - T8b;
Chris@10 1143 T8q = T8b + T8a;
Chris@10 1144 }
Chris@10 1145 {
Chris@10 1146 E T3B, T3E, T7E, T7H;
Chris@10 1147 T3B = T1 - Tq;
Chris@10 1148 T3E = T3C - T3D;
Chris@10 1149 T3F = T3B - T3E;
Chris@10 1150 T5t = T3B + T3E;
Chris@10 1151 T7E = T3C + T3D;
Chris@10 1152 T7H = T7F + T7G;
Chris@10 1153 T7I = T7E + T7H;
Chris@10 1154 T7W = T7H - T7E;
Chris@10 1155 }
Chris@10 1156 }
/* Inputs 1, 25, 17, 9 (elements 1 and 9 use the raw table pairs). */
Chris@10 1157 {
Chris@10 1158 E T2e, T4g, T2w, T4z, T2j, T4h, T2n, T4y;
Chris@10 1159 {
Chris@10 1160 E T2c, T2d, T2r, T2v;
Chris@10 1161 T2c = ri[WS(rs, 1)];
Chris@10 1162 T2d = ii[WS(rs, 1)];
Chris@10 1163 T2e = FMA(T2, T2c, T5 * T2d);
Chris@10 1164 T4g = FNMS(T5, T2c, T2 * T2d);
Chris@10 1165 T2r = ri[WS(rs, 25)];
Chris@10 1166 T2v = ii[WS(rs, 25)];
Chris@10 1167 T2w = FMA(T2q, T2r, T2u * T2v);
Chris@10 1168 T4z = FNMS(T2u, T2r, T2q * T2v);
Chris@10 1169 }
Chris@10 1170 {
Chris@10 1171 E T2g, T2i, T2l, T2m;
Chris@10 1172 T2g = ri[WS(rs, 17)];
Chris@10 1173 T2i = ii[WS(rs, 17)];
Chris@10 1174 T2j = FMA(T2f, T2g, T2h * T2i);
Chris@10 1175 T4h = FNMS(T2h, T2g, T2f * T2i);
Chris@10 1176 T2l = ri[WS(rs, 9)];
Chris@10 1177 T2m = ii[WS(rs, 9)];
Chris@10 1178 T2n = FMA(T9, T2l, Te * T2m);
Chris@10 1179 T4y = FNMS(Te, T2l, T9 * T2m);
Chris@10 1180 }
Chris@10 1181 {
Chris@10 1182 E T2k, T2x, T6w, T6x;
Chris@10 1183 T2k = T2e + T2j;
Chris@10 1184 T2x = T2n + T2w;
Chris@10 1185 T2y = T2k + T2x;
Chris@10 1186 T6B = T2k - T2x;
Chris@10 1187 T6w = T4g + T4h;
Chris@10 1188 T6x = T4y + T4z;
Chris@10 1189 T6y = T6w - T6x;
Chris@10 1190 T7j = T6w + T6x;
Chris@10 1191 }
Chris@10 1192 {
Chris@10 1193 E T4i, T4j, T4x, T4A;
Chris@10 1194 T4i = T4g - T4h;
Chris@10 1195 T4j = T2n - T2w;
Chris@10 1196 T4k = T4i + T4j;
Chris@10 1197 T5J = T4i - T4j;
Chris@10 1198 T4x = T2e - T2j;
Chris@10 1199 T4A = T4y - T4z;
Chris@10 1200 T4B = T4x - T4A;
Chris@10 1201 T5G = T4x + T4A;
Chris@10 1202 }
Chris@10 1203 }
/* Inputs 31, 23, 15, 7. */
Chris@10 1204 {
Chris@10 1205 E T31, T4Y, T3f, T4J, T36, T4Z, T3a, T4I;
Chris@10 1206 {
Chris@10 1207 E T2W, T30, T3c, T3e;
Chris@10 1208 T2W = ri[WS(rs, 31)];
Chris@10 1209 T30 = ii[WS(rs, 31)];
Chris@10 1210 T31 = FMA(T2V, T2W, T2Z * T30);
Chris@10 1211 T4Y = FNMS(T2Z, T2W, T2V * T30);
Chris@10 1212 T3c = ri[WS(rs, 23)];
Chris@10 1213 T3e = ii[WS(rs, 23)];
Chris@10 1214 T3f = FMA(T3b, T3c, T3d * T3e);
Chris@10 1215 T4J = FNMS(T3d, T3c, T3b * T3e);
Chris@10 1216 }
Chris@10 1217 {
Chris@10 1218 E T33, T35, T38, T39;
Chris@10 1219 T33 = ri[WS(rs, 15)];
Chris@10 1220 T35 = ii[WS(rs, 15)];
Chris@10 1221 T36 = FMA(T32, T33, T34 * T35);
Chris@10 1222 T4Z = FNMS(T34, T33, T32 * T35);
Chris@10 1223 T38 = ri[WS(rs, 7)];
Chris@10 1224 T39 = ii[WS(rs, 7)];
Chris@10 1225 T3a = FMA(TR, T38, TS * T39);
Chris@10 1226 T4I = FNMS(TS, T38, TR * T39);
Chris@10 1227 }
Chris@10 1228 {
Chris@10 1229 E T37, T3g, T6M, T6N;
Chris@10 1230 T37 = T31 + T36;
Chris@10 1231 T3g = T3a + T3f;
Chris@10 1232 T3h = T37 + T3g;
Chris@10 1233 T6H = T37 - T3g;
Chris@10 1234 T6M = T4Y + T4Z;
Chris@10 1235 T6N = T4I + T4J;
Chris@10 1236 T6O = T6M - T6N;
Chris@10 1237 T7o = T6M + T6N;
Chris@10 1238 }
Chris@10 1239 {
Chris@10 1240 E T4H, T4K, T50, T51;
Chris@10 1241 T4H = T31 - T36;
Chris@10 1242 T4K = T4I - T4J;
Chris@10 1243 T4L = T4H - T4K;
Chris@10 1244 T5N = T4H + T4K;
Chris@10 1245 T50 = T4Y - T4Z;
Chris@10 1246 T51 = T3a - T3f;
Chris@10 1247 T52 = T50 + T51;
Chris@10 1248 T5Q = T50 - T51;
Chris@10 1249 }
Chris@10 1250 }
/* Inputs 4, 12, 20, 28. */
Chris@10 1251 {
Chris@10 1252 E TQ, T3G, T1g, T3N, TX, T3H, T17, T3M;
Chris@10 1253 {
Chris@10 1254 E TN, TP, T1b, T1f;
Chris@10 1255 TN = ri[WS(rs, 4)];
Chris@10 1256 TP = ii[WS(rs, 4)];
Chris@10 1257 TQ = FMA(TM, TN, TO * TP);
Chris@10 1258 T3G = FNMS(TO, TN, TM * TP);
Chris@10 1259 T1b = ri[WS(rs, 12)];
Chris@10 1260 T1f = ii[WS(rs, 12)];
Chris@10 1261 T1g = FMA(T1a, T1b, T1e * T1f);
Chris@10 1262 T3N = FNMS(T1e, T1b, T1a * T1f);
Chris@10 1263 }
Chris@10 1264 {
Chris@10 1265 E TU, TW, T12, T16;
Chris@10 1266 TU = ri[WS(rs, 20)];
Chris@10 1267 TW = ii[WS(rs, 20)];
Chris@10 1268 TX = FMA(TT, TU, TV * TW);
Chris@10 1269 T3H = FNMS(TV, TU, TT * TW);
Chris@10 1270 T12 = ri[WS(rs, 28)];
Chris@10 1271 T16 = ii[WS(rs, 28)];
Chris@10 1272 T17 = FMA(T11, T12, T15 * T16);
Chris@10 1273 T3M = FNMS(T15, T12, T11 * T16);
Chris@10 1274 }
Chris@10 1275 {
Chris@10 1276 E TY, T1h, T6g, T6h;
Chris@10 1277 TY = TQ + TX;
Chris@10 1278 T1h = T17 + T1g;
Chris@10 1279 T1i = TY + T1h;
Chris@10 1280 T7V = T1h - TY;
Chris@10 1281 T6g = T3G + T3H;
Chris@10 1282 T6h = T3M + T3N;
Chris@10 1283 T6i = T6g - T6h;
Chris@10 1284 T7D = T6g + T6h;
Chris@10 1285 }
Chris@10 1286 {
Chris@10 1287 E T3I, T3J, T3L, T3O;
Chris@10 1288 T3I = T3G - T3H;
Chris@10 1289 T3J = TQ - TX;
Chris@10 1290 T3K = T3I - T3J;
Chris@10 1291 T5u = T3J + T3I;
Chris@10 1292 T3L = T17 - T1g;
Chris@10 1293 T3O = T3M - T3N;
Chris@10 1294 T3P = T3L + T3O;
Chris@10 1295 T5v = T3L - T3O;
Chris@10 1296 }
Chris@10 1297 }
/* Inputs 2, 26, 18, 10. */
Chris@10 1298 {
Chris@10 1299 E T1m, T3S, T1C, T3Z, T1r, T3T, T1x, T3Y;
Chris@10 1300 {
Chris@10 1301 E T1k, T1l, T1z, T1B;
Chris@10 1302 T1k = ri[WS(rs, 2)];
Chris@10 1303 T1l = ii[WS(rs, 2)];
Chris@10 1304 T1m = FMA(T8, T1k, Td * T1l);
Chris@10 1305 T3S = FNMS(Td, T1k, T8 * T1l);
Chris@10 1306 T1z = ri[WS(rs, 26)];
Chris@10 1307 T1B = ii[WS(rs, 26)];
Chris@10 1308 T1C = FMA(T1y, T1z, T1A * T1B);
Chris@10 1309 T3Z = FNMS(T1A, T1z, T1y * T1B);
Chris@10 1310 }
Chris@10 1311 {
Chris@10 1312 E T1o, T1q, T1u, T1w;
Chris@10 1313 T1o = ri[WS(rs, 18)];
Chris@10 1314 T1q = ii[WS(rs, 18)];
Chris@10 1315 T1r = FMA(T1n, T1o, T1p * T1q);
Chris@10 1316 T3T = FNMS(T1p, T1o, T1n * T1q);
Chris@10 1317 T1u = ri[WS(rs, 10)];
Chris@10 1318 T1w = ii[WS(rs, 10)];
Chris@10 1319 T1x = FMA(T1t, T1u, T1v * T1w);
Chris@10 1320 T3Y = FNMS(T1v, T1u, T1t * T1w);
Chris@10 1321 }
Chris@10 1322 {
Chris@10 1323 E T1s, T1D, T6k, T6l;
Chris@10 1324 T1s = T1m + T1r;
Chris@10 1325 T1D = T1x + T1C;
Chris@10 1326 T1E = T1s + T1D;
Chris@10 1327 T6n = T1s - T1D;
Chris@10 1328 T6k = T3S + T3T;
Chris@10 1329 T6l = T3Y + T3Z;
Chris@10 1330 T6m = T6k - T6l;
Chris@10 1331 T7e = T6k + T6l;
Chris@10 1332 }
Chris@10 1333 {
Chris@10 1334 E T3U, T3V, T3X, T40;
Chris@10 1335 T3U = T3S - T3T;
Chris@10 1336 T3V = T1x - T1C;
Chris@10 1337 T3W = T3U + T3V;
Chris@10 1338 T5y = T3U - T3V;
Chris@10 1339 T3X = T1m - T1r;
Chris@10 1340 T40 = T3Y - T3Z;
Chris@10 1341 T41 = T3X - T40;
Chris@10 1342 T5z = T3X + T40;
Chris@10 1343 }
Chris@10 1344 }
/* Inputs 30, 22, 14, 6. */
Chris@10 1345 {
Chris@10 1346 E T1J, T43, T27, T4a, T1U, T44, T20, T49;
Chris@10 1347 {
Chris@10 1348 E T1G, T1I, T24, T26;
Chris@10 1349 T1G = ri[WS(rs, 30)];
Chris@10 1350 T1I = ii[WS(rs, 30)];
Chris@10 1351 T1J = FMA(T1F, T1G, T1H * T1I);
Chris@10 1352 T43 = FNMS(T1H, T1G, T1F * T1I);
Chris@10 1353 T24 = ri[WS(rs, 22)];
Chris@10 1354 T26 = ii[WS(rs, 22)];
Chris@10 1355 T27 = FMA(T23, T24, T25 * T26);
Chris@10 1356 T4a = FNMS(T25, T24, T23 * T26);
Chris@10 1357 }
Chris@10 1358 {
Chris@10 1359 E T1R, T1T, T1X, T1Z;
Chris@10 1360 T1R = ri[WS(rs, 14)];
Chris@10 1361 T1T = ii[WS(rs, 14)];
Chris@10 1362 T1U = FMA(T1Q, T1R, T1S * T1T);
Chris@10 1363 T44 = FNMS(T1S, T1R, T1Q * T1T);
Chris@10 1364 T1X = ri[WS(rs, 6)];
Chris@10 1365 T1Z = ii[WS(rs, 6)];
Chris@10 1366 T20 = FMA(T1W, T1X, T1Y * T1Z);
Chris@10 1367 T49 = FNMS(T1Y, T1X, T1W * T1Z);
Chris@10 1368 }
Chris@10 1369 {
Chris@10 1370 E T1V, T28, T6q, T6r;
Chris@10 1371 T1V = T1J + T1U;
Chris@10 1372 T28 = T20 + T27;
Chris@10 1373 T29 = T1V + T28;
Chris@10 1374 T6p = T1V - T28;
Chris@10 1375 T6q = T43 + T44;
Chris@10 1376 T6r = T49 + T4a;
Chris@10 1377 T6s = T6q - T6r;
Chris@10 1378 T7f = T6q + T6r;
Chris@10 1379 }
Chris@10 1380 {
Chris@10 1381 E T45, T46, T48, T4b;
Chris@10 1382 T45 = T43 - T44;
Chris@10 1383 T46 = T20 - T27;
Chris@10 1384 T47 = T45 + T46;
Chris@10 1385 T5B = T45 - T46;
Chris@10 1386 T48 = T1J - T1U;
Chris@10 1387 T4b = T49 - T4a;
Chris@10 1388 T4c = T48 - T4b;
Chris@10 1389 T5C = T48 + T4b;
Chris@10 1390 }
Chris@10 1391 }
/* Inputs 5, 21, 29, 13; four of the combined results are scaled by KP707106781. */
Chris@10 1392 {
Chris@10 1393 E T2B, T4r, T2G, T4s, T4q, T4t, T2M, T4m, T2P, T4n, T4l, T4o;
Chris@10 1394 {
Chris@10 1395 E T2z, T2A, T2D, T2F;
Chris@10 1396 T2z = ri[WS(rs, 5)];
Chris@10 1397 T2A = ii[WS(rs, 5)];
Chris@10 1398 T2B = FMA(T21, T2z, T22 * T2A);
Chris@10 1399 T4r = FNMS(T22, T2z, T21 * T2A);
Chris@10 1400 T2D = ri[WS(rs, 21)];
Chris@10 1401 T2F = ii[WS(rs, 21)];
Chris@10 1402 T2G = FMA(T2C, T2D, T2E * T2F);
Chris@10 1403 T4s = FNMS(T2E, T2D, T2C * T2F);
Chris@10 1404 }
Chris@10 1405 T4q = T2B - T2G;
Chris@10 1406 T4t = T4r - T4s;
Chris@10 1407 {
Chris@10 1408 E T2J, T2L, T2N, T2O;
Chris@10 1409 T2J = ri[WS(rs, 29)];
Chris@10 1410 T2L = ii[WS(rs, 29)];
Chris@10 1411 T2M = FMA(T2I, T2J, T2K * T2L);
Chris@10 1412 T4m = FNMS(T2K, T2J, T2I * T2L);
Chris@10 1413 T2N = ri[WS(rs, 13)];
Chris@10 1414 T2O = ii[WS(rs, 13)];
Chris@10 1415 T2P = FMA(T1M, T2N, T1P * T2O);
Chris@10 1416 T4n = FNMS(T1P, T2N, T1M * T2O);
Chris@10 1417 }
Chris@10 1418 T4l = T2M - T2P;
Chris@10 1419 T4o = T4m - T4n;
Chris@10 1420 {
Chris@10 1421 E T2H, T2Q, T6C, T6D;
Chris@10 1422 T2H = T2B + T2G;
Chris@10 1423 T2Q = T2M + T2P;
Chris@10 1424 T2R = T2H + T2Q;
Chris@10 1425 T6z = T2Q - T2H;
Chris@10 1426 T6C = T4r + T4s;
Chris@10 1427 T6D = T4m + T4n;
Chris@10 1428 T6E = T6C - T6D;
Chris@10 1429 T7k = T6C + T6D;
Chris@10 1430 }
Chris@10 1431 {
Chris@10 1432 E T4p, T4u, T4C, T4D;
Chris@10 1433 T4p = T4l - T4o;
Chris@10 1434 T4u = T4q + T4t;
Chris@10 1435 T4v = KP707106781 * (T4p - T4u);
Chris@10 1436 T5H = KP707106781 * (T4u + T4p);
Chris@10 1437 T4C = T4t - T4q;
Chris@10 1438 T4D = T4l + T4o;
Chris@10 1439 T4E = KP707106781 * (T4C - T4D);
Chris@10 1440 T5K = KP707106781 * (T4C + T4D);
Chris@10 1441 }
Chris@10 1442 }
/* Inputs 3, 19, 27, 11 (elements 3 and 27 use the raw table pairs); again scaled by KP707106781. */
Chris@10 1443 {
Chris@10 1444 E T3k, T4M, T3p, T4N, T4O, T4P, T3t, T4S, T3w, T4T, T4R, T4U;
Chris@10 1445 {
Chris@10 1446 E T3i, T3j, T3m, T3o;
Chris@10 1447 T3i = ri[WS(rs, 3)];
Chris@10 1448 T3j = ii[WS(rs, 3)];
Chris@10 1449 T3k = FMA(T3, T3i, T6 * T3j);
Chris@10 1450 T4M = FNMS(T6, T3i, T3 * T3j);
Chris@10 1451 T3m = ri[WS(rs, 19)];
Chris@10 1452 T3o = ii[WS(rs, 19)];
Chris@10 1453 T3p = FMA(T3l, T3m, T3n * T3o);
Chris@10 1454 T4N = FNMS(T3n, T3m, T3l * T3o);
Chris@10 1455 }
Chris@10 1456 T4O = T4M - T4N;
Chris@10 1457 T4P = T3k - T3p;
Chris@10 1458 {
Chris@10 1459 E T3r, T3s, T3u, T3v;
Chris@10 1460 T3r = ri[WS(rs, 27)];
Chris@10 1461 T3s = ii[WS(rs, 27)];
Chris@10 1462 T3t = FMA(Th, T3r, Tl * T3s);
Chris@10 1463 T4S = FNMS(Tl, T3r, Th * T3s);
Chris@10 1464 T3u = ri[WS(rs, 11)];
Chris@10 1465 T3v = ii[WS(rs, 11)];
Chris@10 1466 T3w = FMA(Tg, T3u, Tk * T3v);
Chris@10 1467 T4T = FNMS(Tk, T3u, Tg * T3v);
Chris@10 1468 }
Chris@10 1469 T4R = T3t - T3w;
Chris@10 1470 T4U = T4S - T4T;
Chris@10 1471 {
Chris@10 1472 E T3q, T3x, T6I, T6J;
Chris@10 1473 T3q = T3k + T3p;
Chris@10 1474 T3x = T3t + T3w;
Chris@10 1475 T3y = T3q + T3x;
Chris@10 1476 T6P = T3x - T3q;
Chris@10 1477 T6I = T4M + T4N;
Chris@10 1478 T6J = T4S + T4T;
Chris@10 1479 T6K = T6I - T6J;
Chris@10 1480 T7p = T6I + T6J;
Chris@10 1481 }
Chris@10 1482 {
Chris@10 1483 E T4Q, T4V, T53, T54;
Chris@10 1484 T4Q = T4O - T4P;
Chris@10 1485 T4V = T4R + T4U;
Chris@10 1486 T4W = KP707106781 * (T4Q - T4V);
Chris@10 1487 T5R = KP707106781 * (T4Q + T4V);
Chris@10 1488 T53 = T4R - T4U;
Chris@10 1489 T54 = T4P + T4O;
Chris@10 1490 T55 = KP707106781 * (T53 - T54);
Chris@10 1491 T5O = KP707106781 * (T54 + T53);
Chris@10 1492 }
Chris@10 1493 }
/*
 * Store phase. All 32 inputs have been read by this point; the stores below
 * overwrite ri[]/ii[] in place.
 * Outputs 16, 0, 24, 8.
 */
Chris@10 1494 {
Chris@10 1495 E T2b, T7x, T7K, T7M, T3A, T7L, T7A, T7B;
Chris@10 1496 {
Chris@10 1497 E T1j, T2a, T7C, T7J;
Chris@10 1498 T1j = TL + T1i;
Chris@10 1499 T2a = T1E + T29;
Chris@10 1500 T2b = T1j + T2a;
Chris@10 1501 T7x = T1j - T2a;
Chris@10 1502 T7C = T7e + T7f;
Chris@10 1503 T7J = T7D + T7I;
Chris@10 1504 T7K = T7C + T7J;
Chris@10 1505 T7M = T7J - T7C;
Chris@10 1506 }
Chris@10 1507 {
Chris@10 1508 E T2S, T3z, T7y, T7z;
Chris@10 1509 T2S = T2y + T2R;
Chris@10 1510 T3z = T3h + T3y;
Chris@10 1511 T3A = T2S + T3z;
Chris@10 1512 T7L = T3z - T2S;
Chris@10 1513 T7y = T7j + T7k;
Chris@10 1514 T7z = T7o + T7p;
Chris@10 1515 T7A = T7y - T7z;
Chris@10 1516 T7B = T7y + T7z;
Chris@10 1517 }
Chris@10 1518 ri[WS(rs, 16)] = T2b - T3A;
Chris@10 1519 ii[WS(rs, 16)] = T7K - T7B;
Chris@10 1520 ri[0] = T2b + T3A;
Chris@10 1521 ii[0] = T7B + T7K;
Chris@10 1522 ri[WS(rs, 24)] = T7x - T7A;
Chris@10 1523 ii[WS(rs, 24)] = T7M - T7L;
Chris@10 1524 ri[WS(rs, 8)] = T7x + T7A;
Chris@10 1525 ii[WS(rs, 8)] = T7L + T7M;
Chris@10 1526 }
/* Outputs 20, 4, 28, 12. */
Chris@10 1527 {
Chris@10 1528 E T7h, T7t, T7Q, T7S, T7m, T7u, T7r, T7v;
Chris@10 1529 {
Chris@10 1530 E T7d, T7g, T7O, T7P;
Chris@10 1531 T7d = TL - T1i;
Chris@10 1532 T7g = T7e - T7f;
Chris@10 1533 T7h = T7d + T7g;
Chris@10 1534 T7t = T7d - T7g;
Chris@10 1535 T7O = T29 - T1E;
Chris@10 1536 T7P = T7I - T7D;
Chris@10 1537 T7Q = T7O + T7P;
Chris@10 1538 T7S = T7P - T7O;
Chris@10 1539 }
Chris@10 1540 {
Chris@10 1541 E T7i, T7l, T7n, T7q;
Chris@10 1542 T7i = T2y - T2R;
Chris@10 1543 T7l = T7j - T7k;
Chris@10 1544 T7m = T7i + T7l;
Chris@10 1545 T7u = T7l - T7i;
Chris@10 1546 T7n = T3h - T3y;
Chris@10 1547 T7q = T7o - T7p;
Chris@10 1548 T7r = T7n - T7q;
Chris@10 1549 T7v = T7n + T7q;
Chris@10 1550 }
Chris@10 1551 {
Chris@10 1552 E T7s, T7N, T7w, T7R;
Chris@10 1553 T7s = KP707106781 * (T7m + T7r);
Chris@10 1554 ri[WS(rs, 20)] = T7h - T7s;
Chris@10 1555 ri[WS(rs, 4)] = T7h + T7s;
Chris@10 1556 T7N = KP707106781 * (T7u + T7v);
Chris@10 1557 ii[WS(rs, 4)] = T7N + T7Q;
Chris@10 1558 ii[WS(rs, 20)] = T7Q - T7N;
Chris@10 1559 T7w = KP707106781 * (T7u - T7v);
Chris@10 1560 ri[WS(rs, 28)] = T7t - T7w;
Chris@10 1561 ri[WS(rs, 12)] = T7t + T7w;
Chris@10 1562 T7R = KP707106781 * (T7r - T7m);
Chris@10 1563 ii[WS(rs, 12)] = T7R + T7S;
Chris@10 1564 ii[WS(rs, 28)] = T7S - T7R;
Chris@10 1565 }
Chris@10 1566 }
/* Outputs 22, 6, 30, 14, 18, 2, 26, 10. */
Chris@10 1567 {
Chris@10 1568 E T6j, T7X, T83, T6X, T6u, T7U, T77, T7b, T70, T82, T6G, T6U, T74, T7a, T6R;
Chris@10 1569 E T6V;
Chris@10 1570 {
Chris@10 1571 E T6o, T6t, T6A, T6F;
Chris@10 1572 T6j = T6f - T6i;
Chris@10 1573 T7X = T7V + T7W;
Chris@10 1574 T83 = T7W - T7V;
Chris@10 1575 T6X = T6f + T6i;
Chris@10 1576 T6o = T6m - T6n;
Chris@10 1577 T6t = T6p + T6s;
Chris@10 1578 T6u = KP707106781 * (T6o - T6t);
Chris@10 1579 T7U = KP707106781 * (T6o + T6t);
Chris@10 1580 {
Chris@10 1581 E T75, T76, T6Y, T6Z;
Chris@10 1582 T75 = T6H + T6K;
Chris@10 1583 T76 = T6O + T6P;
Chris@10 1584 T77 = FNMS(KP382683432, T76, KP923879532 * T75);
Chris@10 1585 T7b = FMA(KP923879532, T76, KP382683432 * T75);
Chris@10 1586 T6Y = T6n + T6m;
Chris@10 1587 T6Z = T6p - T6s;
Chris@10 1588 T70 = KP707106781 * (T6Y + T6Z);
Chris@10 1589 T82 = KP707106781 * (T6Z - T6Y);
Chris@10 1590 }
Chris@10 1591 T6A = T6y - T6z;
Chris@10 1592 T6F = T6B - T6E;
Chris@10 1593 T6G = FMA(KP923879532, T6A, KP382683432 * T6F);
Chris@10 1594 T6U = FNMS(KP923879532, T6F, KP382683432 * T6A);
Chris@10 1595 {
Chris@10 1596 E T72, T73, T6L, T6Q;
Chris@10 1597 T72 = T6y + T6z;
Chris@10 1598 T73 = T6B + T6E;
Chris@10 1599 T74 = FMA(KP382683432, T72, KP923879532 * T73);
Chris@10 1600 T7a = FNMS(KP382683432, T73, KP923879532 * T72);
Chris@10 1601 T6L = T6H - T6K;
Chris@10 1602 T6Q = T6O - T6P;
Chris@10 1603 T6R = FNMS(KP923879532, T6Q, KP382683432 * T6L);
Chris@10 1604 T6V = FMA(KP382683432, T6Q, KP923879532 * T6L);
Chris@10 1605 }
Chris@10 1606 }
Chris@10 1607 {
Chris@10 1608 E T6v, T6S, T81, T84;
Chris@10 1609 T6v = T6j + T6u;
Chris@10 1610 T6S = T6G + T6R;
Chris@10 1611 ri[WS(rs, 22)] = T6v - T6S;
Chris@10 1612 ri[WS(rs, 6)] = T6v + T6S;
Chris@10 1613 T81 = T6U + T6V;
Chris@10 1614 T84 = T82 + T83;
Chris@10 1615 ii[WS(rs, 6)] = T81 + T84;
Chris@10 1616 ii[WS(rs, 22)] = T84 - T81;
Chris@10 1617 }
Chris@10 1618 {
Chris@10 1619 E T6T, T6W, T85, T86;
Chris@10 1620 T6T = T6j - T6u;
Chris@10 1621 T6W = T6U - T6V;
Chris@10 1622 ri[WS(rs, 30)] = T6T - T6W;
Chris@10 1623 ri[WS(rs, 14)] = T6T + T6W;
Chris@10 1624 T85 = T6R - T6G;
Chris@10 1625 T86 = T83 - T82;
Chris@10 1626 ii[WS(rs, 14)] = T85 + T86;
Chris@10 1627 ii[WS(rs, 30)] = T86 - T85;
Chris@10 1628 }
Chris@10 1629 {
Chris@10 1630 E T71, T78, T7T, T7Y;
Chris@10 1631 T71 = T6X + T70;
Chris@10 1632 T78 = T74 + T77;
Chris@10 1633 ri[WS(rs, 18)] = T71 - T78;
Chris@10 1634 ri[WS(rs, 2)] = T71 + T78;
Chris@10 1635 T7T = T7a + T7b;
Chris@10 1636 T7Y = T7U + T7X;
Chris@10 1637 ii[WS(rs, 2)] = T7T + T7Y;
Chris@10 1638 ii[WS(rs, 18)] = T7Y - T7T;
Chris@10 1639 }
Chris@10 1640 {
Chris@10 1641 E T79, T7c, T7Z, T80;
Chris@10 1642 T79 = T6X - T70;
Chris@10 1643 T7c = T7a - T7b;
Chris@10 1644 ri[WS(rs, 26)] = T79 - T7c;
Chris@10 1645 ri[WS(rs, 10)] = T79 + T7c;
Chris@10 1646 T7Z = T77 - T74;
Chris@10 1647 T80 = T7X - T7U;
Chris@10 1648 ii[WS(rs, 10)] = T7Z + T80;
Chris@10 1649 ii[WS(rs, 26)] = T80 - T7Z;
Chris@10 1650 }
Chris@10 1651 }
/* Outputs 23, 7, 31, 15, 19, 3, 27, 11. */
Chris@10 1652 {
Chris@10 1653 E T3R, T5d, T8r, T8x, T4e, T8o, T5n, T5r, T4G, T5a, T5g, T8w, T5k, T5q, T57;
Chris@10 1654 E T5b, T3Q, T8p;
Chris@10 1655 T3Q = KP707106781 * (T3K - T3P);
Chris@10 1656 T3R = T3F - T3Q;
Chris@10 1657 T5d = T3F + T3Q;
Chris@10 1658 T8p = KP707106781 * (T5v - T5u);
Chris@10 1659 T8r = T8p + T8q;
Chris@10 1660 T8x = T8q - T8p;
Chris@10 1661 {
Chris@10 1662 E T42, T4d, T5l, T5m;
Chris@10 1663 T42 = FNMS(KP923879532, T41, KP382683432 * T3W);
Chris@10 1664 T4d = FMA(KP382683432, T47, KP923879532 * T4c);
Chris@10 1665 T4e = T42 - T4d;
Chris@10 1666 T8o = T42 + T4d;
Chris@10 1667 T5l = T4L + T4W;
Chris@10 1668 T5m = T52 + T55;
Chris@10 1669 T5n = FNMS(KP555570233, T5m, KP831469612 * T5l);
Chris@10 1670 T5r = FMA(KP831469612, T5m, KP555570233 * T5l);
Chris@10 1671 }
Chris@10 1672 {
Chris@10 1673 E T4w, T4F, T5e, T5f;
Chris@10 1674 T4w = T4k - T4v;
Chris@10 1675 T4F = T4B - T4E;
Chris@10 1676 T4G = FMA(KP980785280, T4w, KP195090322 * T4F);
Chris@10 1677 T5a = FNMS(KP980785280, T4F, KP195090322 * T4w);
Chris@10 1678 T5e = FMA(KP923879532, T3W, KP382683432 * T41);
Chris@10 1679 T5f = FNMS(KP923879532, T47, KP382683432 * T4c);
Chris@10 1680 T5g = T5e + T5f;
Chris@10 1681 T8w = T5f - T5e;
Chris@10 1682 }
Chris@10 1683 {
Chris@10 1684 E T5i, T5j, T4X, T56;
Chris@10 1685 T5i = T4k + T4v;
Chris@10 1686 T5j = T4B + T4E;
Chris@10 1687 T5k = FMA(KP555570233, T5i, KP831469612 * T5j);
Chris@10 1688 T5q = FNMS(KP555570233, T5j, KP831469612 * T5i);
Chris@10 1689 T4X = T4L - T4W;
Chris@10 1690 T56 = T52 - T55;
Chris@10 1691 T57 = FNMS(KP980785280, T56, KP195090322 * T4X);
Chris@10 1692 T5b = FMA(KP195090322, T56, KP980785280 * T4X);
Chris@10 1693 }
Chris@10 1694 {
Chris@10 1695 E T4f, T58, T8v, T8y;
Chris@10 1696 T4f = T3R + T4e;
Chris@10 1697 T58 = T4G + T57;
Chris@10 1698 ri[WS(rs, 23)] = T4f - T58;
Chris@10 1699 ri[WS(rs, 7)] = T4f + T58;
Chris@10 1700 T8v = T5a + T5b;
Chris@10 1701 T8y = T8w + T8x;
Chris@10 1702 ii[WS(rs, 7)] = T8v + T8y;
Chris@10 1703 ii[WS(rs, 23)] = T8y - T8v;
Chris@10 1704 }
Chris@10 1705 {
Chris@10 1706 E T59, T5c, T8z, T8A;
Chris@10 1707 T59 = T3R - T4e;
Chris@10 1708 T5c = T5a - T5b;
Chris@10 1709 ri[WS(rs, 31)] = T59 - T5c;
Chris@10 1710 ri[WS(rs, 15)] = T59 + T5c;
Chris@10 1711 T8z = T57 - T4G;
Chris@10 1712 T8A = T8x - T8w;
Chris@10 1713 ii[WS(rs, 15)] = T8z + T8A;
Chris@10 1714 ii[WS(rs, 31)] = T8A - T8z;
Chris@10 1715 }
Chris@10 1716 {
Chris@10 1717 E T5h, T5o, T8n, T8s;
Chris@10 1718 T5h = T5d + T5g;
Chris@10 1719 T5o = T5k + T5n;
Chris@10 1720 ri[WS(rs, 19)] = T5h - T5o;
Chris@10 1721 ri[WS(rs, 3)] = T5h + T5o;
Chris@10 1722 T8n = T5q + T5r;
Chris@10 1723 T8s = T8o + T8r;
Chris@10 1724 ii[WS(rs, 3)] = T8n + T8s;
Chris@10 1725 ii[WS(rs, 19)] = T8s - T8n;
Chris@10 1726 }
Chris@10 1727 {
Chris@10 1728 E T5p, T5s, T8t, T8u;
Chris@10 1729 T5p = T5d - T5g;
Chris@10 1730 T5s = T5q - T5r;
Chris@10 1731 ri[WS(rs, 27)] = T5p - T5s;
Chris@10 1732 ri[WS(rs, 11)] = T5p + T5s;
Chris@10 1733 T8t = T5n - T5k;
Chris@10 1734 T8u = T8r - T8o;
Chris@10 1735 ii[WS(rs, 11)] = T8t + T8u;
Chris@10 1736 ii[WS(rs, 27)] = T8u - T8t;
Chris@10 1737 }
Chris@10 1738 }
/* Outputs 21, 5, 29, 13, 17, 1, 25, 9. */
Chris@10 1739 {
Chris@10 1740 E T5x, T5Z, T8d, T8j, T5E, T88, T69, T6d, T5M, T5W, T62, T8i, T66, T6c, T5T;
Chris@10 1741 E T5X, T5w, T89;
Chris@10 1742 T5w = KP707106781 * (T5u + T5v);
Chris@10 1743 T5x = T5t - T5w;
Chris@10 1744 T5Z = T5t + T5w;
Chris@10 1745 T89 = KP707106781 * (T3K + T3P);
Chris@10 1746 T8d = T89 + T8c;
Chris@10 1747 T8j = T8c - T89;
Chris@10 1748 {
Chris@10 1749 E T5A, T5D, T67, T68;
Chris@10 1750 T5A = FNMS(KP382683432, T5z, KP923879532 * T5y);
Chris@10 1751 T5D = FMA(KP923879532, T5B, KP382683432 * T5C);
Chris@10 1752 T5E = T5A - T5D;
Chris@10 1753 T88 = T5A + T5D;
Chris@10 1754 T67 = T5N + T5O;
Chris@10 1755 T68 = T5Q + T5R;
Chris@10 1756 T69 = FNMS(KP195090322, T68, KP980785280 * T67);
Chris@10 1757 T6d = FMA(KP195090322, T67, KP980785280 * T68);
Chris@10 1758 }
Chris@10 1759 {
Chris@10 1760 E T5I, T5L, T60, T61;
Chris@10 1761 T5I = T5G - T5H;
Chris@10 1762 T5L = T5J - T5K;
Chris@10 1763 T5M = FMA(KP555570233, T5I, KP831469612 * T5L);
Chris@10 1764 T5W = FNMS(KP831469612, T5I, KP555570233 * T5L);
Chris@10 1765 T60 = FMA(KP382683432, T5y, KP923879532 * T5z);
Chris@10 1766 T61 = FNMS(KP382683432, T5B, KP923879532 * T5C);
Chris@10 1767 T62 = T60 + T61;
Chris@10 1768 T8i = T61 - T60;
Chris@10 1769 }
Chris@10 1770 {
Chris@10 1771 E T64, T65, T5P, T5S;
Chris@10 1772 T64 = T5G + T5H;
Chris@10 1773 T65 = T5J + T5K;
Chris@10 1774 T66 = FMA(KP980785280, T64, KP195090322 * T65);
Chris@10 1775 T6c = FNMS(KP195090322, T64, KP980785280 * T65);
Chris@10 1776 T5P = T5N - T5O;
Chris@10 1777 T5S = T5Q - T5R;
Chris@10 1778 T5T = FNMS(KP831469612, T5S, KP555570233 * T5P);
Chris@10 1779 T5X = FMA(KP831469612, T5P, KP555570233 * T5S);
Chris@10 1780 }
Chris@10 1781 {
Chris@10 1782 E T5F, T5U, T8h, T8k;
Chris@10 1783 T5F = T5x + T5E;
Chris@10 1784 T5U = T5M + T5T;
Chris@10 1785 ri[WS(rs, 21)] = T5F - T5U;
Chris@10 1786 ri[WS(rs, 5)] = T5F + T5U;
Chris@10 1787 T8h = T5W + T5X;
Chris@10 1788 T8k = T8i + T8j;
Chris@10 1789 ii[WS(rs, 5)] = T8h + T8k;
Chris@10 1790 ii[WS(rs, 21)] = T8k - T8h;
Chris@10 1791 }
Chris@10 1792 {
Chris@10 1793 E T5V, T5Y, T8l, T8m;
Chris@10 1794 T5V = T5x - T5E;
Chris@10 1795 T5Y = T5W - T5X;
Chris@10 1796 ri[WS(rs, 29)] = T5V - T5Y;
Chris@10 1797 ri[WS(rs, 13)] = T5V + T5Y;
Chris@10 1798 T8l = T5T - T5M;
Chris@10 1799 T8m = T8j - T8i;
Chris@10 1800 ii[WS(rs, 13)] = T8l + T8m;
Chris@10 1801 ii[WS(rs, 29)] = T8m - T8l;
Chris@10 1802 }
Chris@10 1803 {
Chris@10 1804 E T63, T6a, T87, T8e;
Chris@10 1805 T63 = T5Z + T62;
Chris@10 1806 T6a = T66 + T69;
Chris@10 1807 ri[WS(rs, 17)] = T63 - T6a;
Chris@10 1808 ri[WS(rs, 1)] = T63 + T6a;
Chris@10 1809 T87 = T6c + T6d;
Chris@10 1810 T8e = T88 + T8d;
Chris@10 1811 ii[WS(rs, 1)] = T87 + T8e;
Chris@10 1812 ii[WS(rs, 17)] = T8e - T87;
Chris@10 1813 }
Chris@10 1814 {
Chris@10 1815 E T6b, T6e, T8f, T8g;
Chris@10 1816 T6b = T5Z - T62;
Chris@10 1817 T6e = T6c - T6d;
Chris@10 1818 ri[WS(rs, 25)] = T6b - T6e;
Chris@10 1819 ri[WS(rs, 9)] = T6b + T6e;
Chris@10 1820 T8f = T69 - T66;
Chris@10 1821 T8g = T8d - T88;
Chris@10 1822 ii[WS(rs, 9)] = T8f + T8g;
Chris@10 1823 ii[WS(rs, 25)] = T8g - T8f;
Chris@10 1824 }
Chris@10 1825 }
Chris@10 1826 }
Chris@10 1827 }
Chris@10 1828 }
Chris@10 1829 }
Chris@10 1830
/*
 * Twiddle instruction table for the non-FMA build: four TW_CEXP entries whose
 * last field is 1, 3, 9 and 27, closed by a TW_NEXT entry. The kernel above
 * reads 8 values from W per iteration and applies the four raw pairs to
 * elements 1, 3, 9 and 27, which lines up with these four entries.
 * NOTE(review): the tw_instr field layout is declared outside this file;
 * presumably the last field is the twiddle exponent -- confirm in the headers.
 */
Chris@10 1831 static const tw_instr twinstr[] = {
Chris@10 1832 {TW_CEXP, 0, 1},
Chris@10 1833 {TW_CEXP, 0, 3},
Chris@10 1834 {TW_CEXP, 0, 9},
Chris@10 1835 {TW_CEXP, 0, 27},
Chris@10 1836 {TW_NEXT, 1, 0}
Chris@10 1837 };
Chris@10 1838
/*
 * Descriptor for the non-FMA build of t2_32: a leading 32, the name string,
 * the twinstr table, &GENUS, and the count tuple {376, 168, 112, 0}, which
 * matches the "376 additions, 168 multiplications, 112 fused multiply/add"
 * figures in the generator comment above this variant.
 * NOTE(review): ct_desc field meanings (including the three trailing zeros)
 * come from headers not shown here -- confirm before editing by hand.
 */
Chris@10 1839 static const ct_desc desc = { 32, "t2_32", twinstr, &GENUS, {376, 168, 112, 0}, 0, 0, 0 };
Chris@10 1840
/*
 * Registration entry point (non-FMA build): hands the t2_32 kernel and its
 * descriptor to the planner p through X(kdft_dit_register).
 */
Chris@10 1841 void X(codelet_t2_32) (planner *p) {
Chris@10 1842 X(kdft_dit_register) (p, t2_32, &desc);
Chris@10 1843 }
Chris@10 1844 #endif /* HAVE_FMA */