annotate src/fftw-3.3.5/rdft/scalar/r2cf/hf2_32.c @ 83:ae30d91d2ffe

Replace these with versions built using an older toolset (so as to avoid ABI incompatibilities when linking on Ubuntu 14.04 for packaging purposes)
author Chris Cannam
date Fri, 07 Feb 2020 11:51:13 +0000
parents 2cd0e3b3e1fd
children
rev   line source
Chris@42 1 /*
Chris@42 2 * Copyright (c) 2003, 2007-14 Matteo Frigo
Chris@42 3 * Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
Chris@42 4 *
Chris@42 5 * This program is free software; you can redistribute it and/or modify
Chris@42 6 * it under the terms of the GNU General Public License as published by
Chris@42 7 * the Free Software Foundation; either version 2 of the License, or
Chris@42 8 * (at your option) any later version.
Chris@42 9 *
Chris@42 10 * This program is distributed in the hope that it will be useful,
Chris@42 11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
Chris@42 12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
Chris@42 13 * GNU General Public License for more details.
Chris@42 14 *
Chris@42 15 * You should have received a copy of the GNU General Public License
Chris@42 16 * along with this program; if not, write to the Free Software
Chris@42 17 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
Chris@42 18 *
Chris@42 19 */
Chris@42 20
Chris@42 21 /* This file was automatically generated --- DO NOT EDIT */
Chris@42 22 /* Generated on Sat Jul 30 16:46:54 EDT 2016 */
Chris@42 23
Chris@42 24 #include "codelet-rdft.h"
Chris@42 25
Chris@42 26 #ifdef HAVE_FMA
Chris@42 27
Chris@42 28 /* Generated by: ../../../genfft/gen_hc2hc.native -fma -reorder-insns -schedule-for-pipeline -compact -variables 4 -pipeline-latency 4 -twiddle-log3 -precompute-twiddles -n 32 -dit -name hf2_32 -include hf.h */
Chris@42 29
Chris@42 30 /*
Chris@42 31 * This function contains 488 FP additions, 350 FP multiplications,
Chris@42 32 * (or, 236 additions, 98 multiplications, 252 fused multiply/add),
Chris@42 33 * 181 stack variables, 7 constants, and 128 memory accesses
Chris@42 34 */
Chris@42 35 #include "hf.h"
Chris@42 36
/*
 * hf2_32 -- machine-generated size-32 hc2hc codelet (FMA variant), produced by
 * genfft's gen_hc2hc with "-n 32 -dit -twiddle-log3 -precompute-twiddles".
 *
 * Works in place on cr and ci.  Each loop iteration loads the 32 values
 * cr[WS(rs, k)] and the 32 values ci[WS(rs, k)] for k = 0..31, combines them
 * with factors derived from the eight twiddle values W[0..7], and stores 32
 * results back into cr and 32 back into ci.  Within one iteration every load
 * is issued before the first store.
 *
 *   cr, ci  data arrays; cr is advanced by +ms and ci by -ms per iteration
 *   W       twiddle table; the first iteration reads from W + (mb - 1) * 8
 *           and W is advanced by 8 per iteration
 *   rs      stride handed to WS() to address the 32 elements of one iteration
 *   mb, me  the loop runs for m = mb, mb + 1, ..., me - 1
 *   ms      per-iteration pointer step applied to cr and ci
 *
 * The statement order and FMA/FNMS pairing are generator output; do not
 * rearrange by hand.
 */
Chris@42 37 static void hf2_32(R *cr, R *ci, const R *W, stride rs, INT mb, INT me, INT ms)
Chris@42 38 {
/*
 * Constants, identified by their numeric values:
 *   KP707106781 = sqrt(1/2)       KP414213562 = tan(pi/8)
 *   KP923879532 = cos(pi/8)       KP198912367 = tan(pi/16)
 *   KP980785280 = cos(pi/16)      KP668178637 = tan(3*pi/16)
 *   KP831469612 = cos(3*pi/16)
 */
Chris@42 39 DK(KP831469612, +0.831469612302545237078788377617905756738560812);
Chris@42 40 DK(KP980785280, +0.980785280403230449126182236134239036973933731);
Chris@42 41 DK(KP923879532, +0.923879532511286756128183189396788286822416626);
Chris@42 42 DK(KP668178637, +0.668178637919298919997757686523080761552472251);
Chris@42 43 DK(KP198912367, +0.198912367379658006911597622644676228597850501);
Chris@42 44 DK(KP414213562, +0.414213562373095048801688724209698078569671875);
Chris@42 45 DK(KP707106781, +0.707106781186547524400844362104849039284835938);
Chris@42 46 {
Chris@42 47 INT m;
/* One pass per m in [mb, me): cr steps forward by ms, ci steps backward by ms, W steps by 8. */
Chris@42 48 for (m = mb, W = W + ((mb - 1) * 8); m < me; m = m + 1, cr = cr + ms, ci = ci - ms, W = W + 8, MAKE_VOLATILE_STRIDE(64, rs)) {
Chris@42 49 E T7d, T7a;
Chris@42 50 {
Chris@42 51 E T2, T8, T3, T6, Te, Tr, T18, T4, Ta, Tz, T1n, T10, Ti, T5, Tc;
/* Load the eight twiddle values W[0..7] for this iteration and start forming their products;
 * every other twiddle factor used below is derived from these. */
Chris@42 52 T2 = W[0];
Chris@42 53 T8 = W[4];
Chris@42 54 T3 = W[2];
Chris@42 55 T6 = W[3];
Chris@42 56 Te = W[6];
Chris@42 57 Tr = T2 * T8;
Chris@42 58 T18 = T3 * T8;
Chris@42 59 T4 = T2 * T3;
Chris@42 60 Ta = T2 * T6;
Chris@42 61 Tz = T3 * Te;
Chris@42 62 T1n = T8 * Te;
Chris@42 63 T10 = T2 * Te;
Chris@42 64 Ti = W[7];
Chris@42 65 T5 = W[1];
Chris@42 66 Tc = W[5];
Chris@42 67 {
Chris@42 68 E T34, T31, T2X, T2T, Tq, T46, T8H, T98, TH, T97, T4b, T8D, TZ, T7g, T4j;
Chris@42 69 E T6t, T1g, T7f, T4q, T6u, T4z, T6y, T1J, T7j, T7m, T8e, T6x, T4G, T2k, T7o;
Chris@42 70 E T7r, T8d, T6B, T4O, T6A, T4V, T6P, T61, T7G, T3G, T6M, T5E, T8n, T7N, T6I;
Chris@42 71 E T5s, T7v, T2N, T6F, T55, T8i, T7C, T5L, T63, T43, T7O, T5S, T62, T7J, T8o;
Chris@42 72 E T2U, T2R, T2V, T58, T3a, T5h, T2Y, T32, T35;
Chris@42 73 {
Chris@42 74 E T1K, T23, T1N, T26, T2b, T1U, T3C, T3j, T3z, T3f, T1R, T29, TR, Th, T2J;
Chris@42 75 E T2F, Td, TP, T3r, T3n, T2w, T2s, T3Q, T3M, T1Z, T1V, T2g, T2c;
Chris@42 76 {
Chris@42 77 E T11, T1C, TM, Tb, TJ, T7, T1o, T19, T1w, T1F, T15, T1s, T1d, T1z, TW;
Chris@42 78 E TS, Ty, T48, TG, T4a;
Chris@42 79 {
Chris@42 80 E T1, TA, Ts, TE, Tw, Tn, Tj, T8G, Tk, To, T14;
Chris@42 81 T1 = cr[0];
Chris@42 82 TA = FMA(T6, Ti, Tz);
Chris@42 83 T1K = FNMS(T6, Ti, Tz);
Chris@42 84 T14 = T2 * Ti;
Chris@42 85 {
Chris@42 86 E T1r, TD, T1c, Tv;
Chris@42 87 T1r = T8 * Ti;
Chris@42 88 TD = T3 * Ti;
Chris@42 89 T11 = FNMS(T5, Ti, T10);
Chris@42 90 T1C = FMA(T5, Ti, T10);
Chris@42 91 TM = FMA(T5, T3, Ta);
Chris@42 92 Tb = FNMS(T5, T3, Ta);
Chris@42 93 TJ = FNMS(T5, T6, T4);
Chris@42 94 T7 = FMA(T5, T6, T4);
Chris@42 95 T1o = FMA(Tc, Ti, T1n);
Chris@42 96 T23 = FMA(T6, Tc, T18);
Chris@42 97 T19 = FNMS(T6, Tc, T18);
Chris@42 98 T1w = FNMS(T5, Tc, Tr);
Chris@42 99 Ts = FMA(T5, Tc, Tr);
Chris@42 100 T1c = T3 * Tc;
Chris@42 101 Tv = T2 * Tc;
Chris@42 102 T1F = FNMS(T5, Te, T14);
Chris@42 103 T15 = FMA(T5, Te, T14);
Chris@42 104 T1s = FNMS(Tc, Te, T1r);
Chris@42 105 T1N = FMA(T6, Te, TD);
Chris@42 106 TE = FNMS(T6, Te, TD);
Chris@42 107 {
Chris@42 108 E T1T, T3i, T3e, T1Q;
Chris@42 109 T1T = TJ * Tc;
Chris@42 110 T3i = TJ * Ti;
Chris@42 111 T3e = TJ * Te;
Chris@42 112 T1Q = TJ * T8;
Chris@42 113 {
Chris@42 114 E Tg, T2I, T2E, T9;
Chris@42 115 Tg = T7 * Tc;
Chris@42 116 T2I = T7 * Ti;
Chris@42 117 T2E = T7 * Te;
Chris@42 118 T9 = T7 * T8;
Chris@42 119 {
Chris@42 120 E T3q, T3m, T2v, T2r;
Chris@42 121 T3q = T19 * Ti;
Chris@42 122 T3m = T19 * Te;
Chris@42 123 T2v = T1w * Ti;
Chris@42 124 T2r = T1w * Te;
Chris@42 125 {
Chris@42 126 E T2W, T2S, T3P, T3L;
Chris@42 127 T2W = T23 * Ti;
Chris@42 128 T2S = T23 * Te;
Chris@42 129 T3P = Ts * Ti;
Chris@42 130 T3L = Ts * Te;
Chris@42 131 T26 = FNMS(T6, T8, T1c);
Chris@42 132 T1d = FMA(T6, T8, T1c);
Chris@42 133 T1z = FMA(T5, T8, Tv);
Chris@42 134 Tw = FNMS(T5, T8, Tv);
Chris@42 135 T2b = FNMS(TM, T8, T1T);
Chris@42 136 T1U = FMA(TM, T8, T1T);
Chris@42 137 T3C = FNMS(TM, Te, T3i);
Chris@42 138 T3j = FMA(TM, Te, T3i);
Chris@42 139 T3z = FMA(TM, Ti, T3e);
Chris@42 140 T3f = FNMS(TM, Ti, T3e);
Chris@42 141 T1R = FNMS(TM, Tc, T1Q);
Chris@42 142 T29 = FMA(TM, Tc, T1Q);
Chris@42 143 TR = FNMS(Tb, T8, Tg);
Chris@42 144 Th = FMA(Tb, T8, Tg);
Chris@42 145 T34 = FMA(Tb, Te, T2I);
Chris@42 146 T2J = FNMS(Tb, Te, T2I);
Chris@42 147 T31 = FNMS(Tb, Ti, T2E);
Chris@42 148 T2F = FMA(Tb, Ti, T2E);
Chris@42 149 Td = FNMS(Tb, Tc, T9);
Chris@42 150 TP = FMA(Tb, Tc, T9);
Chris@42 151 T2X = FNMS(T26, Te, T2W);
Chris@42 152 T2T = FMA(T26, Ti, T2S);
Chris@42 153 T3r = FNMS(T1d, Te, T3q);
Chris@42 154 T3n = FMA(T1d, Ti, T3m);
Chris@42 155 T2w = FNMS(T1z, Te, T2v);
Chris@42 156 T2s = FMA(T1z, Ti, T2r);
Chris@42 157 T3Q = FNMS(Tw, Te, T3P);
Chris@42 158 T3M = FMA(Tw, Ti, T3L);
Chris@42 159 {
Chris@42 160 E T1Y, T1S, T2f, T2a;
Chris@42 161 T1Y = T1R * Ti;
Chris@42 162 T1S = T1R * Te;
Chris@42 163 T2f = T29 * Ti;
Chris@42 164 T2a = T29 * Te;
Chris@42 165 {
Chris@42 166 E Tm, Tf, TV, TQ;
Chris@42 167 Tm = Td * Ti;
Chris@42 168 Tf = Td * Te;
Chris@42 169 TV = TP * Ti;
Chris@42 170 TQ = TP * Te;
Chris@42 171 T1Z = FNMS(T1U, Te, T1Y);
Chris@42 172 T1V = FMA(T1U, Ti, T1S);
Chris@42 173 T2g = FNMS(T2b, Te, T2f);
Chris@42 174 T2c = FMA(T2b, Ti, T2a);
Chris@42 175 Tn = FNMS(Th, Te, Tm);
Chris@42 176 Tj = FMA(Th, Ti, Tf);
Chris@42 177 TW = FNMS(TR, Te, TV);
Chris@42 178 TS = FMA(TR, Ti, TQ);
Chris@42 179 T8G = ci[0];
Chris@42 180 }
Chris@42 181 }
Chris@42 182 }
Chris@42 183 }
Chris@42 184 }
Chris@42 185 }
Chris@42 186 }
Chris@42 187 Tk = cr[WS(rs, 16)];
Chris@42 188 To = ci[WS(rs, 16)];
Chris@42 189 {
Chris@42 190 E Tt, Tx, Tu, T47, TB, TF, TC, T49;
Chris@42 191 {
Chris@42 192 E Tl, T8E, Tp, T8F;
Chris@42 193 Tt = cr[WS(rs, 8)];
Chris@42 194 Tx = ci[WS(rs, 8)];
Chris@42 195 Tl = Tj * Tk;
Chris@42 196 T8E = Tj * To;
Chris@42 197 Tu = Ts * Tt;
Chris@42 198 T47 = Ts * Tx;
Chris@42 199 Tp = FMA(Tn, To, Tl);
Chris@42 200 T8F = FNMS(Tn, Tk, T8E);
Chris@42 201 TB = cr[WS(rs, 24)];
Chris@42 202 TF = ci[WS(rs, 24)];
Chris@42 203 Tq = T1 + Tp;
Chris@42 204 T46 = T1 - Tp;
Chris@42 205 T8H = T8F + T8G;
Chris@42 206 T98 = T8G - T8F;
Chris@42 207 TC = TA * TB;
Chris@42 208 T49 = TA * TF;
Chris@42 209 }
Chris@42 210 Ty = FMA(Tw, Tx, Tu);
Chris@42 211 T48 = FNMS(Tw, Tt, T47);
Chris@42 212 TG = FMA(TE, TF, TC);
Chris@42 213 T4a = FNMS(TE, TB, T49);
Chris@42 214 }
Chris@42 215 }
Chris@42 216 {
Chris@42 217 E TT, TX, TO, T4f, TU, T4g;
Chris@42 218 {
Chris@42 219 E TK, TN, TL, T4e;
Chris@42 220 TK = cr[WS(rs, 4)];
Chris@42 221 TN = ci[WS(rs, 4)];
Chris@42 222 TH = Ty + TG;
Chris@42 223 T97 = Ty - TG;
Chris@42 224 T4b = T48 - T4a;
Chris@42 225 T8D = T48 + T4a;
Chris@42 226 TL = TJ * TK;
Chris@42 227 T4e = TJ * TN;
Chris@42 228 TT = cr[WS(rs, 20)];
Chris@42 229 TX = ci[WS(rs, 20)];
Chris@42 230 TO = FMA(TM, TN, TL);
Chris@42 231 T4f = FNMS(TM, TK, T4e);
Chris@42 232 TU = TS * TT;
Chris@42 233 T4g = TS * TX;
Chris@42 234 }
Chris@42 235 {
Chris@42 236 E T17, T4m, T1a, T1e, T4d, T4i;
Chris@42 237 {
Chris@42 238 E T12, T16, TY, T4h, T13, T4l;
Chris@42 239 T12 = cr[WS(rs, 28)];
Chris@42 240 T16 = ci[WS(rs, 28)];
Chris@42 241 TY = FMA(TW, TX, TU);
Chris@42 242 T4h = FNMS(TW, TT, T4g);
Chris@42 243 T13 = T11 * T12;
Chris@42 244 T4l = T11 * T16;
Chris@42 245 TZ = TO + TY;
Chris@42 246 T4d = TO - TY;
Chris@42 247 T7g = T4f + T4h;
Chris@42 248 T4i = T4f - T4h;
Chris@42 249 T17 = FMA(T15, T16, T13);
Chris@42 250 T4m = FNMS(T15, T12, T4l);
Chris@42 251 }
Chris@42 252 T4j = T4d - T4i;
Chris@42 253 T6t = T4d + T4i;
Chris@42 254 T1a = cr[WS(rs, 12)];
Chris@42 255 T1e = ci[WS(rs, 12)];
Chris@42 256 {
Chris@42 257 E T1m, T4u, T1H, T4E, T1x, T1A, T1u, T4w, T1y, T4B;
Chris@42 258 {
Chris@42 259 E T1D, T1G, T1E, T4D;
Chris@42 260 {
Chris@42 261 E T1f, T4o, T4k, T4p;
Chris@42 262 {
Chris@42 263 E T1j, T1l, T1b, T4n, T1k, T4t;
Chris@42 264 T1j = cr[WS(rs, 2)];
Chris@42 265 T1l = ci[WS(rs, 2)];
Chris@42 266 T1b = T19 * T1a;
Chris@42 267 T4n = T19 * T1e;
Chris@42 268 T1k = T7 * T1j;
Chris@42 269 T4t = T7 * T1l;
Chris@42 270 T1f = FMA(T1d, T1e, T1b);
Chris@42 271 T4o = FNMS(T1d, T1a, T4n);
Chris@42 272 T1m = FMA(Tb, T1l, T1k);
Chris@42 273 T4u = FNMS(Tb, T1j, T4t);
Chris@42 274 }
Chris@42 275 T1g = T17 + T1f;
Chris@42 276 T4k = T17 - T1f;
Chris@42 277 T7f = T4m + T4o;
Chris@42 278 T4p = T4m - T4o;
Chris@42 279 T1D = cr[WS(rs, 26)];
Chris@42 280 T1G = ci[WS(rs, 26)];
Chris@42 281 T4q = T4k + T4p;
Chris@42 282 T6u = T4k - T4p;
Chris@42 283 T1E = T1C * T1D;
Chris@42 284 T4D = T1C * T1G;
Chris@42 285 }
Chris@42 286 {
Chris@42 287 E T1p, T1t, T1q, T4v;
Chris@42 288 T1p = cr[WS(rs, 18)];
Chris@42 289 T1t = ci[WS(rs, 18)];
Chris@42 290 T1H = FMA(T1F, T1G, T1E);
Chris@42 291 T4E = FNMS(T1F, T1D, T4D);
Chris@42 292 T1q = T1o * T1p;
Chris@42 293 T4v = T1o * T1t;
Chris@42 294 T1x = cr[WS(rs, 10)];
Chris@42 295 T1A = ci[WS(rs, 10)];
Chris@42 296 T1u = FMA(T1s, T1t, T1q);
Chris@42 297 T4w = FNMS(T1s, T1p, T4v);
Chris@42 298 T1y = T1w * T1x;
Chris@42 299 T4B = T1w * T1A;
Chris@42 300 }
Chris@42 301 }
Chris@42 302 {
Chris@42 303 E T4A, T1v, T7k, T4x, T1B, T4C;
Chris@42 304 T4A = T1m - T1u;
Chris@42 305 T1v = T1m + T1u;
Chris@42 306 T7k = T4u + T4w;
Chris@42 307 T4x = T4u - T4w;
Chris@42 308 T1B = FMA(T1z, T1A, T1y);
Chris@42 309 T4C = FNMS(T1z, T1x, T4B);
Chris@42 310 {
Chris@42 311 E T1I, T4y, T4F, T7l;
Chris@42 312 T1I = T1B + T1H;
Chris@42 313 T4y = T1B - T1H;
Chris@42 314 T4F = T4C - T4E;
Chris@42 315 T7l = T4C + T4E;
Chris@42 316 T4z = T4x + T4y;
Chris@42 317 T6y = T4x - T4y;
Chris@42 318 T1J = T1v + T1I;
Chris@42 319 T7j = T1v - T1I;
Chris@42 320 T7m = T7k - T7l;
Chris@42 321 T8e = T7k + T7l;
Chris@42 322 T6x = T4A + T4F;
Chris@42 323 T4G = T4A - T4F;
Chris@42 324 }
Chris@42 325 }
Chris@42 326 }
Chris@42 327 }
Chris@42 328 }
Chris@42 329 }
Chris@42 330 {
Chris@42 331 E T5C, T3u, T5y, T7L, T60, T5V, T3F, T5A, T4P, T4U;
Chris@42 332 {
Chris@42 333 E T1P, T4J, T2i, T4T, T21, T4L, T28, T4R;
Chris@42 334 {
Chris@42 335 E T1L, T1O, T1W, T20;
Chris@42 336 T1L = cr[WS(rs, 30)];
Chris@42 337 T1O = ci[WS(rs, 30)];
Chris@42 338 {
Chris@42 339 E T2d, T2h, T1M, T4I, T2e, T4S;
Chris@42 340 T2d = cr[WS(rs, 22)];
Chris@42 341 T2h = ci[WS(rs, 22)];
Chris@42 342 T1M = T1K * T1L;
Chris@42 343 T4I = T1K * T1O;
Chris@42 344 T2e = T2c * T2d;
Chris@42 345 T4S = T2c * T2h;
Chris@42 346 T1P = FMA(T1N, T1O, T1M);
Chris@42 347 T4J = FNMS(T1N, T1L, T4I);
Chris@42 348 T2i = FMA(T2g, T2h, T2e);
Chris@42 349 T4T = FNMS(T2g, T2d, T4S);
Chris@42 350 }
Chris@42 351 T1W = cr[WS(rs, 14)];
Chris@42 352 T20 = ci[WS(rs, 14)];
Chris@42 353 {
Chris@42 354 E T24, T27, T1X, T4K, T25, T4Q;
Chris@42 355 T24 = cr[WS(rs, 6)];
Chris@42 356 T27 = ci[WS(rs, 6)];
Chris@42 357 T1X = T1V * T1W;
Chris@42 358 T4K = T1V * T20;
Chris@42 359 T25 = T23 * T24;
Chris@42 360 T4Q = T23 * T27;
Chris@42 361 T21 = FMA(T1Z, T20, T1X);
Chris@42 362 T4L = FNMS(T1Z, T1W, T4K);
Chris@42 363 T28 = FMA(T26, T27, T25);
Chris@42 364 T4R = FNMS(T26, T24, T4Q);
Chris@42 365 }
Chris@42 366 }
Chris@42 367 {
Chris@42 368 E T22, T7p, T4M, T4N, T2j, T7q;
Chris@42 369 T4P = T1P - T21;
Chris@42 370 T22 = T1P + T21;
Chris@42 371 T7p = T4J + T4L;
Chris@42 372 T4M = T4J - T4L;
Chris@42 373 T4N = T28 - T2i;
Chris@42 374 T2j = T28 + T2i;
Chris@42 375 T7q = T4R + T4T;
Chris@42 376 T4U = T4R - T4T;
Chris@42 377 T2k = T22 + T2j;
Chris@42 378 T7o = T22 - T2j;
Chris@42 379 T7r = T7p - T7q;
Chris@42 380 T8d = T7p + T7q;
Chris@42 381 T6B = T4M - T4N;
Chris@42 382 T4O = T4M + T4N;
Chris@42 383 }
Chris@42 384 }
Chris@42 385 {
Chris@42 386 E T3l, T5X, T3E, T3v, T3t, T3w, T3x, T5Z, T3A, T3B, T3D, T3y, T5z;
Chris@42 387 {
Chris@42 388 E T3g, T3k, T3h, T5W;
Chris@42 389 T3g = cr[WS(rs, 31)];
Chris@42 390 T3k = ci[WS(rs, 31)];
Chris@42 391 T3A = cr[WS(rs, 23)];
Chris@42 392 T6A = T4P + T4U;
Chris@42 393 T4V = T4P - T4U;
Chris@42 394 T3h = T3f * T3g;
Chris@42 395 T5W = T3f * T3k;
Chris@42 396 T3B = T3z * T3A;
Chris@42 397 T3D = ci[WS(rs, 23)];
Chris@42 398 T3l = FMA(T3j, T3k, T3h);
Chris@42 399 T5X = FNMS(T3j, T3g, T5W);
Chris@42 400 }
Chris@42 401 {
Chris@42 402 E T3o, T5B, T3s, T3p, T5Y;
Chris@42 403 T3o = cr[WS(rs, 15)];
Chris@42 404 T3E = FMA(T3C, T3D, T3B);
Chris@42 405 T5B = T3z * T3D;
Chris@42 406 T3s = ci[WS(rs, 15)];
Chris@42 407 T3p = T3n * T3o;
Chris@42 408 T3v = cr[WS(rs, 7)];
Chris@42 409 T5C = FNMS(T3C, T3A, T5B);
Chris@42 410 T5Y = T3n * T3s;
Chris@42 411 T3t = FMA(T3r, T3s, T3p);
Chris@42 412 T3w = TP * T3v;
Chris@42 413 T3x = ci[WS(rs, 7)];
Chris@42 414 T5Z = FNMS(T3r, T3o, T5Y);
Chris@42 415 }
Chris@42 416 T3u = T3l + T3t;
Chris@42 417 T5y = T3l - T3t;
Chris@42 418 T3y = FMA(TR, T3x, T3w);
Chris@42 419 T5z = TP * T3x;
Chris@42 420 T7L = T5X + T5Z;
Chris@42 421 T60 = T5X - T5Z;
Chris@42 422 T5V = T3E - T3y;
Chris@42 423 T3F = T3y + T3E;
Chris@42 424 T5A = FNMS(TR, T3v, T5z);
Chris@42 425 }
Chris@42 426 {
Chris@42 427 E T2L, T53, T4Z, T2z, T7A, T5q, T2D, T51;
Chris@42 428 {
Chris@42 429 E T2q, T5n, T2y, T2A, T2C, T5p, T2B, T50;
Chris@42 430 {
Chris@42 431 E T2G, T2K, T2n, T5m, T2t, T5o;
Chris@42 432 {
Chris@42 433 E T2o, T2p, T5D, T7M;
Chris@42 434 T2n = cr[WS(rs, 1)];
Chris@42 435 T6P = T60 + T5V;
Chris@42 436 T61 = T5V - T60;
Chris@42 437 T7G = T3u - T3F;
Chris@42 438 T3G = T3u + T3F;
Chris@42 439 T5D = T5A - T5C;
Chris@42 440 T7M = T5A + T5C;
Chris@42 441 T2o = T2 * T2n;
Chris@42 442 T2p = ci[WS(rs, 1)];
Chris@42 443 T6M = T5y + T5D;
Chris@42 444 T5E = T5y - T5D;
Chris@42 445 T8n = T7L + T7M;
Chris@42 446 T7N = T7L - T7M;
Chris@42 447 T5m = T2 * T2p;
Chris@42 448 T2q = FMA(T5, T2p, T2o);
Chris@42 449 }
Chris@42 450 T2G = cr[WS(rs, 25)];
Chris@42 451 T2K = ci[WS(rs, 25)];
Chris@42 452 T5n = FNMS(T5, T2n, T5m);
Chris@42 453 {
Chris@42 454 E T2x, T2u, T2H, T52;
Chris@42 455 T2t = cr[WS(rs, 17)];
Chris@42 456 T2H = T2F * T2G;
Chris@42 457 T52 = T2F * T2K;
Chris@42 458 T2x = ci[WS(rs, 17)];
Chris@42 459 T2u = T2s * T2t;
Chris@42 460 T2L = FMA(T2J, T2K, T2H);
Chris@42 461 T53 = FNMS(T2J, T2G, T52);
Chris@42 462 T5o = T2s * T2x;
Chris@42 463 T2y = FMA(T2w, T2x, T2u);
Chris@42 464 }
Chris@42 465 T2A = cr[WS(rs, 9)];
Chris@42 466 T2C = ci[WS(rs, 9)];
Chris@42 467 T5p = FNMS(T2w, T2t, T5o);
Chris@42 468 }
Chris@42 469 T4Z = T2q - T2y;
Chris@42 470 T2z = T2q + T2y;
Chris@42 471 T2B = T8 * T2A;
Chris@42 472 T50 = T8 * T2C;
Chris@42 473 T7A = T5n + T5p;
Chris@42 474 T5q = T5n - T5p;
Chris@42 475 T2D = FMA(Tc, T2C, T2B);
Chris@42 476 T51 = FNMS(Tc, T2A, T50);
Chris@42 477 }
Chris@42 478 {
Chris@42 479 E T3N, T3K, T3O, T5H, T41, T5Q, T3R, T3U, T3W;
Chris@42 480 {
Chris@42 481 E T3H, T3I, T3J, T3Y, T40, T5G, T3Z, T5P;
Chris@42 482 T3H = cr[WS(rs, 3)];
Chris@42 483 {
Chris@42 484 E T5r, T2M, T54, T7B;
Chris@42 485 T5r = T2D - T2L;
Chris@42 486 T2M = T2D + T2L;
Chris@42 487 T54 = T51 - T53;
Chris@42 488 T7B = T51 + T53;
Chris@42 489 T6I = T5q - T5r;
Chris@42 490 T5s = T5q + T5r;
Chris@42 491 T7v = T2z - T2M;
Chris@42 492 T2N = T2z + T2M;
Chris@42 493 T6F = T4Z + T54;
Chris@42 494 T55 = T4Z - T54;
Chris@42 495 T8i = T7A + T7B;
Chris@42 496 T7C = T7A - T7B;
Chris@42 497 T3I = T3 * T3H;
Chris@42 498 }
Chris@42 499 T3J = ci[WS(rs, 3)];
Chris@42 500 T3Y = cr[WS(rs, 11)];
Chris@42 501 T40 = ci[WS(rs, 11)];
Chris@42 502 T3N = cr[WS(rs, 19)];
Chris@42 503 T3K = FMA(T6, T3J, T3I);
Chris@42 504 T5G = T3 * T3J;
Chris@42 505 T3Z = Td * T3Y;
Chris@42 506 T5P = Td * T40;
Chris@42 507 T3O = T3M * T3N;
Chris@42 508 T5H = FNMS(T6, T3H, T5G);
Chris@42 509 T41 = FMA(Th, T40, T3Z);
Chris@42 510 T5Q = FNMS(Th, T3Y, T5P);
Chris@42 511 T3R = ci[WS(rs, 19)];
Chris@42 512 T3U = cr[WS(rs, 27)];
Chris@42 513 T3W = ci[WS(rs, 27)];
Chris@42 514 }
Chris@42 515 {
Chris@42 516 E T2O, T2P, T2Q, T37, T39, T57, T38, T5g;
Chris@42 517 {
Chris@42 518 E T3T, T5F, T5J, T3X, T5O, T7I, T5K;
Chris@42 519 T2O = cr[WS(rs, 5)];
Chris@42 520 {
Chris@42 521 E T3S, T5I, T3V, T5N;
Chris@42 522 T3S = FMA(T3Q, T3R, T3O);
Chris@42 523 T5I = T3M * T3R;
Chris@42 524 T3V = Te * T3U;
Chris@42 525 T5N = Te * T3W;
Chris@42 526 T3T = T3K + T3S;
Chris@42 527 T5F = T3K - T3S;
Chris@42 528 T5J = FNMS(T3Q, T3N, T5I);
Chris@42 529 T3X = FMA(Ti, T3W, T3V);
Chris@42 530 T5O = FNMS(Ti, T3U, T5N);
Chris@42 531 T2P = T29 * T2O;
Chris@42 532 }
Chris@42 533 T7I = T5H + T5J;
Chris@42 534 T5K = T5H - T5J;
Chris@42 535 {
Chris@42 536 E T42, T5M, T7H, T5R;
Chris@42 537 T42 = T3X + T41;
Chris@42 538 T5M = T3X - T41;
Chris@42 539 T7H = T5O + T5Q;
Chris@42 540 T5R = T5O - T5Q;
Chris@42 541 T5L = T5F - T5K;
Chris@42 542 T63 = T5F + T5K;
Chris@42 543 T43 = T3T + T42;
Chris@42 544 T7O = T42 - T3T;
Chris@42 545 T5S = T5M + T5R;
Chris@42 546 T62 = T5M - T5R;
Chris@42 547 T7J = T7H - T7I;
Chris@42 548 T8o = T7I + T7H;
Chris@42 549 T2Q = ci[WS(rs, 5)];
Chris@42 550 }
Chris@42 551 }
Chris@42 552 T37 = cr[WS(rs, 13)];
Chris@42 553 T39 = ci[WS(rs, 13)];
Chris@42 554 T2U = cr[WS(rs, 21)];
Chris@42 555 T2R = FMA(T2b, T2Q, T2P);
Chris@42 556 T57 = T29 * T2Q;
Chris@42 557 T38 = T1R * T37;
Chris@42 558 T5g = T1R * T39;
Chris@42 559 T2V = T2T * T2U;
Chris@42 560 T58 = FNMS(T2b, T2O, T57);
Chris@42 561 T3a = FMA(T1U, T39, T38);
Chris@42 562 T5h = FNMS(T1U, T37, T5g);
Chris@42 563 T2Y = ci[WS(rs, 21)];
Chris@42 564 T32 = cr[WS(rs, 29)];
Chris@42 565 T35 = ci[WS(rs, 29)];
Chris@42 566 }
Chris@42 567 }
Chris@42 568 }
Chris@42 569 }
Chris@42 570 }
/* Every cr[]/ci[] input has been loaded by this point; the remainder of the loop body
 * only combines temporaries and stores the 64 results back into cr and ci. */
Chris@42 571 {
Chris@42 572 E T7e, T8T, T7D, T7y, T7h, T8U, T6s, T9o, T9n, T6v, T6Q, T6N, T6J, T6G, T6o;
Chris@42 573 E T6r;
Chris@42 574 {
Chris@42 575 E T5c, T5t, T5j, T5u, T8s, T8v;
Chris@42 576 {
Chris@42 577 E T8c, T1i, T8A, T8z, T8O, T8J, T8N, T2l, T8L, T45, T8t, T8l, T8u, T8q, T3c;
Chris@42 578 E T8p, T8k, T8w, T2m;
Chris@42 579 {
Chris@42 580 E T8x, T8y, T8j, T8C, T8I;
Chris@42 581 {
Chris@42 582 E TI, T30, T56, T5a, T36, T5f, T1h, T7x, T5b;
Chris@42 583 TI = Tq + TH;
Chris@42 584 T7e = Tq - TH;
Chris@42 585 {
Chris@42 586 E T2Z, T59, T33, T5e;
Chris@42 587 T2Z = FMA(T2X, T2Y, T2V);
Chris@42 588 T59 = T2T * T2Y;
Chris@42 589 T33 = T31 * T32;
Chris@42 590 T5e = T31 * T35;
Chris@42 591 T30 = T2R + T2Z;
Chris@42 592 T56 = T2R - T2Z;
Chris@42 593 T5a = FNMS(T2X, T2U, T59);
Chris@42 594 T36 = FMA(T34, T35, T33);
Chris@42 595 T5f = FNMS(T34, T32, T5e);
Chris@42 596 T1h = TZ + T1g;
Chris@42 597 T8T = TZ - T1g;
Chris@42 598 }
Chris@42 599 T7x = T58 + T5a;
Chris@42 600 T5b = T58 - T5a;
Chris@42 601 {
Chris@42 602 E T3b, T5d, T7w, T5i;
Chris@42 603 T3b = T36 + T3a;
Chris@42 604 T5d = T36 - T3a;
Chris@42 605 T7w = T5f + T5h;
Chris@42 606 T5i = T5f - T5h;
Chris@42 607 T5c = T56 - T5b;
Chris@42 608 T5t = T56 + T5b;
Chris@42 609 T3c = T30 + T3b;
Chris@42 610 T7D = T30 - T3b;
Chris@42 611 T5j = T5d + T5i;
Chris@42 612 T5u = T5i - T5d;
Chris@42 613 T7y = T7w - T7x;
Chris@42 614 T8j = T7x + T7w;
Chris@42 615 T8c = TI - T1h;
Chris@42 616 T1i = TI + T1h;
Chris@42 617 }
Chris@42 618 }
Chris@42 619 T8p = T8n - T8o;
Chris@42 620 T8x = T8n + T8o;
Chris@42 621 T8y = T8i + T8j;
Chris@42 622 T8k = T8i - T8j;
Chris@42 623 T7h = T7f - T7g;
Chris@42 624 T8C = T7g + T7f;
Chris@42 625 T8I = T8D + T8H;
Chris@42 626 T8U = T8H - T8D;
Chris@42 627 T8A = T8y + T8x;
Chris@42 628 T8z = T8x - T8y;
Chris@42 629 T8O = T8I - T8C;
Chris@42 630 T8J = T8C + T8I;
Chris@42 631 }
Chris@42 632 {
Chris@42 633 E T8h, T8m, T3d, T44;
Chris@42 634 T8h = T2N - T3c;
Chris@42 635 T3d = T2N + T3c;
Chris@42 636 T44 = T3G + T43;
Chris@42 637 T8m = T3G - T43;
Chris@42 638 T8N = T1J - T2k;
Chris@42 639 T2l = T1J + T2k;
Chris@42 640 T8L = T44 - T3d;
Chris@42 641 T45 = T3d + T44;
Chris@42 642 T8t = T8h - T8k;
Chris@42 643 T8l = T8h + T8k;
Chris@42 644 T8u = T8m + T8p;
Chris@42 645 T8q = T8m - T8p;
Chris@42 646 }
Chris@42 647 T8w = T1i - T2l;
Chris@42 648 T2m = T1i + T2l;
Chris@42 649 {
Chris@42 650 E T8Q, T8R, T8P, T8S;
Chris@42 651 {
Chris@42 652 E T8r, T8M, T8K, T8g, T8B, T8f;
Chris@42 653 T8Q = T8q - T8l;
Chris@42 654 T8r = T8l + T8q;
Chris@42 655 T8B = T8e + T8d;
Chris@42 656 T8f = T8d - T8e;
Chris@42 657 cr[0] = T2m + T45;
Chris@42 658 ci[WS(rs, 15)] = T2m - T45;
Chris@42 659 ci[WS(rs, 7)] = T8w + T8z;
Chris@42 660 cr[WS(rs, 8)] = T8w - T8z;
Chris@42 661 T8M = T8J - T8B;
Chris@42 662 T8K = T8B + T8J;
Chris@42 663 T8g = T8c - T8f;
Chris@42 664 T8s = T8c + T8f;
Chris@42 665 T8R = T8O - T8N;
Chris@42 666 T8P = T8N + T8O;
Chris@42 667 ci[WS(rs, 23)] = T8L + T8M;
Chris@42 668 cr[WS(rs, 24)] = T8L - T8M;
Chris@42 669 ci[WS(rs, 31)] = T8A + T8K;
Chris@42 670 cr[WS(rs, 16)] = T8A - T8K;
Chris@42 671 cr[WS(rs, 4)] = FMA(KP707106781, T8r, T8g);
Chris@42 672 ci[WS(rs, 11)] = FNMS(KP707106781, T8r, T8g);
Chris@42 673 }
Chris@42 674 T8S = T8u - T8t;
Chris@42 675 T8v = T8t + T8u;
Chris@42 676 ci[WS(rs, 19)] = FMA(KP707106781, T8Q, T8P);
Chris@42 677 cr[WS(rs, 28)] = FMS(KP707106781, T8Q, T8P);
Chris@42 678 ci[WS(rs, 27)] = FMA(KP707106781, T8S, T8R);
Chris@42 679 cr[WS(rs, 20)] = FMS(KP707106781, T8S, T8R);
Chris@42 680 }
Chris@42 681 }
Chris@42 682 {
Chris@42 683 E T6c, T4s, T9c, T4X, T9h, T9b, T9i, T6f, T5l, T6h, T6m, T6q, T6a, T66, T5v;
Chris@42 684 {
Chris@42 685 E T6d, T4H, T4W, T6e, T99, T9a, T4c, T4r, T5T, T64;
Chris@42 686 T6s = T46 + T4b;
Chris@42 687 T4c = T46 - T4b;
Chris@42 688 T4r = T4j + T4q;
Chris@42 689 T9o = T4q - T4j;
Chris@42 690 T6d = FNMS(KP414213562, T4z, T4G);
Chris@42 691 T4H = FMA(KP414213562, T4G, T4z);
Chris@42 692 ci[WS(rs, 3)] = FMA(KP707106781, T8v, T8s);
Chris@42 693 cr[WS(rs, 12)] = FNMS(KP707106781, T8v, T8s);
Chris@42 694 T6c = FMA(KP707106781, T4r, T4c);
Chris@42 695 T4s = FNMS(KP707106781, T4r, T4c);
Chris@42 696 T4W = FNMS(KP414213562, T4V, T4O);
Chris@42 697 T6e = FMA(KP414213562, T4O, T4V);
Chris@42 698 T9n = T98 - T97;
Chris@42 699 T99 = T97 + T98;
Chris@42 700 T9a = T6t - T6u;
Chris@42 701 T6v = T6t + T6u;
Chris@42 702 T9c = T4H + T4W;
Chris@42 703 T4X = T4H - T4W;
Chris@42 704 T9h = FNMS(KP707106781, T9a, T99);
Chris@42 705 T9b = FMA(KP707106781, T9a, T99);
Chris@42 706 T6Q = T5S - T5L;
Chris@42 707 T5T = T5L + T5S;
Chris@42 708 T64 = T62 - T63;
Chris@42 709 T6N = T63 + T62;
Chris@42 710 {
Chris@42 711 E T6k, T5U, T6l, T65, T5k;
Chris@42 712 T6J = T5j - T5c;
Chris@42 713 T5k = T5c + T5j;
Chris@42 714 T9i = T6e - T6d;
Chris@42 715 T6f = T6d + T6e;
Chris@42 716 T6k = FMA(KP707106781, T5T, T5E);
Chris@42 717 T5U = FNMS(KP707106781, T5T, T5E);
Chris@42 718 T6l = FMA(KP707106781, T64, T61);
Chris@42 719 T65 = FNMS(KP707106781, T64, T61);
Chris@42 720 T5l = FNMS(KP707106781, T5k, T55);
Chris@42 721 T6h = FMA(KP707106781, T5k, T55);
Chris@42 722 T6m = FNMS(KP198912367, T6l, T6k);
Chris@42 723 T6q = FMA(KP198912367, T6k, T6l);
Chris@42 724 T6a = FNMS(KP668178637, T5U, T65);
Chris@42 725 T66 = FMA(KP668178637, T65, T5U);
Chris@42 726 T5v = T5t + T5u;
Chris@42 727 T6G = T5t - T5u;
Chris@42 728 }
Chris@42 729 }
Chris@42 730 {
Chris@42 731 E T68, T4Y, T9j, T9l, T6i, T5w;
Chris@42 732 T68 = FNMS(KP923879532, T4X, T4s);
Chris@42 733 T4Y = FMA(KP923879532, T4X, T4s);
Chris@42 734 T9j = FMA(KP923879532, T9i, T9h);
Chris@42 735 T9l = FNMS(KP923879532, T9i, T9h);
Chris@42 736 T6i = FMA(KP707106781, T5v, T5s);
Chris@42 737 T5w = FNMS(KP707106781, T5v, T5s);
Chris@42 738 {
Chris@42 739 E T9g, T9f, T9d, T9e;
Chris@42 740 {
Chris@42 741 E T6g, T6p, T69, T5x, T6n, T6j;
Chris@42 742 T6o = FNMS(KP923879532, T6f, T6c);
Chris@42 743 T6g = FMA(KP923879532, T6f, T6c);
Chris@42 744 T6j = FNMS(KP198912367, T6i, T6h);
Chris@42 745 T6p = FMA(KP198912367, T6h, T6i);
Chris@42 746 T69 = FNMS(KP668178637, T5l, T5w);
Chris@42 747 T5x = FMA(KP668178637, T5w, T5l);
Chris@42 748 T6n = T6j + T6m;
Chris@42 749 T9g = T6m - T6j;
Chris@42 750 T9f = FNMS(KP923879532, T9c, T9b);
Chris@42 751 T9d = FMA(KP923879532, T9c, T9b);
Chris@42 752 {
Chris@42 753 E T6b, T9k, T9m, T67;
Chris@42 754 T6b = T69 + T6a;
Chris@42 755 T9k = T69 - T6a;
Chris@42 756 T9m = T66 - T5x;
Chris@42 757 T67 = T5x + T66;
Chris@42 758 ci[0] = FMA(KP980785280, T6n, T6g);
Chris@42 759 cr[WS(rs, 15)] = FNMS(KP980785280, T6n, T6g);
Chris@42 760 ci[WS(rs, 4)] = FNMS(KP831469612, T6b, T68);
Chris@42 761 cr[WS(rs, 11)] = FMA(KP831469612, T6b, T68);
Chris@42 762 ci[WS(rs, 28)] = FMA(KP831469612, T9k, T9j);
Chris@42 763 cr[WS(rs, 19)] = FMS(KP831469612, T9k, T9j);
Chris@42 764 ci[WS(rs, 20)] = FMA(KP831469612, T9m, T9l);
Chris@42 765 cr[WS(rs, 27)] = FMS(KP831469612, T9m, T9l);
Chris@42 766 cr[WS(rs, 3)] = FMA(KP831469612, T67, T4Y);
Chris@42 767 ci[WS(rs, 12)] = FNMS(KP831469612, T67, T4Y);
Chris@42 768 T9e = T6q - T6p;
Chris@42 769 T6r = T6p + T6q;
Chris@42 770 }
Chris@42 771 }
Chris@42 772 ci[WS(rs, 16)] = FMA(KP980785280, T9e, T9d);
Chris@42 773 cr[WS(rs, 31)] = FMS(KP980785280, T9e, T9d);
Chris@42 774 ci[WS(rs, 24)] = FMA(KP980785280, T9g, T9f);
Chris@42 775 cr[WS(rs, 23)] = FMS(KP980785280, T9g, T9f);
Chris@42 776 }
Chris@42 777 }
Chris@42 778 }
Chris@42 779 }
Chris@42 780 {
Chris@42 781 E T88, T90, T8Z, T8b;
Chris@42 782 {
Chris@42 783 E T7K, T7W, T7i, T7P, T8a, T86, T91, T8V, T8W, T7t, T7U, T7F, T92, T7Z, T89;
Chris@42 784 E T83;
Chris@42 785 {
Chris@42 786 E T7X, T7n, T7s, T7Y, T84, T85;
Chris@42 787 T7K = T7G - T7J;
Chris@42 788 T84 = T7G + T7J;
Chris@42 789 cr[WS(rs, 7)] = FMA(KP980785280, T6r, T6o);
Chris@42 790 ci[WS(rs, 8)] = FNMS(KP980785280, T6r, T6o);
Chris@42 791 T7W = T7e + T7h;
Chris@42 792 T7i = T7e - T7h;
Chris@42 793 T85 = T7O - T7N;
Chris@42 794 T7P = T7N + T7O;
Chris@42 795 T7X = T7j - T7m;
Chris@42 796 T7n = T7j + T7m;
Chris@42 797 T8a = FMA(KP414213562, T84, T85);
Chris@42 798 T86 = FNMS(KP414213562, T85, T84);
Chris@42 799 T91 = T8U - T8T;
Chris@42 800 T8V = T8T + T8U;
Chris@42 801 T7s = T7o - T7r;
Chris@42 802 T7Y = T7o + T7r;
Chris@42 803 {
Chris@42 804 E T81, T82, T7z, T7E;
Chris@42 805 T81 = T7v + T7y;
Chris@42 806 T7z = T7v - T7y;
Chris@42 807 T7E = T7C - T7D;
Chris@42 808 T82 = T7C + T7D;
Chris@42 809 T8W = T7n - T7s;
Chris@42 810 T7t = T7n + T7s;
Chris@42 811 T7U = FNMS(KP414213562, T7z, T7E);
Chris@42 812 T7F = FMA(KP414213562, T7E, T7z);
Chris@42 813 T92 = T7Y - T7X;
Chris@42 814 T7Z = T7X + T7Y;
Chris@42 815 T89 = FMA(KP414213562, T81, T82);
Chris@42 816 T83 = FNMS(KP414213562, T82, T81);
Chris@42 817 }
Chris@42 818 }
Chris@42 819 {
Chris@42 820 E T7S, T7u, T93, T95, T7T, T7Q;
Chris@42 821 T7S = FNMS(KP707106781, T7t, T7i);
Chris@42 822 T7u = FMA(KP707106781, T7t, T7i);
Chris@42 823 T93 = FMA(KP707106781, T92, T91);
Chris@42 824 T95 = FNMS(KP707106781, T92, T91);
Chris@42 825 T7T = FMA(KP414213562, T7K, T7P);
Chris@42 826 T7Q = FNMS(KP414213562, T7P, T7K);
Chris@42 827 {
Chris@42 828 E T80, T87, T8X, T8Y;
Chris@42 829 T88 = FNMS(KP707106781, T7Z, T7W);
Chris@42 830 T80 = FMA(KP707106781, T7Z, T7W);
Chris@42 831 {
Chris@42 832 E T7V, T94, T96, T7R;
Chris@42 833 T7V = T7T - T7U;
Chris@42 834 T94 = T7U + T7T;
Chris@42 835 T96 = T7Q - T7F;
Chris@42 836 T7R = T7F + T7Q;
Chris@42 837 ci[WS(rs, 5)] = FMA(KP923879532, T7V, T7S);
Chris@42 838 cr[WS(rs, 10)] = FNMS(KP923879532, T7V, T7S);
Chris@42 839 ci[WS(rs, 29)] = FMA(KP923879532, T94, T93);
Chris@42 840 cr[WS(rs, 18)] = FMS(KP923879532, T94, T93);
Chris@42 841 ci[WS(rs, 21)] = FMA(KP923879532, T96, T95);
Chris@42 842 cr[WS(rs, 26)] = FMS(KP923879532, T96, T95);
Chris@42 843 cr[WS(rs, 2)] = FMA(KP923879532, T7R, T7u);
Chris@42 844 ci[WS(rs, 13)] = FNMS(KP923879532, T7R, T7u);
Chris@42 845 T87 = T83 + T86;
Chris@42 846 T90 = T86 - T83;
Chris@42 847 }
Chris@42 848 T8Z = FNMS(KP707106781, T8W, T8V);
Chris@42 849 T8X = FMA(KP707106781, T8W, T8V);
Chris@42 850 T8Y = T8a - T89;
Chris@42 851 T8b = T89 + T8a;
Chris@42 852 ci[WS(rs, 1)] = FMA(KP923879532, T87, T80);
Chris@42 853 cr[WS(rs, 14)] = FNMS(KP923879532, T87, T80);
Chris@42 854 ci[WS(rs, 17)] = FMA(KP923879532, T8Y, T8X);
Chris@42 855 cr[WS(rs, 30)] = FMS(KP923879532, T8Y, T8X);
Chris@42 856 }
Chris@42 857 }
Chris@42 858 }
Chris@42 859 {
Chris@42 860 E T6Y, T6w, T9w, T6D, T9v, T9p, T9q, T71, T6O, T76;
Chris@42 861 {
Chris@42 862 E T70, T6Z, T6z, T6C;
Chris@42 863 ci[WS(rs, 25)] = FMA(KP923879532, T90, T8Z);
Chris@42 864 cr[WS(rs, 22)] = FMS(KP923879532, T90, T8Z);
Chris@42 865 cr[WS(rs, 6)] = FMA(KP923879532, T8b, T88);
Chris@42 866 ci[WS(rs, 9)] = FNMS(KP923879532, T8b, T88);
Chris@42 867 T70 = FNMS(KP414213562, T6x, T6y);
Chris@42 868 T6z = FMA(KP414213562, T6y, T6x);
Chris@42 869 T6C = FNMS(KP414213562, T6B, T6A);
Chris@42 870 T6Z = FMA(KP414213562, T6A, T6B);
Chris@42 871 T6Y = FNMS(KP707106781, T6v, T6s);
Chris@42 872 T6w = FMA(KP707106781, T6v, T6s);
Chris@42 873 T9w = T6z - T6C;
Chris@42 874 T6D = T6z + T6C;
Chris@42 875 T9v = FNMS(KP707106781, T9o, T9n);
Chris@42 876 T9p = FMA(KP707106781, T9o, T9n);
Chris@42 877 T9q = T70 + T6Z;
Chris@42 878 T71 = T6Z - T70;
Chris@42 879 T6O = FMA(KP707106781, T6N, T6M);
Chris@42 880 T76 = FNMS(KP707106781, T6N, T6M);
Chris@42 881 }
Chris@42 882 {
Chris@42 883 E T6U, T9u, T79, T6X, T9s, T9t, T9r, T72;
Chris@42 884 {
Chris@42 885 E T6E, T78, T6V, T6S, T75, T6W, T6L, T9x, T9z, T9y, T6T, T9A;
Chris@42 886 {
Chris@42 887 E T7c, T7b, T77, T6R;
Chris@42 888 T6U = FNMS(KP923879532, T6D, T6w);
Chris@42 889 T6E = FMA(KP923879532, T6D, T6w);
Chris@42 890 T77 = FNMS(KP707106781, T6Q, T6P);
Chris@42 891 T6R = FMA(KP707106781, T6Q, T6P);
Chris@42 892 {
Chris@42 893 E T73, T6H, T74, T6K;
Chris@42 894 T73 = FNMS(KP707106781, T6G, T6F);
Chris@42 895 T6H = FMA(KP707106781, T6G, T6F);
Chris@42 896 T74 = FNMS(KP707106781, T6J, T6I);
Chris@42 897 T6K = FMA(KP707106781, T6J, T6I);
Chris@42 898 T78 = FMA(KP668178637, T77, T76);
Chris@42 899 T7c = FNMS(KP668178637, T76, T77);
Chris@42 900 T6V = FMA(KP198912367, T6O, T6R);
Chris@42 901 T6S = FNMS(KP198912367, T6R, T6O);
Chris@42 902 T75 = FNMS(KP668178637, T74, T73);
Chris@42 903 T7b = FMA(KP668178637, T73, T74);
Chris@42 904 T6W = FNMS(KP198912367, T6H, T6K);
Chris@42 905 T6L = FMA(KP198912367, T6K, T6H);
Chris@42 906 }
Chris@42 907 T9x = FMA(KP923879532, T9w, T9v);
Chris@42 908 T9z = FNMS(KP923879532, T9w, T9v);
Chris@42 909 T7d = T7b - T7c;
Chris@42 910 T9y = T7b + T7c;
Chris@42 911 }
Chris@42 912 T9u = T6S - T6L;
Chris@42 913 T6T = T6L + T6S;
Chris@42 914 T9A = T78 - T75;
Chris@42 915 T79 = T75 + T78;
Chris@42 916 ci[WS(rs, 18)] = FNMS(KP831469612, T9y, T9x);
Chris@42 917 cr[WS(rs, 29)] = -(FMA(KP831469612, T9y, T9x));
Chris@42 918 cr[WS(rs, 1)] = FMA(KP980785280, T6T, T6E);
Chris@42 919 ci[WS(rs, 14)] = FNMS(KP980785280, T6T, T6E);
Chris@42 920 cr[WS(rs, 21)] = FMS(KP831469612, T9A, T9z);
Chris@42 921 ci[WS(rs, 26)] = FMA(KP831469612, T9A, T9z);
Chris@42 922 T6X = T6V - T6W;
Chris@42 923 T9s = T6W + T6V;
Chris@42 924 }
Chris@42 925 T7a = FNMS(KP923879532, T71, T6Y);
Chris@42 926 T72 = FMA(KP923879532, T71, T6Y);
Chris@42 927 T9t = FNMS(KP923879532, T9q, T9p);
Chris@42 928 T9r = FMA(KP923879532, T9q, T9p);
Chris@42 929 ci[WS(rs, 6)] = FMA(KP980785280, T6X, T6U);
Chris@42 930 cr[WS(rs, 9)] = FNMS(KP980785280, T6X, T6U);
Chris@42 931 ci[WS(rs, 2)] = FMA(KP831469612, T79, T72);
Chris@42 932 cr[WS(rs, 13)] = FNMS(KP831469612, T79, T72);
Chris@42 933 ci[WS(rs, 30)] = FMA(KP980785280, T9s, T9r);
Chris@42 934 cr[WS(rs, 17)] = FMS(KP980785280, T9s, T9r);
Chris@42 935 ci[WS(rs, 22)] = FMA(KP980785280, T9u, T9t);
Chris@42 936 cr[WS(rs, 25)] = FMS(KP980785280, T9u, T9t);
Chris@42 937 }
Chris@42 938 }
Chris@42 939 }
Chris@42 940 }
Chris@42 941 }
Chris@42 942 }
/* Last output pair: T7d and T7a are declared at loop scope so they are still
 * visible here, after the nested blocks that computed them have closed. */
Chris@42 943 cr[WS(rs, 5)] = FMA(KP831469612, T7d, T7a);
Chris@42 944 ci[WS(rs, 10)] = FNMS(KP831469612, T7d, T7a);
Chris@42 945 }
Chris@42 946 }
Chris@42 947 }
Chris@42 948
/*
 * Twiddle instruction table for the HAVE_FMA build of hf2_32; it is passed
 * to the codelet descriptor `desc` below.  It holds four TW_CEXP entries
 * whose last fields are 1, 3, 9 and 27, followed by one TW_NEXT entry.
 * NOTE(review): tw_instr, TW_CEXP and TW_NEXT are declared outside this
 * file; presumably the four TW_CEXP entries name the twiddle powers stored
 * per step (the generator was run with -twiddle-log3) and TW_NEXT closes
 * the list -- confirm against the tw_instr definition.
 */
Chris@42 949 static const tw_instr twinstr[] = {
Chris@42 950 {TW_CEXP, 1, 1},
Chris@42 951 {TW_CEXP, 1, 3},
Chris@42 952 {TW_CEXP, 1, 9},
Chris@42 953 {TW_CEXP, 1, 27},
Chris@42 954 {TW_NEXT, 1, 0}
Chris@42 955 };
Chris@42 956
/*
 * Codelet descriptor for the HAVE_FMA build: the value 32, the name string
 * "hf2_32", the twiddle table, &GENUS, and the counts {236, 98, 252, 0}.
 * The first three counts equal the "236 additions, 98 multiplications,
 * 252 fused multiply/add" figures quoted in this branch's generated header
 * comment.  NOTE(review): the meaning of the leading 32 (presumably the
 * transform size, cf. "-n 32") and of the fourth count is defined by
 * hc2hc_desc, which is not visible here -- confirm.
 */
Chris@42 957 static const hc2hc_desc desc = { 32, "hf2_32", twinstr, &GENUS, {236, 98, 252, 0} };
Chris@42 958
/*
 * Registration entry point (HAVE_FMA build): hands the hf2_32 kernel and
 * its descriptor to X(khc2hc_register) for the planner `p`.
 */
Chris@42 959 void X(codelet_hf2_32) (planner *p) {
Chris@42 960 X(khc2hc_register) (p, hf2_32, &desc);
Chris@42 961 }
Chris@42 962 #else /* HAVE_FMA */
Chris@42 963
Chris@42 964 /* Generated by: ../../../genfft/gen_hc2hc.native -compact -variables 4 -pipeline-latency 4 -twiddle-log3 -precompute-twiddles -n 32 -dit -name hf2_32 -include hf.h */
Chris@42 965
Chris@42 966 /*
Chris@42 967 * This function contains 488 FP additions, 280 FP multiplications,
Chris@42 968 * (or, 376 additions, 168 multiplications, 112 fused multiply/add),
Chris@42 969 * 158 stack variables, 7 constants, and 128 memory accesses
Chris@42 970 */
Chris@42 971 #include "hf.h"
Chris@42 972
/*
 * hf2_32 (build without HAVE_FMA): machine-generated kernel; see the
 * "Generated by" comment above for the generator flags (-n 32, -dit,
 * -twiddle-log3).  Do not hand-edit the arithmetic.
 *
 * Parameters, as used by the code below:
 *   cr, ci  - arrays that are read and then overwritten in place; elements
 *             are addressed as cr[WS(rs, k)] / ci[WS(rs, k)] for k = 0..31
 *             (k = 0 is written as cr[0] / ci[0]).
 *   W       - read-only coefficient table; eight values W[0..7] are read
 *             per loop iteration.
 *   rs      - stride handed to WS() to form element offsets.
 *   mb, me  - the loop runs for m = mb, mb + 1, ..., me - 1.
 *   ms      - per-iteration step: cr advances by ms, ci moves back by ms.
 *
 * NOTE(review): FMA, FNMS, WS, DK, E, R, INT and MAKE_VOLATILE_STRIDE are
 * defined outside this file.  The comments below assume the usual
 * FMA(a, b, c) = a * b + c and FNMS(a, b, c) = c - a * b -- confirm in the
 * project headers.
 */
Chris@42 973 static void hf2_32(R *cr, R *ci, const R *W, stride rs, INT mb, INT me, INT ms)
Chris@42 974 {
/*
 * Numeric constants.  By value they are sin/cos(3*pi/16) (KP555570233,
 * KP831469612), cos/sin(pi/16) (KP980785280, KP195090322), sin/cos(pi/8)
 * (KP382683432, KP923879532) and sqrt(1/2) (KP707106781).
 */
Chris@42 975 DK(KP555570233, +0.555570233019602224742830813948532874374937191);
Chris@42 976 DK(KP831469612, +0.831469612302545237078788377617905756738560812);
Chris@42 977 DK(KP980785280, +0.980785280403230449126182236134239036973933731);
Chris@42 978 DK(KP195090322, +0.195090322016128267848284868477022240927691618);
Chris@42 979 DK(KP382683432, +0.382683432365089771728459984030398866761344562);
Chris@42 980 DK(KP923879532, +0.923879532511286756128183189396788286822416626);
Chris@42 981 DK(KP707106781, +0.707106781186547524400844362104849039284835938);
Chris@42 982 {
Chris@42 983 INT m;
/*
 * W is first offset by (mb - 1) * 8 and then advanced by 8 per iteration,
 * matching the eight W values read in the body.
 */
Chris@42 984 for (m = mb, W = W + ((mb - 1) * 8); m < me; m = m + 1, cr = cr + ms, ci = ci - ms, W = W + 8, MAKE_VOLATILE_STRIDE(64, rs)) {
/* Coefficients that stay live for the whole iteration. */
Chris@42 985 E T2, T5, T3, T6, T8, TM, TO, Td, T9, Te, Th, Tl, TD, TH, T1y;
Chris@42 986 E T1H, T15, T1A, T11, T1F, T1n, T1p, T2q, T2I, T2u, T2K, T2V, T3b, T2Z, T3d;
Chris@42 987 E Tu, Ty, T3l, T3n, T1t, T1v, T2f, T2h, T1a, T1e, T32, T34, T1W, T1Y, T2C;
Chris@42 988 E Tg, TR, Tk, TS, Tm, TV, To, TT, T1M, T21, T1P, T22, T1Q, T25;
Chris@42 989 E T1S, T23;
/*
 * Coefficient set-up: read W[0..7] as four pairs (T2,T5), (T3,T6),
 * (T9,Te), (Th,Tl) and build further coefficient pairs from sums and
 * differences of their pairwise products.  Presumably this derives the
 * remaining twiddle factors from the four stored ones (-twiddle-log3).
 */
Chris@42 990 {
Chris@42 991 E Ts, T1d, Tx, T18, Tt, T1c, Tw, T19, TB, T14, TG, TZ, TC, T13, TF;
Chris@42 992 E T10;
Chris@42 993 {
Chris@42 994 E T4, Tc, T7, Tb;
Chris@42 995 T2 = W[0];
Chris@42 996 T5 = W[1];
Chris@42 997 T3 = W[2];
Chris@42 998 T6 = W[3];
Chris@42 999 T4 = T2 * T3;
Chris@42 1000 Tc = T5 * T3;
Chris@42 1001 T7 = T5 * T6;
Chris@42 1002 Tb = T2 * T6;
Chris@42 1003 T8 = T4 + T7;
Chris@42 1004 TM = T4 - T7;
Chris@42 1005 TO = Tb + Tc;
Chris@42 1006 Td = Tb - Tc;
Chris@42 1007 T9 = W[4];
Chris@42 1008 Ts = T2 * T9;
Chris@42 1009 T1d = T6 * T9;
Chris@42 1010 Tx = T5 * T9;
Chris@42 1011 T18 = T3 * T9;
Chris@42 1012 Te = W[5];
Chris@42 1013 Tt = T5 * Te;
Chris@42 1014 T1c = T3 * Te;
Chris@42 1015 Tw = T2 * Te;
Chris@42 1016 T19 = T6 * Te;
Chris@42 1017 Th = W[6];
Chris@42 1018 TB = T3 * Th;
Chris@42 1019 T14 = T5 * Th;
Chris@42 1020 TG = T6 * Th;
Chris@42 1021 TZ = T2 * Th;
Chris@42 1022 Tl = W[7];
Chris@42 1023 TC = T6 * Tl;
Chris@42 1024 T13 = T2 * Tl;
Chris@42 1025 TF = T3 * Tl;
Chris@42 1026 T10 = T5 * Tl;
Chris@42 1027 }
Chris@42 1028 TD = TB + TC;
Chris@42 1029 TH = TF - TG;
Chris@42 1030 T1y = TZ + T10;
Chris@42 1031 T1H = TF + TG;
Chris@42 1032 T15 = T13 + T14;
Chris@42 1033 T1A = T13 - T14;
Chris@42 1034 T11 = TZ - T10;
Chris@42 1035 T1F = TB - TC;
Chris@42 1036 T1n = FMA(T9, Th, Te * Tl);
Chris@42 1037 T1p = FNMS(Te, Th, T9 * Tl);
Chris@42 1038 {
Chris@42 1039 E T2o, T2p, T2s, T2t;
Chris@42 1040 T2o = T8 * Th;
Chris@42 1041 T2p = Td * Tl;
Chris@42 1042 T2q = T2o + T2p;
Chris@42 1043 T2I = T2o - T2p;
Chris@42 1044 T2s = T8 * Tl;
Chris@42 1045 T2t = Td * Th;
Chris@42 1046 T2u = T2s - T2t;
Chris@42 1047 T2K = T2s + T2t;
Chris@42 1048 }
Chris@42 1049 {
Chris@42 1050 E T2T, T2U, T2X, T2Y;
Chris@42 1051 T2T = TM * Th;
Chris@42 1052 T2U = TO * Tl;
Chris@42 1053 T2V = T2T - T2U;
Chris@42 1054 T3b = T2T + T2U;
Chris@42 1055 T2X = TM * Tl;
Chris@42 1056 T2Y = TO * Th;
Chris@42 1057 T2Z = T2X + T2Y;
Chris@42 1058 T3d = T2X - T2Y;
Chris@42 1059 Tu = Ts + Tt;
Chris@42 1060 Ty = Tw - Tx;
Chris@42 1061 T3l = FMA(Tu, Th, Ty * Tl);
Chris@42 1062 T3n = FNMS(Ty, Th, Tu * Tl);
Chris@42 1063 }
Chris@42 1064 T1t = Ts - Tt;
Chris@42 1065 T1v = Tw + Tx;
Chris@42 1066 T2f = FMA(T1t, Th, T1v * Tl);
Chris@42 1067 T2h = FNMS(T1v, Th, T1t * Tl);
Chris@42 1068 T1a = T18 - T19;
Chris@42 1069 T1e = T1c + T1d;
Chris@42 1070 T32 = FMA(T1a, Th, T1e * Tl);
Chris@42 1071 T34 = FNMS(T1e, Th, T1a * Tl);
Chris@42 1072 T1W = T18 + T19;
Chris@42 1073 T1Y = T1c - T1d;
Chris@42 1074 T2C = FMA(T1W, Th, T1Y * Tl);
Chris@42 1075 T2E = FNMS(T1Y, Th, T1W * Tl);
Chris@42 1076 {
Chris@42 1077 E Ta, Tf, Ti, Tj;
Chris@42 1078 Ta = T8 * T9;
Chris@42 1079 Tf = Td * Te;
Chris@42 1080 Tg = Ta - Tf;
Chris@42 1081 TR = Ta + Tf;
Chris@42 1082 Ti = T8 * Te;
Chris@42 1083 Tj = Td * T9;
Chris@42 1084 Tk = Ti + Tj;
Chris@42 1085 TS = Ti - Tj;
Chris@42 1086 }
Chris@42 1087 Tm = FMA(Tg, Th, Tk * Tl);
Chris@42 1088 TV = FNMS(TS, Th, TR * Tl);
Chris@42 1089 To = FNMS(Tk, Th, Tg * Tl);
Chris@42 1090 TT = FMA(TR, Th, TS * Tl);
Chris@42 1091 {
Chris@42 1092 E T1K, T1L, T1N, T1O;
Chris@42 1093 T1K = TM * T9;
Chris@42 1094 T1L = TO * Te;
Chris@42 1095 T1M = T1K - T1L;
Chris@42 1096 T21 = T1K + T1L;
Chris@42 1097 T1N = TM * Te;
Chris@42 1098 T1O = TO * T9;
Chris@42 1099 T1P = T1N + T1O;
Chris@42 1100 T22 = T1N - T1O;
Chris@42 1101 }
Chris@42 1102 T1Q = FMA(T1M, Th, T1P * Tl);
Chris@42 1103 T25 = FNMS(T22, Th, T21 * Tl);
Chris@42 1104 T1S = FNMS(T1P, Th, T1M * Tl);
Chris@42 1105 T23 = FMA(T21, Th, T22 * Tl);
Chris@42 1106 }
/*
 * Data path.  Each group below loads four (cr[k], ci[k]) pairs, combines
 * each pair with a coefficient pair through FMA/FNMS (the shape of a
 * complex multiply, given the assumed macro definitions) and forms sums
 * and differences that are kept for the store groups further down.
 */
Chris@42 1107 {
Chris@42 1108 E TL, T6f, T8c, T8q, T3F, T5t, T7I, T7W, T2y, T6B, T6y, T7j, T4k, T5G, T4B;
Chris@42 1109 E T5J, T3h, T6H, T6O, T7o, T4L, T5Q, T52, T5N, T1i, T7V, T6i, T7D, T3K, T5u;
Chris@42 1110 E T3P, T5v, T1E, T6k, T6n, T7f, T3W, T5z, T41, T5y, T29, T6p, T6s, T7e, T47;
Chris@42 1111 E T5C, T4c, T5B, T2R, T6z, T6E, T7k, T4v, T5K, T4E, T5H, T3y, T6P, T6K, T7p;
Chris@42 1112 E T4W, T5O, T55, T5R;
/* Inputs k = 0, 16, 8, 24; cr[0] and ci[0] are used without a coefficient. */
Chris@42 1113 {
Chris@42 1114 E T1, T7G, Tq, T7F, TA, T3C, TJ, T3D, Tn, Tp;
Chris@42 1115 T1 = cr[0];
Chris@42 1116 T7G = ci[0];
Chris@42 1117 Tn = cr[WS(rs, 16)];
Chris@42 1118 Tp = ci[WS(rs, 16)];
Chris@42 1119 Tq = FMA(Tm, Tn, To * Tp);
Chris@42 1120 T7F = FNMS(To, Tn, Tm * Tp);
Chris@42 1121 {
Chris@42 1122 E Tv, Tz, TE, TI;
Chris@42 1123 Tv = cr[WS(rs, 8)];
Chris@42 1124 Tz = ci[WS(rs, 8)];
Chris@42 1125 TA = FMA(Tu, Tv, Ty * Tz);
Chris@42 1126 T3C = FNMS(Ty, Tv, Tu * Tz);
Chris@42 1127 TE = cr[WS(rs, 24)];
Chris@42 1128 TI = ci[WS(rs, 24)];
Chris@42 1129 TJ = FMA(TD, TE, TH * TI);
Chris@42 1130 T3D = FNMS(TH, TE, TD * TI);
Chris@42 1131 }
Chris@42 1132 {
Chris@42 1133 E Tr, TK, T8a, T8b;
Chris@42 1134 Tr = T1 + Tq;
Chris@42 1135 TK = TA + TJ;
Chris@42 1136 TL = Tr + TK;
Chris@42 1137 T6f = Tr - TK;
Chris@42 1138 T8a = TA - TJ;
Chris@42 1139 T8b = T7G - T7F;
Chris@42 1140 T8c = T8a + T8b;
Chris@42 1141 T8q = T8b - T8a;
Chris@42 1142 }
Chris@42 1143 {
Chris@42 1144 E T3B, T3E, T7E, T7H;
Chris@42 1145 T3B = T1 - Tq;
Chris@42 1146 T3E = T3C - T3D;
Chris@42 1147 T3F = T3B + T3E;
Chris@42 1148 T5t = T3B - T3E;
Chris@42 1149 T7E = T3C + T3D;
Chris@42 1150 T7H = T7F + T7G;
Chris@42 1151 T7I = T7E + T7H;
Chris@42 1152 T7W = T7H - T7E;
Chris@42 1153 }
Chris@42 1154 }
/* Inputs k = 1, 25, 17, 9. */
Chris@42 1155 {
Chris@42 1156 E T2e, T4x, T2w, T4i, T2j, T4y, T2n, T4h;
Chris@42 1157 {
Chris@42 1158 E T2c, T2d, T2r, T2v;
Chris@42 1159 T2c = cr[WS(rs, 1)];
Chris@42 1160 T2d = ci[WS(rs, 1)];
Chris@42 1161 T2e = FMA(T2, T2c, T5 * T2d);
Chris@42 1162 T4x = FNMS(T5, T2c, T2 * T2d);
Chris@42 1163 T2r = cr[WS(rs, 25)];
Chris@42 1164 T2v = ci[WS(rs, 25)];
Chris@42 1165 T2w = FMA(T2q, T2r, T2u * T2v);
Chris@42 1166 T4i = FNMS(T2u, T2r, T2q * T2v);
Chris@42 1167 }
Chris@42 1168 {
Chris@42 1169 E T2g, T2i, T2l, T2m;
Chris@42 1170 T2g = cr[WS(rs, 17)];
Chris@42 1171 T2i = ci[WS(rs, 17)];
Chris@42 1172 T2j = FMA(T2f, T2g, T2h * T2i);
Chris@42 1173 T4y = FNMS(T2h, T2g, T2f * T2i);
Chris@42 1174 T2l = cr[WS(rs, 9)];
Chris@42 1175 T2m = ci[WS(rs, 9)];
Chris@42 1176 T2n = FMA(T9, T2l, Te * T2m);
Chris@42 1177 T4h = FNMS(Te, T2l, T9 * T2m);
Chris@42 1178 }
Chris@42 1179 {
Chris@42 1180 E T2k, T2x, T6w, T6x;
Chris@42 1181 T2k = T2e + T2j;
Chris@42 1182 T2x = T2n + T2w;
Chris@42 1183 T2y = T2k + T2x;
Chris@42 1184 T6B = T2k - T2x;
Chris@42 1185 T6w = T4x + T4y;
Chris@42 1186 T6x = T4h + T4i;
Chris@42 1187 T6y = T6w - T6x;
Chris@42 1188 T7j = T6w + T6x;
Chris@42 1189 }
Chris@42 1190 {
Chris@42 1191 E T4g, T4j, T4z, T4A;
Chris@42 1192 T4g = T2e - T2j;
Chris@42 1193 T4j = T4h - T4i;
Chris@42 1194 T4k = T4g + T4j;
Chris@42 1195 T5G = T4g - T4j;
Chris@42 1196 T4z = T4x - T4y;
Chris@42 1197 T4A = T2n - T2w;
Chris@42 1198 T4B = T4z - T4A;
Chris@42 1199 T5J = T4z + T4A;
Chris@42 1200 }
Chris@42 1201 }
/* Inputs k = 31, 23, 15, 7. */
Chris@42 1202 {
Chris@42 1203 E T31, T4H, T3f, T50, T36, T4I, T3a, T4Z;
Chris@42 1204 {
Chris@42 1205 E T2W, T30, T3c, T3e;
Chris@42 1206 T2W = cr[WS(rs, 31)];
Chris@42 1207 T30 = ci[WS(rs, 31)];
Chris@42 1208 T31 = FMA(T2V, T2W, T2Z * T30);
Chris@42 1209 T4H = FNMS(T2Z, T2W, T2V * T30);
Chris@42 1210 T3c = cr[WS(rs, 23)];
Chris@42 1211 T3e = ci[WS(rs, 23)];
Chris@42 1212 T3f = FMA(T3b, T3c, T3d * T3e);
Chris@42 1213 T50 = FNMS(T3d, T3c, T3b * T3e);
Chris@42 1214 }
Chris@42 1215 {
Chris@42 1216 E T33, T35, T38, T39;
Chris@42 1217 T33 = cr[WS(rs, 15)];
Chris@42 1218 T35 = ci[WS(rs, 15)];
Chris@42 1219 T36 = FMA(T32, T33, T34 * T35);
Chris@42 1220 T4I = FNMS(T34, T33, T32 * T35);
Chris@42 1221 T38 = cr[WS(rs, 7)];
Chris@42 1222 T39 = ci[WS(rs, 7)];
Chris@42 1223 T3a = FMA(TR, T38, TS * T39);
Chris@42 1224 T4Z = FNMS(TS, T38, TR * T39);
Chris@42 1225 }
Chris@42 1226 {
Chris@42 1227 E T37, T3g, T6M, T6N;
Chris@42 1228 T37 = T31 + T36;
Chris@42 1229 T3g = T3a + T3f;
Chris@42 1230 T3h = T37 + T3g;
Chris@42 1231 T6H = T37 - T3g;
Chris@42 1232 T6M = T4H + T4I;
Chris@42 1233 T6N = T4Z + T50;
Chris@42 1234 T6O = T6M - T6N;
Chris@42 1235 T7o = T6M + T6N;
Chris@42 1236 }
Chris@42 1237 {
Chris@42 1238 E T4J, T4K, T4Y, T51;
Chris@42 1239 T4J = T4H - T4I;
Chris@42 1240 T4K = T3a - T3f;
Chris@42 1241 T4L = T4J - T4K;
Chris@42 1242 T5Q = T4J + T4K;
Chris@42 1243 T4Y = T31 - T36;
Chris@42 1244 T51 = T4Z - T50;
Chris@42 1245 T52 = T4Y + T51;
Chris@42 1246 T5N = T4Y - T51;
Chris@42 1247 }
Chris@42 1248 }
/* Inputs k = 4, 12, 20, 28. */
Chris@42 1249 {
Chris@42 1250 E TQ, T3H, T1g, T3N, TX, T3I, T17, T3M;
Chris@42 1251 {
Chris@42 1252 E TN, TP, T1b, T1f;
Chris@42 1253 TN = cr[WS(rs, 4)];
Chris@42 1254 TP = ci[WS(rs, 4)];
Chris@42 1255 TQ = FMA(TM, TN, TO * TP);
Chris@42 1256 T3H = FNMS(TO, TN, TM * TP);
Chris@42 1257 T1b = cr[WS(rs, 12)];
Chris@42 1258 T1f = ci[WS(rs, 12)];
Chris@42 1259 T1g = FMA(T1a, T1b, T1e * T1f);
Chris@42 1260 T3N = FNMS(T1e, T1b, T1a * T1f);
Chris@42 1261 }
Chris@42 1262 {
Chris@42 1263 E TU, TW, T12, T16;
Chris@42 1264 TU = cr[WS(rs, 20)];
Chris@42 1265 TW = ci[WS(rs, 20)];
Chris@42 1266 TX = FMA(TT, TU, TV * TW);
Chris@42 1267 T3I = FNMS(TV, TU, TT * TW);
Chris@42 1268 T12 = cr[WS(rs, 28)];
Chris@42 1269 T16 = ci[WS(rs, 28)];
Chris@42 1270 T17 = FMA(T11, T12, T15 * T16);
Chris@42 1271 T3M = FNMS(T15, T12, T11 * T16);
Chris@42 1272 }
Chris@42 1273 {
Chris@42 1274 E TY, T1h, T6g, T6h;
Chris@42 1275 TY = TQ + TX;
Chris@42 1276 T1h = T17 + T1g;
Chris@42 1277 T1i = TY + T1h;
Chris@42 1278 T7V = TY - T1h;
Chris@42 1279 T6g = T3M + T3N;
Chris@42 1280 T6h = T3H + T3I;
Chris@42 1281 T6i = T6g - T6h;
Chris@42 1282 T7D = T6h + T6g;
Chris@42 1283 }
Chris@42 1284 {
Chris@42 1285 E T3G, T3J, T3L, T3O;
Chris@42 1286 T3G = TQ - TX;
Chris@42 1287 T3J = T3H - T3I;
Chris@42 1288 T3K = T3G + T3J;
Chris@42 1289 T5u = T3G - T3J;
Chris@42 1290 T3L = T17 - T1g;
Chris@42 1291 T3O = T3M - T3N;
Chris@42 1292 T3P = T3L - T3O;
Chris@42 1293 T5v = T3L + T3O;
Chris@42 1294 }
Chris@42 1295 }
/* Inputs k = 2, 26, 18, 10. */
Chris@42 1296 {
Chris@42 1297 E T1m, T3X, T1C, T3U, T1r, T3Y, T1x, T3T;
Chris@42 1298 {
Chris@42 1299 E T1k, T1l, T1z, T1B;
Chris@42 1300 T1k = cr[WS(rs, 2)];
Chris@42 1301 T1l = ci[WS(rs, 2)];
Chris@42 1302 T1m = FMA(T8, T1k, Td * T1l);
Chris@42 1303 T3X = FNMS(Td, T1k, T8 * T1l);
Chris@42 1304 T1z = cr[WS(rs, 26)];
Chris@42 1305 T1B = ci[WS(rs, 26)];
Chris@42 1306 T1C = FMA(T1y, T1z, T1A * T1B);
Chris@42 1307 T3U = FNMS(T1A, T1z, T1y * T1B);
Chris@42 1308 }
Chris@42 1309 {
Chris@42 1310 E T1o, T1q, T1u, T1w;
Chris@42 1311 T1o = cr[WS(rs, 18)];
Chris@42 1312 T1q = ci[WS(rs, 18)];
Chris@42 1313 T1r = FMA(T1n, T1o, T1p * T1q);
Chris@42 1314 T3Y = FNMS(T1p, T1o, T1n * T1q);
Chris@42 1315 T1u = cr[WS(rs, 10)];
Chris@42 1316 T1w = ci[WS(rs, 10)];
Chris@42 1317 T1x = FMA(T1t, T1u, T1v * T1w);
Chris@42 1318 T3T = FNMS(T1v, T1u, T1t * T1w);
Chris@42 1319 }
Chris@42 1320 {
Chris@42 1321 E T1s, T1D, T6l, T6m;
Chris@42 1322 T1s = T1m + T1r;
Chris@42 1323 T1D = T1x + T1C;
Chris@42 1324 T1E = T1s + T1D;
Chris@42 1325 T6k = T1s - T1D;
Chris@42 1326 T6l = T3X + T3Y;
Chris@42 1327 T6m = T3T + T3U;
Chris@42 1328 T6n = T6l - T6m;
Chris@42 1329 T7f = T6l + T6m;
Chris@42 1330 }
Chris@42 1331 {
Chris@42 1332 E T3S, T3V, T3Z, T40;
Chris@42 1333 T3S = T1m - T1r;
Chris@42 1334 T3V = T3T - T3U;
Chris@42 1335 T3W = T3S + T3V;
Chris@42 1336 T5z = T3S - T3V;
Chris@42 1337 T3Z = T3X - T3Y;
Chris@42 1338 T40 = T1x - T1C;
Chris@42 1339 T41 = T3Z - T40;
Chris@42 1340 T5y = T3Z + T40;
Chris@42 1341 }
Chris@42 1342 }
/* Inputs k = 30, 22, 14, 6. */
Chris@42 1343 {
Chris@42 1344 E T1J, T43, T27, T4a, T1U, T44, T20, T49;
Chris@42 1345 {
Chris@42 1346 E T1G, T1I, T24, T26;
Chris@42 1347 T1G = cr[WS(rs, 30)];
Chris@42 1348 T1I = ci[WS(rs, 30)];
Chris@42 1349 T1J = FMA(T1F, T1G, T1H * T1I);
Chris@42 1350 T43 = FNMS(T1H, T1G, T1F * T1I);
Chris@42 1351 T24 = cr[WS(rs, 22)];
Chris@42 1352 T26 = ci[WS(rs, 22)];
Chris@42 1353 T27 = FMA(T23, T24, T25 * T26);
Chris@42 1354 T4a = FNMS(T25, T24, T23 * T26);
Chris@42 1355 }
Chris@42 1356 {
Chris@42 1357 E T1R, T1T, T1X, T1Z;
Chris@42 1358 T1R = cr[WS(rs, 14)];
Chris@42 1359 T1T = ci[WS(rs, 14)];
Chris@42 1360 T1U = FMA(T1Q, T1R, T1S * T1T);
Chris@42 1361 T44 = FNMS(T1S, T1R, T1Q * T1T);
Chris@42 1362 T1X = cr[WS(rs, 6)];
Chris@42 1363 T1Z = ci[WS(rs, 6)];
Chris@42 1364 T20 = FMA(T1W, T1X, T1Y * T1Z);
Chris@42 1365 T49 = FNMS(T1Y, T1X, T1W * T1Z);
Chris@42 1366 }
Chris@42 1367 {
Chris@42 1368 E T1V, T28, T6q, T6r;
Chris@42 1369 T1V = T1J + T1U;
Chris@42 1370 T28 = T20 + T27;
Chris@42 1371 T29 = T1V + T28;
Chris@42 1372 T6p = T1V - T28;
Chris@42 1373 T6q = T43 + T44;
Chris@42 1374 T6r = T49 + T4a;
Chris@42 1375 T6s = T6q - T6r;
Chris@42 1376 T7e = T6q + T6r;
Chris@42 1377 }
Chris@42 1378 {
Chris@42 1379 E T45, T46, T48, T4b;
Chris@42 1380 T45 = T43 - T44;
Chris@42 1381 T46 = T20 - T27;
Chris@42 1382 T47 = T45 - T46;
Chris@42 1383 T5C = T45 + T46;
Chris@42 1384 T48 = T1J - T1U;
Chris@42 1385 T4b = T49 - T4a;
Chris@42 1386 T4c = T48 + T4b;
Chris@42 1387 T5B = T48 - T4b;
Chris@42 1388 }
Chris@42 1389 }
/* Inputs k = 5, 21, 29, 13; four results are pre-scaled by KP707106781. */
Chris@42 1390 {
Chris@42 1391 E T2B, T4m, T2G, T4n, T4l, T4o, T2M, T4q, T2P, T4r, T4s, T4t;
Chris@42 1392 {
Chris@42 1393 E T2z, T2A, T2D, T2F;
Chris@42 1394 T2z = cr[WS(rs, 5)];
Chris@42 1395 T2A = ci[WS(rs, 5)];
Chris@42 1396 T2B = FMA(T21, T2z, T22 * T2A);
Chris@42 1397 T4m = FNMS(T22, T2z, T21 * T2A);
Chris@42 1398 T2D = cr[WS(rs, 21)];
Chris@42 1399 T2F = ci[WS(rs, 21)];
Chris@42 1400 T2G = FMA(T2C, T2D, T2E * T2F);
Chris@42 1401 T4n = FNMS(T2E, T2D, T2C * T2F);
Chris@42 1402 }
Chris@42 1403 T4l = T2B - T2G;
Chris@42 1404 T4o = T4m - T4n;
Chris@42 1405 {
Chris@42 1406 E T2J, T2L, T2N, T2O;
Chris@42 1407 T2J = cr[WS(rs, 29)];
Chris@42 1408 T2L = ci[WS(rs, 29)];
Chris@42 1409 T2M = FMA(T2I, T2J, T2K * T2L);
Chris@42 1410 T4q = FNMS(T2K, T2J, T2I * T2L);
Chris@42 1411 T2N = cr[WS(rs, 13)];
Chris@42 1412 T2O = ci[WS(rs, 13)];
Chris@42 1413 T2P = FMA(T1M, T2N, T1P * T2O);
Chris@42 1414 T4r = FNMS(T1P, T2N, T1M * T2O);
Chris@42 1415 }
Chris@42 1416 T4s = T4q - T4r;
Chris@42 1417 T4t = T2M - T2P;
Chris@42 1418 {
Chris@42 1419 E T2H, T2Q, T6C, T6D;
Chris@42 1420 T2H = T2B + T2G;
Chris@42 1421 T2Q = T2M + T2P;
Chris@42 1422 T2R = T2H + T2Q;
Chris@42 1423 T6z = T2H - T2Q;
Chris@42 1424 T6C = T4q + T4r;
Chris@42 1425 T6D = T4m + T4n;
Chris@42 1426 T6E = T6C - T6D;
Chris@42 1427 T7k = T6D + T6C;
Chris@42 1428 }
Chris@42 1429 {
Chris@42 1430 E T4p, T4u, T4C, T4D;
Chris@42 1431 T4p = T4l + T4o;
Chris@42 1432 T4u = T4s - T4t;
Chris@42 1433 T4v = KP707106781 * (T4p - T4u);
Chris@42 1434 T5K = KP707106781 * (T4p + T4u);
Chris@42 1435 T4C = T4t + T4s;
Chris@42 1436 T4D = T4l - T4o;
Chris@42 1437 T4E = KP707106781 * (T4C - T4D);
Chris@42 1438 T5H = KP707106781 * (T4D + T4C);
Chris@42 1439 }
Chris@42 1440 }
/* Inputs k = 3, 19, 27, 11; four results are pre-scaled by KP707106781. */
Chris@42 1441 {
Chris@42 1442 E T3k, T4S, T3p, T4T, T4R, T4U, T3t, T4N, T3w, T4O, T4M, T4P;
Chris@42 1443 {
Chris@42 1444 E T3i, T3j, T3m, T3o;
Chris@42 1445 T3i = cr[WS(rs, 3)];
Chris@42 1446 T3j = ci[WS(rs, 3)];
Chris@42 1447 T3k = FMA(T3, T3i, T6 * T3j);
Chris@42 1448 T4S = FNMS(T6, T3i, T3 * T3j);
Chris@42 1449 T3m = cr[WS(rs, 19)];
Chris@42 1450 T3o = ci[WS(rs, 19)];
Chris@42 1451 T3p = FMA(T3l, T3m, T3n * T3o);
Chris@42 1452 T4T = FNMS(T3n, T3m, T3l * T3o);
Chris@42 1453 }
Chris@42 1454 T4R = T3k - T3p;
Chris@42 1455 T4U = T4S - T4T;
Chris@42 1456 {
Chris@42 1457 E T3r, T3s, T3u, T3v;
Chris@42 1458 T3r = cr[WS(rs, 27)];
Chris@42 1459 T3s = ci[WS(rs, 27)];
Chris@42 1460 T3t = FMA(Th, T3r, Tl * T3s);
Chris@42 1461 T4N = FNMS(Tl, T3r, Th * T3s);
Chris@42 1462 T3u = cr[WS(rs, 11)];
Chris@42 1463 T3v = ci[WS(rs, 11)];
Chris@42 1464 T3w = FMA(Tg, T3u, Tk * T3v);
Chris@42 1465 T4O = FNMS(Tk, T3u, Tg * T3v);
Chris@42 1466 }
Chris@42 1467 T4M = T3t - T3w;
Chris@42 1468 T4P = T4N - T4O;
Chris@42 1469 {
Chris@42 1470 E T3q, T3x, T6I, T6J;
Chris@42 1471 T3q = T3k + T3p;
Chris@42 1472 T3x = T3t + T3w;
Chris@42 1473 T3y = T3q + T3x;
Chris@42 1474 T6P = T3q - T3x;
Chris@42 1475 T6I = T4N + T4O;
Chris@42 1476 T6J = T4S + T4T;
Chris@42 1477 T6K = T6I - T6J;
Chris@42 1478 T7p = T6J + T6I;
Chris@42 1479 }
Chris@42 1480 {
Chris@42 1481 E T4Q, T4V, T53, T54;
Chris@42 1482 T4Q = T4M + T4P;
Chris@42 1483 T4V = T4R - T4U;
Chris@42 1484 T4W = KP707106781 * (T4Q - T4V);
Chris@42 1485 T5O = KP707106781 * (T4V + T4Q);
Chris@42 1486 T53 = T4R + T4U;
Chris@42 1487 T54 = T4P - T4M;
Chris@42 1488 T55 = KP707106781 * (T53 - T54);
Chris@42 1489 T5R = KP707106781 * (T53 + T54);
Chris@42 1490 }
Chris@42 1491 }
/*
 * Store group 1: plain sums and differences of the accumulated totals,
 * written to cr[0], cr[8], cr[16], cr[24] and ci[7], ci[15], ci[23], ci[31].
 * All loads above have completed, so overwriting cr/ci from here on is safe.
 */
Chris@42 1492 {
Chris@42 1493 E T2b, T7x, T7K, T7M, T3A, T7L, T7A, T7B;
Chris@42 1494 {
Chris@42 1495 E T1j, T2a, T7C, T7J;
Chris@42 1496 T1j = TL + T1i;
Chris@42 1497 T2a = T1E + T29;
Chris@42 1498 T2b = T1j + T2a;
Chris@42 1499 T7x = T1j - T2a;
Chris@42 1500 T7C = T7f + T7e;
Chris@42 1501 T7J = T7D + T7I;
Chris@42 1502 T7K = T7C + T7J;
Chris@42 1503 T7M = T7J - T7C;
Chris@42 1504 }
Chris@42 1505 {
Chris@42 1506 E T2S, T3z, T7y, T7z;
Chris@42 1507 T2S = T2y + T2R;
Chris@42 1508 T3z = T3h + T3y;
Chris@42 1509 T3A = T2S + T3z;
Chris@42 1510 T7L = T3z - T2S;
Chris@42 1511 T7y = T7o + T7p;
Chris@42 1512 T7z = T7j + T7k;
Chris@42 1513 T7A = T7y - T7z;
Chris@42 1514 T7B = T7z + T7y;
Chris@42 1515 }
Chris@42 1516 ci[WS(rs, 15)] = T2b - T3A;
Chris@42 1517 cr[WS(rs, 24)] = T7L - T7M;
Chris@42 1518 ci[WS(rs, 23)] = T7L + T7M;
Chris@42 1519 cr[0] = T2b + T3A;
Chris@42 1520 cr[WS(rs, 8)] = T7x - T7A;
Chris@42 1521 cr[WS(rs, 16)] = T7B - T7K;
Chris@42 1522 ci[WS(rs, 31)] = T7B + T7K;
Chris@42 1523 ci[WS(rs, 7)] = T7x + T7A;
Chris@42 1524 }
/*
 * Store group 2: uses all seven constants; writes cr[3, 7, 11, 15, 19, 23,
 * 27, 31] and ci[0, 4, 8, 12, 16, 20, 24, 28].
 */
Chris@42 1525 {
Chris@42 1526 E T5x, T5Z, T8d, T8j, T5E, T88, T69, T6d, T5M, T5W, T62, T8i, T66, T6c, T5T;
Chris@42 1527 E T5X, T5w, T89;
Chris@42 1528 T5w = KP707106781 * (T5u + T5v);
Chris@42 1529 T5x = T5t - T5w;
Chris@42 1530 T5Z = T5t + T5w;
Chris@42 1531 T89 = KP707106781 * (T3K - T3P);
Chris@42 1532 T8d = T89 + T8c;
Chris@42 1533 T8j = T8c - T89;
Chris@42 1534 {
Chris@42 1535 E T5A, T5D, T67, T68;
Chris@42 1536 T5A = FMA(KP923879532, T5y, KP382683432 * T5z);
Chris@42 1537 T5D = FNMS(KP923879532, T5C, KP382683432 * T5B);
Chris@42 1538 T5E = T5A + T5D;
Chris@42 1539 T88 = T5A - T5D;
Chris@42 1540 T67 = T5N + T5O;
Chris@42 1541 T68 = T5Q + T5R;
Chris@42 1542 T69 = FNMS(KP980785280, T68, KP195090322 * T67);
Chris@42 1543 T6d = FMA(KP980785280, T67, KP195090322 * T68);
Chris@42 1544 }
Chris@42 1545 {
Chris@42 1546 E T5I, T5L, T60, T61;
Chris@42 1547 T5I = T5G - T5H;
Chris@42 1548 T5L = T5J - T5K;
Chris@42 1549 T5M = FMA(KP831469612, T5I, KP555570233 * T5L);
Chris@42 1550 T5W = FNMS(KP831469612, T5L, KP555570233 * T5I);
Chris@42 1551 T60 = FNMS(KP382683432, T5y, KP923879532 * T5z);
Chris@42 1552 T61 = FMA(KP382683432, T5C, KP923879532 * T5B);
Chris@42 1553 T62 = T60 + T61;
Chris@42 1554 T8i = T61 - T60;
Chris@42 1555 }
Chris@42 1556 {
Chris@42 1557 E T64, T65, T5P, T5S;
Chris@42 1558 T64 = T5G + T5H;
Chris@42 1559 T65 = T5J + T5K;
Chris@42 1560 T66 = FMA(KP195090322, T64, KP980785280 * T65);
Chris@42 1561 T6c = FNMS(KP195090322, T65, KP980785280 * T64);
Chris@42 1562 T5P = T5N - T5O;
Chris@42 1563 T5S = T5Q - T5R;
Chris@42 1564 T5T = FNMS(KP555570233, T5S, KP831469612 * T5P);
Chris@42 1565 T5X = FMA(KP555570233, T5P, KP831469612 * T5S);
Chris@42 1566 }
Chris@42 1567 {
Chris@42 1568 E T5F, T5U, T8h, T8k;
Chris@42 1569 T5F = T5x + T5E;
Chris@42 1570 T5U = T5M + T5T;
Chris@42 1571 ci[WS(rs, 12)] = T5F - T5U;
Chris@42 1572 cr[WS(rs, 3)] = T5F + T5U;
Chris@42 1573 T8h = T5X - T5W;
Chris@42 1574 T8k = T8i + T8j;
Chris@42 1575 cr[WS(rs, 19)] = T8h - T8k;
Chris@42 1576 ci[WS(rs, 28)] = T8h + T8k;
Chris@42 1577 }
Chris@42 1578 {
Chris@42 1579 E T8l, T8m, T5V, T5Y;
Chris@42 1580 T8l = T5T - T5M;
Chris@42 1581 T8m = T8j - T8i;
Chris@42 1582 cr[WS(rs, 27)] = T8l - T8m;
Chris@42 1583 ci[WS(rs, 20)] = T8l + T8m;
Chris@42 1584 T5V = T5x - T5E;
Chris@42 1585 T5Y = T5W + T5X;
Chris@42 1586 cr[WS(rs, 11)] = T5V - T5Y;
Chris@42 1587 ci[WS(rs, 4)] = T5V + T5Y;
Chris@42 1588 }
Chris@42 1589 {
Chris@42 1590 E T63, T6a, T87, T8e;
Chris@42 1591 T63 = T5Z - T62;
Chris@42 1592 T6a = T66 + T69;
Chris@42 1593 ci[WS(rs, 8)] = T63 - T6a;
Chris@42 1594 cr[WS(rs, 7)] = T63 + T6a;
Chris@42 1595 T87 = T69 - T66;
Chris@42 1596 T8e = T88 + T8d;
Chris@42 1597 cr[WS(rs, 31)] = T87 - T8e;
Chris@42 1598 ci[WS(rs, 16)] = T87 + T8e;
Chris@42 1599 }
Chris@42 1600 {
Chris@42 1601 E T8f, T8g, T6b, T6e;
Chris@42 1602 T8f = T6d - T6c;
Chris@42 1603 T8g = T8d - T88;
Chris@42 1604 cr[WS(rs, 23)] = T8f - T8g;
Chris@42 1605 ci[WS(rs, 24)] = T8f + T8g;
Chris@42 1606 T6b = T5Z + T62;
Chris@42 1607 T6e = T6c + T6d;
Chris@42 1608 cr[WS(rs, 15)] = T6b - T6e;
Chris@42 1609 ci[0] = T6b + T6e;
Chris@42 1610 }
Chris@42 1611 }
/*
 * Store group 3: only KP707106781 is needed; writes cr[4, 12, 20, 28] and
 * ci[3, 11, 19, 27].
 */
Chris@42 1612 {
Chris@42 1613 E T7h, T7t, T7Q, T7S, T7m, T7u, T7r, T7v;
Chris@42 1614 {
Chris@42 1615 E T7d, T7g, T7O, T7P;
Chris@42 1616 T7d = TL - T1i;
Chris@42 1617 T7g = T7e - T7f;
Chris@42 1618 T7h = T7d - T7g;
Chris@42 1619 T7t = T7d + T7g;
Chris@42 1620 T7O = T1E - T29;
Chris@42 1621 T7P = T7I - T7D;
Chris@42 1622 T7Q = T7O + T7P;
Chris@42 1623 T7S = T7P - T7O;
Chris@42 1624 }
Chris@42 1625 {
Chris@42 1626 E T7i, T7l, T7n, T7q;
Chris@42 1627 T7i = T2y - T2R;
Chris@42 1628 T7l = T7j - T7k;
Chris@42 1629 T7m = T7i + T7l;
Chris@42 1630 T7u = T7i - T7l;
Chris@42 1631 T7n = T3h - T3y;
Chris@42 1632 T7q = T7o - T7p;
Chris@42 1633 T7r = T7n - T7q;
Chris@42 1634 T7v = T7n + T7q;
Chris@42 1635 }
Chris@42 1636 {
Chris@42 1637 E T7s, T7R, T7w, T7N;
Chris@42 1638 T7s = KP707106781 * (T7m + T7r);
Chris@42 1639 ci[WS(rs, 11)] = T7h - T7s;
Chris@42 1640 cr[WS(rs, 4)] = T7h + T7s;
Chris@42 1641 T7R = KP707106781 * (T7v - T7u);
Chris@42 1642 cr[WS(rs, 20)] = T7R - T7S;
Chris@42 1643 ci[WS(rs, 27)] = T7R + T7S;
Chris@42 1644 T7w = KP707106781 * (T7u + T7v);
Chris@42 1645 cr[WS(rs, 12)] = T7t - T7w;
Chris@42 1646 ci[WS(rs, 3)] = T7t + T7w;
Chris@42 1647 T7N = KP707106781 * (T7r - T7m);
Chris@42 1648 cr[WS(rs, 28)] = T7N - T7Q;
Chris@42 1649 ci[WS(rs, 19)] = T7N + T7Q;
Chris@42 1650 }
Chris@42 1651 }
/*
 * Store group 4: uses KP707106781, KP382683432 and KP923879532; writes
 * cr[2, 6, 10, 14, 18, 22, 26, 30] and ci[1, 5, 9, 13, 17, 21, 25, 29].
 */
Chris@42 1652 {
Chris@42 1653 E T6j, T7X, T83, T6X, T6u, T7U, T77, T7b, T70, T82, T6G, T6U, T74, T7a, T6R;
Chris@42 1654 E T6V;
Chris@42 1655 {
Chris@42 1656 E T6o, T6t, T6A, T6F;
Chris@42 1657 T6j = T6f - T6i;
Chris@42 1658 T7X = T7V + T7W;
Chris@42 1659 T83 = T7W - T7V;
Chris@42 1660 T6X = T6f + T6i;
Chris@42 1661 T6o = T6k + T6n;
Chris@42 1662 T6t = T6p - T6s;
Chris@42 1663 T6u = KP707106781 * (T6o + T6t);
Chris@42 1664 T7U = KP707106781 * (T6o - T6t);
Chris@42 1665 {
Chris@42 1666 E T75, T76, T6Y, T6Z;
Chris@42 1667 T75 = T6O + T6P;
Chris@42 1668 T76 = T6H + T6K;
Chris@42 1669 T77 = FMA(KP382683432, T75, KP923879532 * T76);
Chris@42 1670 T7b = FNMS(KP923879532, T75, KP382683432 * T76);
Chris@42 1671 T6Y = T6k - T6n;
Chris@42 1672 T6Z = T6p + T6s;
Chris@42 1673 T70 = KP707106781 * (T6Y + T6Z);
Chris@42 1674 T82 = KP707106781 * (T6Z - T6Y);
Chris@42 1675 }
Chris@42 1676 T6A = T6y - T6z;
Chris@42 1677 T6F = T6B - T6E;
Chris@42 1678 T6G = FMA(KP382683432, T6A, KP923879532 * T6F);
Chris@42 1679 T6U = FNMS(KP923879532, T6A, KP382683432 * T6F);
Chris@42 1680 {
Chris@42 1681 E T72, T73, T6L, T6Q;
Chris@42 1682 T72 = T6B + T6E;
Chris@42 1683 T73 = T6y + T6z;
Chris@42 1684 T74 = FNMS(KP382683432, T73, KP923879532 * T72);
Chris@42 1685 T7a = FMA(KP923879532, T73, KP382683432 * T72);
Chris@42 1686 T6L = T6H - T6K;
Chris@42 1687 T6Q = T6O - T6P;
Chris@42 1688 T6R = FNMS(KP382683432, T6Q, KP923879532 * T6L);
Chris@42 1689 T6V = FMA(KP923879532, T6Q, KP382683432 * T6L);
Chris@42 1690 }
Chris@42 1691 }
Chris@42 1692 {
Chris@42 1693 E T6v, T6S, T81, T84;
Chris@42 1694 T6v = T6j + T6u;
Chris@42 1695 T6S = T6G + T6R;
Chris@42 1696 ci[WS(rs, 13)] = T6v - T6S;
Chris@42 1697 cr[WS(rs, 2)] = T6v + T6S;
Chris@42 1698 T81 = T6V - T6U;
Chris@42 1699 T84 = T82 + T83;
Chris@42 1700 cr[WS(rs, 18)] = T81 - T84;
Chris@42 1701 ci[WS(rs, 29)] = T81 + T84;
Chris@42 1702 }
Chris@42 1703 {
Chris@42 1704 E T85, T86, T6T, T6W;
Chris@42 1705 T85 = T6R - T6G;
Chris@42 1706 T86 = T83 - T82;
Chris@42 1707 cr[WS(rs, 26)] = T85 - T86;
Chris@42 1708 ci[WS(rs, 21)] = T85 + T86;
Chris@42 1709 T6T = T6j - T6u;
Chris@42 1710 T6W = T6U + T6V;
Chris@42 1711 cr[WS(rs, 10)] = T6T - T6W;
Chris@42 1712 ci[WS(rs, 5)] = T6T + T6W;
Chris@42 1713 }
Chris@42 1714 {
Chris@42 1715 E T71, T78, T7T, T7Y;
Chris@42 1716 T71 = T6X + T70;
Chris@42 1717 T78 = T74 + T77;
Chris@42 1718 cr[WS(rs, 14)] = T71 - T78;
Chris@42 1719 ci[WS(rs, 1)] = T71 + T78;
Chris@42 1720 T7T = T7b - T7a;
Chris@42 1721 T7Y = T7U + T7X;
Chris@42 1722 cr[WS(rs, 30)] = T7T - T7Y;
Chris@42 1723 ci[WS(rs, 17)] = T7T + T7Y;
Chris@42 1724 }
Chris@42 1725 {
Chris@42 1726 E T7Z, T80, T79, T7c;
Chris@42 1727 T7Z = T77 - T74;
Chris@42 1728 T80 = T7X - T7U;
Chris@42 1729 cr[WS(rs, 22)] = T7Z - T80;
Chris@42 1730 ci[WS(rs, 25)] = T7Z + T80;
Chris@42 1731 T79 = T6X - T70;
Chris@42 1732 T7c = T7a + T7b;
Chris@42 1733 ci[WS(rs, 9)] = T79 - T7c;
Chris@42 1734 cr[WS(rs, 6)] = T79 + T7c;
Chris@42 1735 }
Chris@42 1736 }
/*
 * Store group 5: uses all seven constants; writes cr[1, 5, 9, 13, 17, 21,
 * 25, 29] and ci[2, 6, 10, 14, 18, 22, 26, 30].
 */
Chris@42 1737 {
Chris@42 1738 E T3R, T5d, T8r, T8x, T4e, T8o, T5n, T5r, T4G, T5a, T5g, T8w, T5k, T5q, T57;
Chris@42 1739 E T5b, T3Q, T8p;
Chris@42 1740 T3Q = KP707106781 * (T3K + T3P);
Chris@42 1741 T3R = T3F - T3Q;
Chris@42 1742 T5d = T3F + T3Q;
Chris@42 1743 T8p = KP707106781 * (T5v - T5u);
Chris@42 1744 T8r = T8p + T8q;
Chris@42 1745 T8x = T8q - T8p;
Chris@42 1746 {
Chris@42 1747 E T42, T4d, T5l, T5m;
Chris@42 1748 T42 = FNMS(KP923879532, T41, KP382683432 * T3W);
Chris@42 1749 T4d = FMA(KP923879532, T47, KP382683432 * T4c);
Chris@42 1750 T4e = T42 + T4d;
Chris@42 1751 T8o = T4d - T42;
Chris@42 1752 T5l = T52 + T55;
Chris@42 1753 T5m = T4L + T4W;
Chris@42 1754 T5n = FNMS(KP195090322, T5m, KP980785280 * T5l);
Chris@42 1755 T5r = FMA(KP980785280, T5m, KP195090322 * T5l);
Chris@42 1756 }
Chris@42 1757 {
Chris@42 1758 E T4w, T4F, T5e, T5f;
Chris@42 1759 T4w = T4k - T4v;
Chris@42 1760 T4F = T4B - T4E;
Chris@42 1761 T4G = FNMS(KP555570233, T4F, KP831469612 * T4w);
Chris@42 1762 T5a = FMA(KP831469612, T4F, KP555570233 * T4w);
Chris@42 1763 T5e = FMA(KP382683432, T41, KP923879532 * T3W);
Chris@42 1764 T5f = FNMS(KP382683432, T47, KP923879532 * T4c);
Chris@42 1765 T5g = T5e + T5f;
Chris@42 1766 T8w = T5e - T5f;
Chris@42 1767 }
Chris@42 1768 {
Chris@42 1769 E T5i, T5j, T4X, T56;
Chris@42 1770 T5i = T4B + T4E;
Chris@42 1771 T5j = T4k + T4v;
Chris@42 1772 T5k = FMA(KP195090322, T5i, KP980785280 * T5j);
Chris@42 1773 T5q = FNMS(KP980785280, T5i, KP195090322 * T5j);
Chris@42 1774 T4X = T4L - T4W;
Chris@42 1775 T56 = T52 - T55;
Chris@42 1776 T57 = FMA(KP555570233, T4X, KP831469612 * T56);
Chris@42 1777 T5b = FNMS(KP831469612, T4X, KP555570233 * T56);
Chris@42 1778 }
Chris@42 1779 {
Chris@42 1780 E T4f, T58, T8v, T8y;
Chris@42 1781 T4f = T3R + T4e;
Chris@42 1782 T58 = T4G + T57;
Chris@42 1783 cr[WS(rs, 13)] = T4f - T58;
Chris@42 1784 ci[WS(rs, 2)] = T4f + T58;
Chris@42 1785 T8v = T5b - T5a;
Chris@42 1786 T8y = T8w + T8x;
Chris@42 1787 cr[WS(rs, 29)] = T8v - T8y;
Chris@42 1788 ci[WS(rs, 18)] = T8v + T8y;
Chris@42 1789 }
Chris@42 1790 {
Chris@42 1791 E T8z, T8A, T59, T5c;
Chris@42 1792 T8z = T57 - T4G;
Chris@42 1793 T8A = T8x - T8w;
Chris@42 1794 cr[WS(rs, 21)] = T8z - T8A;
Chris@42 1795 ci[WS(rs, 26)] = T8z + T8A;
Chris@42 1796 T59 = T3R - T4e;
Chris@42 1797 T5c = T5a + T5b;
Chris@42 1798 ci[WS(rs, 10)] = T59 - T5c;
Chris@42 1799 cr[WS(rs, 5)] = T59 + T5c;
Chris@42 1800 }
Chris@42 1801 {
Chris@42 1802 E T5h, T5o, T8n, T8s;
Chris@42 1803 T5h = T5d + T5g;
Chris@42 1804 T5o = T5k + T5n;
Chris@42 1805 ci[WS(rs, 14)] = T5h - T5o;
Chris@42 1806 cr[WS(rs, 1)] = T5h + T5o;
Chris@42 1807 T8n = T5r - T5q;
Chris@42 1808 T8s = T8o + T8r;
Chris@42 1809 cr[WS(rs, 17)] = T8n - T8s;
Chris@42 1810 ci[WS(rs, 30)] = T8n + T8s;
Chris@42 1811 }
Chris@42 1812 {
Chris@42 1813 E T8t, T8u, T5p, T5s;
Chris@42 1814 T8t = T5n - T5k;
Chris@42 1815 T8u = T8r - T8o;
Chris@42 1816 cr[WS(rs, 25)] = T8t - T8u;
Chris@42 1817 ci[WS(rs, 22)] = T8t + T8u;
Chris@42 1818 T5p = T5d - T5g;
Chris@42 1819 T5s = T5q + T5r;
Chris@42 1820 cr[WS(rs, 9)] = T5p - T5s;
Chris@42 1821 ci[WS(rs, 6)] = T5p + T5s;
Chris@42 1822 }
Chris@42 1823 }
Chris@42 1824 }
Chris@42 1825 }
Chris@42 1826 }
Chris@42 1827 }
Chris@42 1828
/*
 * Twiddle instruction table for the build without HAVE_FMA; it is passed to
 * the codelet descriptor `desc` below.  It holds four TW_CEXP entries whose
 * last fields are 1, 3, 9 and 27, followed by one TW_NEXT entry.  Four
 * entries are consistent with hf2_32 reading eight W values (four pairs)
 * per loop iteration.
 * NOTE(review): tw_instr, TW_CEXP and TW_NEXT are declared outside this
 * file; presumably the TW_CEXP entries name the stored twiddle powers and
 * TW_NEXT closes the list -- confirm against the tw_instr definition.
 */
Chris@42 1829 static const tw_instr twinstr[] = {
Chris@42 1830 {TW_CEXP, 1, 1},
Chris@42 1831 {TW_CEXP, 1, 3},
Chris@42 1832 {TW_CEXP, 1, 9},
Chris@42 1833 {TW_CEXP, 1, 27},
Chris@42 1834 {TW_NEXT, 1, 0}
Chris@42 1835 };
Chris@42 1836
/*
 * Codelet descriptor for the build without HAVE_FMA: the value 32, the name
 * string "hf2_32", the twiddle table, &GENUS, and the counts
 * {376, 168, 112, 0}.  The first three counts equal the "376 additions,
 * 168 multiplications, 112 fused multiply/add" figures in this branch's
 * generated header comment.  NOTE(review): the meaning of the leading 32
 * (presumably the transform size, cf. "-n 32") and of the fourth count is
 * defined by hc2hc_desc, which is not visible here -- confirm.
 */
Chris@42 1837 static const hc2hc_desc desc = { 32, "hf2_32", twinstr, &GENUS, {376, 168, 112, 0} };
Chris@42 1838
/*
 * Registration entry point (build without HAVE_FMA): hands the hf2_32
 * kernel and its descriptor to X(khc2hc_register) for the planner `p`.
 */
Chris@42 1839 void X(codelet_hf2_32) (planner *p) {
Chris@42 1840 X(khc2hc_register) (p, hf2_32, &desc);
Chris@42 1841 }
Chris@42 1842 #endif /* HAVE_FMA */