annotate src/fftw-3.3.5/rdft/scalar/r2cf/hf_32.c @ 83:ae30d91d2ffe

Replace these with versions built using an older toolset (so as to avoid ABI incompatibilities when linking on Ubuntu 14.04 for packaging purposes)
author Chris Cannam
date Fri, 07 Feb 2020 11:51:13 +0000
parents 2cd0e3b3e1fd
children
rev   line source
Chris@42 1 /*
Chris@42 2 * Copyright (c) 2003, 2007-14 Matteo Frigo
Chris@42 3 * Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
Chris@42 4 *
Chris@42 5 * This program is free software; you can redistribute it and/or modify
Chris@42 6 * it under the terms of the GNU General Public License as published by
Chris@42 7 * the Free Software Foundation; either version 2 of the License, or
Chris@42 8 * (at your option) any later version.
Chris@42 9 *
Chris@42 10 * This program is distributed in the hope that it will be useful,
Chris@42 11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
Chris@42 12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
Chris@42 13 * GNU General Public License for more details.
Chris@42 14 *
Chris@42 15 * You should have received a copy of the GNU General Public License
Chris@42 16 * along with this program; if not, write to the Free Software
Chris@42 17 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
Chris@42 18 *
Chris@42 19 */
Chris@42 20
Chris@42 21 /* This file was automatically generated --- DO NOT EDIT */
Chris@42 22 /* Generated on Sat Jul 30 16:46:27 EDT 2016 */
Chris@42 23
Chris@42 24 #include "codelet-rdft.h"
Chris@42 25
Chris@42 26 #ifdef HAVE_FMA
Chris@42 27
Chris@42 28 /* Generated by: ../../../genfft/gen_hc2hc.native -fma -reorder-insns -schedule-for-pipeline -compact -variables 4 -pipeline-latency 4 -n 32 -dit -name hf_32 -include hf.h */
Chris@42 29
Chris@42 30 /*
Chris@42 31 * This function contains 434 FP additions, 260 FP multiplications,
Chris@42 32 * (or, 236 additions, 62 multiplications, 198 fused multiply/add),
Chris@42 33 * 135 stack variables, 7 constants, and 128 memory accesses
Chris@42 34 */
Chris@42 35 #include "hf.h"
Chris@42 36
/*
 * hf_32 (HAVE_FMA variant): genfft-generated twiddle codelet, produced by
 * gen_hc2hc with "-n 32 -dit" (see the "Generated by" comment above).
 *
 * Parameters, as used by the code below:
 *   cr, ci - data arrays, read and then overwritten in place at element
 *            offsets WS(rs, 0) .. WS(rs, 31).
 *   W      - twiddle table; each iteration reads W[0] .. W[61].  The pointer
 *            is first advanced by (mb - 1) * 62 and then by 62 per iteration.
 *   rs     - stride passed to WS() to address the 32 points.
 *   mb, me - half-open iteration range [mb, me) of the loop counter m.
 *   ms     - per-iteration step: cr moves forward by ms, ci moves backward
 *            by ms.
 *
 * For k = 1 .. 31 the pair (cr[WS(rs, k)], ci[WS(rs, k)]) is first combined
 * with the twiddle pair (W[2k - 2], W[2k - 1]); the point at offset 0 is used
 * untwiddled.  NOTE(review): FMA/FNMS/FMS, DK, E, R, INT, WS and
 * MAKE_VOLATILE_STRIDE are defined outside this file (presumably via
 * codelet-rdft.h / hf.h); their exact semantics should be confirmed there.
 *
 * This code is machine generated and scheduled; do not reorder statements.
 */
Chris@42 37 static void hf_32(R *cr, R *ci, const R *W, stride rs, INT mb, INT me, INT ms)
Chris@42 38 {
Chris@42 39 DK(KP831469612, +0.831469612302545237078788377617905756738560812);
Chris@42 40 DK(KP980785280, +0.980785280403230449126182236134239036973933731);
Chris@42 41 DK(KP923879532, +0.923879532511286756128183189396788286822416626);
Chris@42 42 DK(KP668178637, +0.668178637919298919997757686523080761552472251);
Chris@42 43 DK(KP198912367, +0.198912367379658006911597622644676228597850501);
Chris@42 44 DK(KP414213562, +0.414213562373095048801688724209698078569671875);
Chris@42 45 DK(KP707106781, +0.707106781186547524400844362104849039284835938);
Chris@42 46 {
Chris@42 47 INT m;
/* One pass per m in [mb, me): cr steps forward, ci steps backward, W advances by 62 entries. */
Chris@42 48 for (m = mb, W = W + ((mb - 1) * 62); m < me; m = m + 1, cr = cr + ms, ci = ci - ms, W = W + 62, MAKE_VOLATILE_STRIDE(64, rs)) {
/* T6D and T6A are declared at loop scope because the last two stores of the iteration consume them after the nested blocks close. */
Chris@42 49 E T6D, T6A;
Chris@42 50 {
Chris@42 51 E T8y, T87, T8, T3w, T83, T3B, T8x, Tl, T6G, Tz, T3J, T5T, T6F, TM, T3Q;
Chris@42 52 E T5U, T46, T5X, T7E, T6M, T5Y, T3Z, T6J, T1f, T7D, T6R, T61, T4e, T6O, T1G;
Chris@42 53 E T60, T4l, T54, T6c, T7d, T7N, T32, T76, T6f, T5r, T4v, T65, T72, T7I, T29;
Chris@42 54 E T6V, T68, T4S, T5t, T5b, T7O, T79, T7e, T3t, T5s, T5i, T4H, T2y, T4B, T6X;
Chris@42 55 E T2m, T4w, T4F, T2s;
Chris@42 56 {
Chris@42 57 E T44, T1d, T3X, T6K, T11, T40, T42, T17, T5h, T5c;
Chris@42 58 {
Chris@42 59 E Ta, Td, Tg, T3x, Tb, Tj, Tf, Tc, Ti;
Chris@42 60 {
Chris@42 61 E T1, T86, T3, T6, T2, T5;
Chris@42 62 T1 = cr[0];
Chris@42 63 T86 = ci[0];
Chris@42 64 T3 = cr[WS(rs, 16)];
Chris@42 65 T6 = ci[WS(rs, 16)];
Chris@42 66 T2 = W[30];
Chris@42 67 T5 = W[31];
Chris@42 68 {
Chris@42 69 E T84, T4, T9, T85, T7;
Chris@42 70 Ta = cr[WS(rs, 8)];
Chris@42 71 Td = ci[WS(rs, 8)];
Chris@42 72 T84 = T2 * T6;
Chris@42 73 T4 = T2 * T3;
Chris@42 74 T9 = W[14];
Chris@42 75 Tg = cr[WS(rs, 24)];
Chris@42 76 T85 = FNMS(T5, T3, T84);
Chris@42 77 T7 = FMA(T5, T6, T4);
Chris@42 78 T3x = T9 * Td;
Chris@42 79 Tb = T9 * Ta;
Chris@42 80 T8y = T86 - T85;
Chris@42 81 T87 = T85 + T86;
Chris@42 82 T8 = T1 + T7;
Chris@42 83 T3w = T1 - T7;
Chris@42 84 Tj = ci[WS(rs, 24)];
Chris@42 85 Tf = W[46];
Chris@42 86 }
Chris@42 87 Tc = W[15];
Chris@42 88 Ti = W[47];
Chris@42 89 }
Chris@42 90 {
Chris@42 91 E Tu, Tx, T3F, Ts, Tw, T3G, Tv;
Chris@42 92 {
Chris@42 93 E To, Tr, Tp, T3E, Tq, Tt;
Chris@42 94 {
Chris@42 95 E T3y, Te, T3A, Tk, T3z, Th, Tn;
Chris@42 96 To = cr[WS(rs, 4)];
Chris@42 97 T3z = Tf * Tj;
Chris@42 98 Th = Tf * Tg;
Chris@42 99 T3y = FNMS(Tc, Ta, T3x);
Chris@42 100 Te = FMA(Tc, Td, Tb);
Chris@42 101 T3A = FNMS(Ti, Tg, T3z);
Chris@42 102 Tk = FMA(Ti, Tj, Th);
Chris@42 103 Tr = ci[WS(rs, 4)];
Chris@42 104 Tn = W[6];
Chris@42 105 T83 = T3y + T3A;
Chris@42 106 T3B = T3y - T3A;
Chris@42 107 T8x = Te - Tk;
Chris@42 108 Tl = Te + Tk;
Chris@42 109 Tp = Tn * To;
Chris@42 110 T3E = Tn * Tr;
Chris@42 111 }
Chris@42 112 Tq = W[7];
Chris@42 113 Tu = cr[WS(rs, 20)];
Chris@42 114 Tx = ci[WS(rs, 20)];
Chris@42 115 Tt = W[38];
Chris@42 116 T3F = FNMS(Tq, To, T3E);
Chris@42 117 Ts = FMA(Tq, Tr, Tp);
Chris@42 118 Tw = W[39];
Chris@42 119 T3G = Tt * Tx;
Chris@42 120 Tv = Tt * Tu;
Chris@42 121 }
Chris@42 122 {
Chris@42 123 E T3M, TF, TH, TK, TG, TJ, TE, TD, TC;
Chris@42 124 {
Chris@42 125 E TB, T3H, Ty, TA, T3I, T3D, T3L;
Chris@42 126 TB = cr[WS(rs, 28)];
Chris@42 127 TE = ci[WS(rs, 28)];
Chris@42 128 T3H = FNMS(Tw, Tu, T3G);
Chris@42 129 Ty = FMA(Tw, Tx, Tv);
Chris@42 130 TA = W[54];
Chris@42 131 TD = W[55];
Chris@42 132 T6G = T3F + T3H;
Chris@42 133 T3I = T3F - T3H;
Chris@42 134 Tz = Ts + Ty;
Chris@42 135 T3D = Ts - Ty;
Chris@42 136 T3L = TA * TE;
Chris@42 137 TC = TA * TB;
Chris@42 138 T3J = T3D - T3I;
Chris@42 139 T5T = T3D + T3I;
Chris@42 140 T3M = FNMS(TD, TB, T3L);
Chris@42 141 }
Chris@42 142 TF = FMA(TD, TE, TC);
Chris@42 143 TH = cr[WS(rs, 12)];
Chris@42 144 TK = ci[WS(rs, 12)];
Chris@42 145 TG = W[22];
Chris@42 146 TJ = W[23];
Chris@42 147 {
Chris@42 148 E TU, T3U, T13, T16, T3W, T10, T12, T15, T41, T14;
Chris@42 149 {
Chris@42 150 E T19, T1c, T18, T1b, T3P, T3K;
Chris@42 151 {
Chris@42 152 E TQ, TT, T3N, TI, TP, TS;
Chris@42 153 TQ = cr[WS(rs, 2)];
Chris@42 154 TT = ci[WS(rs, 2)];
Chris@42 155 T3N = TG * TK;
Chris@42 156 TI = TG * TH;
Chris@42 157 TP = W[2];
Chris@42 158 TS = W[3];
Chris@42 159 {
Chris@42 160 E T3O, TL, T3T, TR;
Chris@42 161 T3O = FNMS(TJ, TH, T3N);
Chris@42 162 TL = FMA(TJ, TK, TI);
Chris@42 163 T3T = TP * TT;
Chris@42 164 TR = TP * TQ;
Chris@42 165 T6F = T3M + T3O;
Chris@42 166 T3P = T3M - T3O;
Chris@42 167 TM = TF + TL;
Chris@42 168 T3K = TF - TL;
Chris@42 169 TU = FMA(TS, TT, TR);
Chris@42 170 T3U = FNMS(TS, TQ, T3T);
Chris@42 171 }
Chris@42 172 }
Chris@42 173 T3Q = T3K + T3P;
Chris@42 174 T5U = T3K - T3P;
Chris@42 175 T19 = cr[WS(rs, 26)];
Chris@42 176 T1c = ci[WS(rs, 26)];
Chris@42 177 T18 = W[50];
Chris@42 178 T1b = W[51];
Chris@42 179 {
Chris@42 180 E TW, TZ, TY, T3V, TX, T43, T1a, TV;
Chris@42 181 TW = cr[WS(rs, 18)];
Chris@42 182 TZ = ci[WS(rs, 18)];
Chris@42 183 T43 = T18 * T1c;
Chris@42 184 T1a = T18 * T19;
Chris@42 185 TV = W[34];
Chris@42 186 TY = W[35];
Chris@42 187 T44 = FNMS(T1b, T19, T43);
Chris@42 188 T1d = FMA(T1b, T1c, T1a);
Chris@42 189 T3V = TV * TZ;
Chris@42 190 TX = TV * TW;
Chris@42 191 T13 = cr[WS(rs, 10)];
Chris@42 192 T16 = ci[WS(rs, 10)];
Chris@42 193 T3W = FNMS(TY, TW, T3V);
Chris@42 194 T10 = FMA(TY, TZ, TX);
Chris@42 195 T12 = W[18];
Chris@42 196 T15 = W[19];
Chris@42 197 }
Chris@42 198 }
Chris@42 199 T3X = T3U - T3W;
Chris@42 200 T6K = T3U + T3W;
Chris@42 201 T11 = TU + T10;
Chris@42 202 T40 = TU - T10;
Chris@42 203 T41 = T12 * T16;
Chris@42 204 T14 = T12 * T13;
Chris@42 205 T42 = FNMS(T15, T13, T41);
Chris@42 206 T17 = FMA(T15, T16, T14);
Chris@42 207 }
Chris@42 208 }
Chris@42 209 }
Chris@42 210 }
Chris@42 211 {
Chris@42 212 E T49, T1l, T4j, T1E, T1u, T1x, T1w, T4b, T1r, T4g, T1v;
Chris@42 213 {
Chris@42 214 E T1A, T1D, T1C, T4i, T1B;
Chris@42 215 {
Chris@42 216 E T1h, T1k, T1g, T1j, T48, T1i, T1z;
Chris@42 217 T1h = cr[WS(rs, 30)];
Chris@42 218 T1k = ci[WS(rs, 30)];
Chris@42 219 {
Chris@42 220 E T6L, T45, T1e, T3Y;
Chris@42 221 T6L = T42 + T44;
Chris@42 222 T45 = T42 - T44;
Chris@42 223 T1e = T17 + T1d;
Chris@42 224 T3Y = T17 - T1d;
Chris@42 225 T46 = T40 - T45;
Chris@42 226 T5X = T40 + T45;
Chris@42 227 T7E = T6K + T6L;
Chris@42 228 T6M = T6K - T6L;
Chris@42 229 T5Y = T3X - T3Y;
Chris@42 230 T3Z = T3X + T3Y;
Chris@42 231 T6J = T11 - T1e;
Chris@42 232 T1f = T11 + T1e;
Chris@42 233 T1g = W[58];
Chris@42 234 }
Chris@42 235 T1j = W[59];
Chris@42 236 T1A = cr[WS(rs, 22)];
Chris@42 237 T1D = ci[WS(rs, 22)];
Chris@42 238 T48 = T1g * T1k;
Chris@42 239 T1i = T1g * T1h;
Chris@42 240 T1z = W[42];
Chris@42 241 T1C = W[43];
Chris@42 242 T49 = FNMS(T1j, T1h, T48);
Chris@42 243 T1l = FMA(T1j, T1k, T1i);
Chris@42 244 T4i = T1z * T1D;
Chris@42 245 T1B = T1z * T1A;
Chris@42 246 }
Chris@42 247 {
Chris@42 248 E T1n, T1q, T1m, T1p, T4a, T1o, T1t;
Chris@42 249 T1n = cr[WS(rs, 14)];
Chris@42 250 T1q = ci[WS(rs, 14)];
Chris@42 251 T4j = FNMS(T1C, T1A, T4i);
Chris@42 252 T1E = FMA(T1C, T1D, T1B);
Chris@42 253 T1m = W[26];
Chris@42 254 T1p = W[27];
Chris@42 255 T1u = cr[WS(rs, 6)];
Chris@42 256 T1x = ci[WS(rs, 6)];
Chris@42 257 T4a = T1m * T1q;
Chris@42 258 T1o = T1m * T1n;
Chris@42 259 T1t = W[10];
Chris@42 260 T1w = W[11];
Chris@42 261 T4b = FNMS(T1p, T1n, T4a);
Chris@42 262 T1r = FMA(T1p, T1q, T1o);
Chris@42 263 T4g = T1t * T1x;
Chris@42 264 T1v = T1t * T1u;
Chris@42 265 }
Chris@42 266 }
Chris@42 267 {
Chris@42 268 E T4c, T6P, T1s, T4f, T4h, T1y;
Chris@42 269 T4c = T49 - T4b;
Chris@42 270 T6P = T49 + T4b;
Chris@42 271 T1s = T1l + T1r;
Chris@42 272 T4f = T1l - T1r;
Chris@42 273 T4h = FNMS(T1w, T1u, T4g);
Chris@42 274 T1y = FMA(T1w, T1x, T1v);
Chris@42 275 {
Chris@42 276 E T4k, T6Q, T4d, T1F;
Chris@42 277 T4k = T4h - T4j;
Chris@42 278 T6Q = T4h + T4j;
Chris@42 279 T4d = T1y - T1E;
Chris@42 280 T1F = T1y + T1E;
Chris@42 281 T7D = T6P + T6Q;
Chris@42 282 T6R = T6P - T6Q;
Chris@42 283 T61 = T4c - T4d;
Chris@42 284 T4e = T4c + T4d;
Chris@42 285 T6O = T1s - T1F;
Chris@42 286 T1G = T1s + T1F;
Chris@42 287 T60 = T4f + T4k;
Chris@42 288 T4l = T4f - T4k;
Chris@42 289 }
Chris@42 290 }
Chris@42 291 }
Chris@42 292 {
Chris@42 293 E T5n, T2H, T52, T30, T2Q, T2T, T2S, T5p, T2N, T4Z, T2R;
Chris@42 294 {
Chris@42 295 E T2W, T2Z, T2Y, T51, T2X;
Chris@42 296 {
Chris@42 297 E T2D, T2G, T2C, T2F, T5m, T2E, T2V;
Chris@42 298 T2D = cr[WS(rs, 31)];
Chris@42 299 T2G = ci[WS(rs, 31)];
Chris@42 300 T2C = W[60];
Chris@42 301 T2F = W[61];
Chris@42 302 T2W = cr[WS(rs, 23)];
Chris@42 303 T2Z = ci[WS(rs, 23)];
Chris@42 304 T5m = T2C * T2G;
Chris@42 305 T2E = T2C * T2D;
Chris@42 306 T2V = W[44];
Chris@42 307 T2Y = W[45];
Chris@42 308 T5n = FNMS(T2F, T2D, T5m);
Chris@42 309 T2H = FMA(T2F, T2G, T2E);
Chris@42 310 T51 = T2V * T2Z;
Chris@42 311 T2X = T2V * T2W;
Chris@42 312 }
Chris@42 313 {
Chris@42 314 E T2J, T2M, T2I, T2L, T5o, T2K, T2P;
Chris@42 315 T2J = cr[WS(rs, 15)];
Chris@42 316 T2M = ci[WS(rs, 15)];
Chris@42 317 T52 = FNMS(T2Y, T2W, T51);
Chris@42 318 T30 = FMA(T2Y, T2Z, T2X);
Chris@42 319 T2I = W[28];
Chris@42 320 T2L = W[29];
Chris@42 321 T2Q = cr[WS(rs, 7)];
Chris@42 322 T2T = ci[WS(rs, 7)];
Chris@42 323 T5o = T2I * T2M;
Chris@42 324 T2K = T2I * T2J;
Chris@42 325 T2P = W[12];
Chris@42 326 T2S = W[13];
Chris@42 327 T5p = FNMS(T2L, T2J, T5o);
Chris@42 328 T2N = FMA(T2L, T2M, T2K);
Chris@42 329 T4Z = T2P * T2T;
Chris@42 330 T2R = T2P * T2Q;
Chris@42 331 }
Chris@42 332 }
Chris@42 333 {
Chris@42 334 E T5q, T7b, T2O, T4Y, T50, T2U;
Chris@42 335 T5q = T5n - T5p;
Chris@42 336 T7b = T5n + T5p;
Chris@42 337 T2O = T2H + T2N;
Chris@42 338 T4Y = T2H - T2N;
Chris@42 339 T50 = FNMS(T2S, T2Q, T4Z);
Chris@42 340 T2U = FMA(T2S, T2T, T2R);
Chris@42 341 {
Chris@42 342 E T7c, T53, T31, T5l;
Chris@42 343 T7c = T50 + T52;
Chris@42 344 T53 = T50 - T52;
Chris@42 345 T31 = T2U + T30;
Chris@42 346 T5l = T30 - T2U;
Chris@42 347 T54 = T4Y - T53;
Chris@42 348 T6c = T4Y + T53;
Chris@42 349 T7d = T7b - T7c;
Chris@42 350 T7N = T7b + T7c;
Chris@42 351 T32 = T2O + T31;
Chris@42 352 T76 = T2O - T31;
Chris@42 353 T6f = T5q + T5l;
Chris@42 354 T5r = T5l - T5q;
Chris@42 355 }
Chris@42 356 }
Chris@42 357 }
Chris@42 358 {
Chris@42 359 E T4N, T1O, T4t, T27, T1X, T20, T1Z, T4P, T1U, T4q, T1Y;
Chris@42 360 {
Chris@42 361 E T23, T26, T25, T4s, T24;
Chris@42 362 {
Chris@42 363 E T1K, T1N, T1J, T1M, T4M, T1L, T22;
Chris@42 364 T1K = cr[WS(rs, 1)];
Chris@42 365 T1N = ci[WS(rs, 1)];
Chris@42 366 T1J = W[0];
Chris@42 367 T1M = W[1];
Chris@42 368 T23 = cr[WS(rs, 25)];
Chris@42 369 T26 = ci[WS(rs, 25)];
Chris@42 370 T4M = T1J * T1N;
Chris@42 371 T1L = T1J * T1K;
Chris@42 372 T22 = W[48];
Chris@42 373 T25 = W[49];
Chris@42 374 T4N = FNMS(T1M, T1K, T4M);
Chris@42 375 T1O = FMA(T1M, T1N, T1L);
Chris@42 376 T4s = T22 * T26;
Chris@42 377 T24 = T22 * T23;
Chris@42 378 }
Chris@42 379 {
Chris@42 380 E T1Q, T1T, T1P, T1S, T4O, T1R, T1W;
Chris@42 381 T1Q = cr[WS(rs, 17)];
Chris@42 382 T1T = ci[WS(rs, 17)];
Chris@42 383 T4t = FNMS(T25, T23, T4s);
Chris@42 384 T27 = FMA(T25, T26, T24);
Chris@42 385 T1P = W[32];
Chris@42 386 T1S = W[33];
Chris@42 387 T1X = cr[WS(rs, 9)];
Chris@42 388 T20 = ci[WS(rs, 9)];
Chris@42 389 T4O = T1P * T1T;
Chris@42 390 T1R = T1P * T1Q;
Chris@42 391 T1W = W[16];
Chris@42 392 T1Z = W[17];
Chris@42 393 T4P = FNMS(T1S, T1Q, T4O);
Chris@42 394 T1U = FMA(T1S, T1T, T1R);
Chris@42 395 T4q = T1W * T20;
Chris@42 396 T1Y = T1W * T1X;
Chris@42 397 }
Chris@42 398 }
Chris@42 399 {
Chris@42 400 E T4Q, T70, T1V, T4p, T4r, T21;
Chris@42 401 T4Q = T4N - T4P;
Chris@42 402 T70 = T4N + T4P;
Chris@42 403 T1V = T1O + T1U;
Chris@42 404 T4p = T1O - T1U;
Chris@42 405 T4r = FNMS(T1Z, T1X, T4q);
Chris@42 406 T21 = FMA(T1Z, T20, T1Y);
Chris@42 407 {
Chris@42 408 E T71, T4u, T4R, T28;
Chris@42 409 T71 = T4r + T4t;
Chris@42 410 T4u = T4r - T4t;
Chris@42 411 T4R = T21 - T27;
Chris@42 412 T28 = T21 + T27;
Chris@42 413 T4v = T4p - T4u;
Chris@42 414 T65 = T4p + T4u;
Chris@42 415 T72 = T70 - T71;
Chris@42 416 T7I = T70 + T71;
Chris@42 417 T29 = T1V + T28;
Chris@42 418 T6V = T1V - T28;
Chris@42 419 T68 = T4Q - T4R;
Chris@42 420 T4S = T4Q + T4R;
Chris@42 421 }
Chris@42 422 }
Chris@42 423 }
Chris@42 424 {
Chris@42 425 E T57, T38, T5g, T3r, T3h, T3k, T3j, T59, T3e, T5d, T3i;
Chris@42 426 {
Chris@42 427 E T3n, T3q, T3p, T5f, T3o;
Chris@42 428 {
Chris@42 429 E T34, T37, T33, T36, T56, T35, T3m;
Chris@42 430 T34 = cr[WS(rs, 3)];
Chris@42 431 T37 = ci[WS(rs, 3)];
Chris@42 432 T33 = W[4];
Chris@42 433 T36 = W[5];
Chris@42 434 T3n = cr[WS(rs, 11)];
Chris@42 435 T3q = ci[WS(rs, 11)];
Chris@42 436 T56 = T33 * T37;
Chris@42 437 T35 = T33 * T34;
Chris@42 438 T3m = W[20];
Chris@42 439 T3p = W[21];
Chris@42 440 T57 = FNMS(T36, T34, T56);
Chris@42 441 T38 = FMA(T36, T37, T35);
Chris@42 442 T5f = T3m * T3q;
Chris@42 443 T3o = T3m * T3n;
Chris@42 444 }
Chris@42 445 {
Chris@42 446 E T3a, T3d, T39, T3c, T58, T3b, T3g;
Chris@42 447 T3a = cr[WS(rs, 19)];
Chris@42 448 T3d = ci[WS(rs, 19)];
Chris@42 449 T5g = FNMS(T3p, T3n, T5f);
Chris@42 450 T3r = FMA(T3p, T3q, T3o);
Chris@42 451 T39 = W[36];
Chris@42 452 T3c = W[37];
Chris@42 453 T3h = cr[WS(rs, 27)];
Chris@42 454 T3k = ci[WS(rs, 27)];
Chris@42 455 T58 = T39 * T3d;
Chris@42 456 T3b = T39 * T3a;
Chris@42 457 T3g = W[52];
Chris@42 458 T3j = W[53];
Chris@42 459 T59 = FNMS(T3c, T3a, T58);
Chris@42 460 T3e = FMA(T3c, T3d, T3b);
Chris@42 461 T5d = T3g * T3k;
Chris@42 462 T3i = T3g * T3h;
Chris@42 463 }
Chris@42 464 }
Chris@42 465 {
Chris@42 466 E T5a, T78, T3f, T55, T5e, T3l, T77, T3s;
Chris@42 467 T5a = T57 - T59;
Chris@42 468 T78 = T57 + T59;
Chris@42 469 T3f = T38 + T3e;
Chris@42 470 T55 = T38 - T3e;
Chris@42 471 T5e = FNMS(T3j, T3h, T5d);
Chris@42 472 T3l = FMA(T3j, T3k, T3i);
Chris@42 473 T5h = T5e - T5g;
Chris@42 474 T77 = T5e + T5g;
Chris@42 475 T3s = T3l + T3r;
Chris@42 476 T5c = T3l - T3r;
Chris@42 477 T5t = T55 + T5a;
Chris@42 478 T5b = T55 - T5a;
Chris@42 479 T7O = T78 + T77;
Chris@42 480 T79 = T77 - T78;
Chris@42 481 T7e = T3s - T3f;
Chris@42 482 T3t = T3f + T3s;
Chris@42 483 }
Chris@42 484 }
Chris@42 485 {
Chris@42 486 E T4y, T2f, T2o, T2r, T4A, T2l, T2n, T2q, T4E, T2p;
Chris@42 487 {
Chris@42 488 E T2u, T2x, T2t, T2w;
Chris@42 489 {
Chris@42 490 E T2b, T2e, T2d, T4x, T2c, T2a;
Chris@42 491 T2b = cr[WS(rs, 5)];
Chris@42 492 T2e = ci[WS(rs, 5)];
Chris@42 493 T2a = W[8];
Chris@42 494 T5s = T5c - T5h;
Chris@42 495 T5i = T5c + T5h;
Chris@42 496 T2d = W[9];
Chris@42 497 T4x = T2a * T2e;
Chris@42 498 T2c = T2a * T2b;
Chris@42 499 T2u = cr[WS(rs, 13)];
Chris@42 500 T2x = ci[WS(rs, 13)];
Chris@42 501 T4y = FNMS(T2d, T2b, T4x);
Chris@42 502 T2f = FMA(T2d, T2e, T2c);
Chris@42 503 T2t = W[24];
Chris@42 504 T2w = W[25];
Chris@42 505 }
Chris@42 506 {
Chris@42 507 E T2h, T2k, T2j, T4z, T2i, T4G, T2v, T2g;
Chris@42 508 T2h = cr[WS(rs, 21)];
Chris@42 509 T2k = ci[WS(rs, 21)];
Chris@42 510 T4G = T2t * T2x;
Chris@42 511 T2v = T2t * T2u;
Chris@42 512 T2g = W[40];
Chris@42 513 T2j = W[41];
Chris@42 514 T4H = FNMS(T2w, T2u, T4G);
Chris@42 515 T2y = FMA(T2w, T2x, T2v);
Chris@42 516 T4z = T2g * T2k;
Chris@42 517 T2i = T2g * T2h;
Chris@42 518 T2o = cr[WS(rs, 29)];
Chris@42 519 T2r = ci[WS(rs, 29)];
Chris@42 520 T4A = FNMS(T2j, T2h, T4z);
Chris@42 521 T2l = FMA(T2j, T2k, T2i);
Chris@42 522 T2n = W[56];
Chris@42 523 T2q = W[57];
Chris@42 524 }
Chris@42 525 }
Chris@42 526 T4B = T4y - T4A;
Chris@42 527 T6X = T4y + T4A;
Chris@42 528 T2m = T2f + T2l;
Chris@42 529 T4w = T2f - T2l;
Chris@42 530 T4E = T2n * T2r;
Chris@42 531 T2p = T2n * T2o;
Chris@42 532 T4F = FNMS(T2q, T2o, T4E);
Chris@42 533 T2s = FMA(T2q, T2r, T2p);
Chris@42 534 }
Chris@42 535 }
Chris@42 536 {
Chris@42 537 E T6E, T8j, T6Y, T73, T6H, T8k, T5S, T8O, T8N, T5V, T6g, T6d, T69, T66, T5O;
Chris@42 538 E T5R;
Chris@42 539 {
Chris@42 540 E T4T, T4C, T4J, T4U, T7S, T7V;
Chris@42 541 {
Chris@42 542 E T7C, TO, T80, T7Z, T8e, T89, T8d, T1H, T8b, T3v, T7T, T7L, T7U, T7Q, T2A;
Chris@42 543 E T7P, T7K, T7W, T1I;
Chris@42 544 {
Chris@42 545 E T7X, T7Y, T7J, T82, T88;
Chris@42 546 {
Chris@42 547 E Tm, T4I, T6W, T4D, T2z, TN;
Chris@42 548 T6E = T8 - Tl;
Chris@42 549 Tm = T8 + Tl;
Chris@42 550 T4T = T4w + T4B;
Chris@42 551 T4C = T4w - T4B;
Chris@42 552 T4I = T4F - T4H;
Chris@42 553 T6W = T4F + T4H;
Chris@42 554 T4D = T2s - T2y;
Chris@42 555 T2z = T2s + T2y;
Chris@42 556 TN = Tz + TM;
Chris@42 557 T8j = Tz - TM;
Chris@42 558 T6Y = T6W - T6X;
Chris@42 559 T7J = T6X + T6W;
Chris@42 560 T4J = T4D + T4I;
Chris@42 561 T4U = T4I - T4D;
Chris@42 562 T2A = T2m + T2z;
Chris@42 563 T73 = T2m - T2z;
Chris@42 564 T7C = Tm - TN;
Chris@42 565 TO = Tm + TN;
Chris@42 566 }
Chris@42 567 T7P = T7N - T7O;
Chris@42 568 T7X = T7N + T7O;
Chris@42 569 T7Y = T7I + T7J;
Chris@42 570 T7K = T7I - T7J;
Chris@42 571 T6H = T6F - T6G;
Chris@42 572 T82 = T6G + T6F;
Chris@42 573 T88 = T83 + T87;
Chris@42 574 T8k = T87 - T83;
Chris@42 575 T80 = T7Y + T7X;
Chris@42 576 T7Z = T7X - T7Y;
Chris@42 577 T8e = T88 - T82;
Chris@42 578 T89 = T82 + T88;
Chris@42 579 }
Chris@42 580 {
Chris@42 581 E T7H, T7M, T2B, T3u;
Chris@42 582 T7H = T29 - T2A;
Chris@42 583 T2B = T29 + T2A;
Chris@42 584 T3u = T32 + T3t;
Chris@42 585 T7M = T32 - T3t;
Chris@42 586 T8d = T1f - T1G;
Chris@42 587 T1H = T1f + T1G;
Chris@42 588 T8b = T3u - T2B;
Chris@42 589 T3v = T2B + T3u;
Chris@42 590 T7T = T7H - T7K;
Chris@42 591 T7L = T7H + T7K;
Chris@42 592 T7U = T7M + T7P;
Chris@42 593 T7Q = T7M - T7P;
Chris@42 594 }
Chris@42 595 T7W = TO - T1H;
Chris@42 596 T1I = TO + T1H;
Chris@42 597 {
Chris@42 598 E T8g, T8h, T8f, T8i;
Chris@42 599 {
Chris@42 600 E T7R, T8c, T8a, T7G, T81, T7F;
Chris@42 601 T8g = T7Q - T7L;
Chris@42 602 T7R = T7L + T7Q;
Chris@42 603 T81 = T7E + T7D;
Chris@42 604 T7F = T7D - T7E;
Chris@42 605 cr[0] = T1I + T3v;
Chris@42 606 ci[WS(rs, 15)] = T1I - T3v;
Chris@42 607 ci[WS(rs, 7)] = T7W + T7Z;
Chris@42 608 cr[WS(rs, 8)] = T7W - T7Z;
Chris@42 609 T8c = T89 - T81;
Chris@42 610 T8a = T81 + T89;
Chris@42 611 T7G = T7C - T7F;
Chris@42 612 T7S = T7C + T7F;
Chris@42 613 T8h = T8e - T8d;
Chris@42 614 T8f = T8d + T8e;
Chris@42 615 ci[WS(rs, 23)] = T8b + T8c;
Chris@42 616 cr[WS(rs, 24)] = T8b - T8c;
Chris@42 617 ci[WS(rs, 31)] = T80 + T8a;
Chris@42 618 cr[WS(rs, 16)] = T80 - T8a;
Chris@42 619 cr[WS(rs, 4)] = FMA(KP707106781, T7R, T7G);
Chris@42 620 ci[WS(rs, 11)] = FNMS(KP707106781, T7R, T7G);
Chris@42 621 }
Chris@42 622 T8i = T7U - T7T;
Chris@42 623 T7V = T7T + T7U;
Chris@42 624 ci[WS(rs, 19)] = FMA(KP707106781, T8g, T8f);
Chris@42 625 cr[WS(rs, 28)] = FMS(KP707106781, T8g, T8f);
Chris@42 626 ci[WS(rs, 27)] = FMA(KP707106781, T8i, T8h);
Chris@42 627 cr[WS(rs, 20)] = FMS(KP707106781, T8i, T8h);
Chris@42 628 }
Chris@42 629 }
Chris@42 630 {
Chris@42 631 E T5C, T3S, T8C, T4n, T8H, T8B, T8I, T5F, T4L, T5H, T5M, T5Q, T5A, T5w, T4V;
Chris@42 632 {
Chris@42 633 E T5D, T47, T4m, T5E, T8z, T8A, T3C, T3R, T5j, T5u;
Chris@42 634 T5S = T3w + T3B;
Chris@42 635 T3C = T3w - T3B;
Chris@42 636 T3R = T3J + T3Q;
Chris@42 637 T8O = T3Q - T3J;
Chris@42 638 T5D = FNMS(KP414213562, T3Z, T46);
Chris@42 639 T47 = FMA(KP414213562, T46, T3Z);
Chris@42 640 ci[WS(rs, 3)] = FMA(KP707106781, T7V, T7S);
Chris@42 641 cr[WS(rs, 12)] = FNMS(KP707106781, T7V, T7S);
Chris@42 642 T5C = FMA(KP707106781, T3R, T3C);
Chris@42 643 T3S = FNMS(KP707106781, T3R, T3C);
Chris@42 644 T4m = FNMS(KP414213562, T4l, T4e);
Chris@42 645 T5E = FMA(KP414213562, T4e, T4l);
Chris@42 646 T8N = T8y - T8x;
Chris@42 647 T8z = T8x + T8y;
Chris@42 648 T8A = T5T - T5U;
Chris@42 649 T5V = T5T + T5U;
Chris@42 650 T8C = T47 + T4m;
Chris@42 651 T4n = T47 - T4m;
Chris@42 652 T8H = FNMS(KP707106781, T8A, T8z);
Chris@42 653 T8B = FMA(KP707106781, T8A, T8z);
Chris@42 654 T6g = T5i - T5b;
Chris@42 655 T5j = T5b + T5i;
Chris@42 656 T5u = T5s - T5t;
Chris@42 657 T6d = T5t + T5s;
Chris@42 658 {
Chris@42 659 E T5K, T5k, T5L, T5v, T4K;
Chris@42 660 T69 = T4J - T4C;
Chris@42 661 T4K = T4C + T4J;
Chris@42 662 T8I = T5E - T5D;
Chris@42 663 T5F = T5D + T5E;
Chris@42 664 T5K = FMA(KP707106781, T5j, T54);
Chris@42 665 T5k = FNMS(KP707106781, T5j, T54);
Chris@42 666 T5L = FMA(KP707106781, T5u, T5r);
Chris@42 667 T5v = FNMS(KP707106781, T5u, T5r);
Chris@42 668 T4L = FNMS(KP707106781, T4K, T4v);
Chris@42 669 T5H = FMA(KP707106781, T4K, T4v);
Chris@42 670 T5M = FNMS(KP198912367, T5L, T5K);
Chris@42 671 T5Q = FMA(KP198912367, T5K, T5L);
Chris@42 672 T5A = FNMS(KP668178637, T5k, T5v);
Chris@42 673 T5w = FMA(KP668178637, T5v, T5k);
Chris@42 674 T4V = T4T + T4U;
Chris@42 675 T66 = T4T - T4U;
Chris@42 676 }
Chris@42 677 }
Chris@42 678 {
Chris@42 679 E T5y, T4o, T8J, T8L, T5I, T4W;
Chris@42 680 T5y = FNMS(KP923879532, T4n, T3S);
Chris@42 681 T4o = FMA(KP923879532, T4n, T3S);
Chris@42 682 T8J = FMA(KP923879532, T8I, T8H);
Chris@42 683 T8L = FNMS(KP923879532, T8I, T8H);
Chris@42 684 T5I = FMA(KP707106781, T4V, T4S);
Chris@42 685 T4W = FNMS(KP707106781, T4V, T4S);
Chris@42 686 {
Chris@42 687 E T8G, T8F, T8D, T8E;
Chris@42 688 {
Chris@42 689 E T5G, T5P, T5z, T4X, T5N, T5J;
Chris@42 690 T5O = FNMS(KP923879532, T5F, T5C);
Chris@42 691 T5G = FMA(KP923879532, T5F, T5C);
Chris@42 692 T5J = FNMS(KP198912367, T5I, T5H);
Chris@42 693 T5P = FMA(KP198912367, T5H, T5I);
Chris@42 694 T5z = FNMS(KP668178637, T4L, T4W);
Chris@42 695 T4X = FMA(KP668178637, T4W, T4L);
Chris@42 696 T5N = T5J + T5M;
Chris@42 697 T8G = T5M - T5J;
Chris@42 698 T8F = FNMS(KP923879532, T8C, T8B);
Chris@42 699 T8D = FMA(KP923879532, T8C, T8B);
Chris@42 700 {
Chris@42 701 E T5B, T8K, T8M, T5x;
Chris@42 702 T5B = T5z + T5A;
Chris@42 703 T8K = T5z - T5A;
Chris@42 704 T8M = T5w - T4X;
Chris@42 705 T5x = T4X + T5w;
Chris@42 706 ci[0] = FMA(KP980785280, T5N, T5G);
Chris@42 707 cr[WS(rs, 15)] = FNMS(KP980785280, T5N, T5G);
Chris@42 708 ci[WS(rs, 4)] = FNMS(KP831469612, T5B, T5y);
Chris@42 709 cr[WS(rs, 11)] = FMA(KP831469612, T5B, T5y);
Chris@42 710 ci[WS(rs, 28)] = FMA(KP831469612, T8K, T8J);
Chris@42 711 cr[WS(rs, 19)] = FMS(KP831469612, T8K, T8J);
Chris@42 712 ci[WS(rs, 20)] = FMA(KP831469612, T8M, T8L);
Chris@42 713 cr[WS(rs, 27)] = FMS(KP831469612, T8M, T8L);
Chris@42 714 cr[WS(rs, 3)] = FMA(KP831469612, T5x, T4o);
Chris@42 715 ci[WS(rs, 12)] = FNMS(KP831469612, T5x, T4o);
Chris@42 716 T8E = T5Q - T5P;
Chris@42 717 T5R = T5P + T5Q;
Chris@42 718 }
Chris@42 719 }
Chris@42 720 ci[WS(rs, 16)] = FMA(KP980785280, T8E, T8D);
Chris@42 721 cr[WS(rs, 31)] = FMS(KP980785280, T8E, T8D);
Chris@42 722 ci[WS(rs, 24)] = FMA(KP980785280, T8G, T8F);
Chris@42 723 cr[WS(rs, 23)] = FMS(KP980785280, T8G, T8F);
Chris@42 724 }
Chris@42 725 }
Chris@42 726 }
Chris@42 727 }
Chris@42 728 {
Chris@42 729 E T7y, T8q, T8p, T7B;
Chris@42 730 {
Chris@42 731 E T7a, T7m, T6I, T7f, T7A, T7w, T8r, T8l, T8m, T6T, T7k, T75, T8s, T7p, T7z;
Chris@42 732 E T7t;
Chris@42 733 {
Chris@42 734 E T7n, T6N, T6S, T7o, T7u, T7v;
Chris@42 735 T7a = T76 - T79;
Chris@42 736 T7u = T76 + T79;
Chris@42 737 cr[WS(rs, 7)] = FMA(KP980785280, T5R, T5O);
Chris@42 738 ci[WS(rs, 8)] = FNMS(KP980785280, T5R, T5O);
Chris@42 739 T7m = T6E + T6H;
Chris@42 740 T6I = T6E - T6H;
Chris@42 741 T7v = T7e - T7d;
Chris@42 742 T7f = T7d + T7e;
Chris@42 743 T7n = T6J - T6M;
Chris@42 744 T6N = T6J + T6M;
Chris@42 745 T7A = FMA(KP414213562, T7u, T7v);
Chris@42 746 T7w = FNMS(KP414213562, T7v, T7u);
Chris@42 747 T8r = T8k - T8j;
Chris@42 748 T8l = T8j + T8k;
Chris@42 749 T6S = T6O - T6R;
Chris@42 750 T7o = T6O + T6R;
Chris@42 751 {
Chris@42 752 E T7r, T7s, T6Z, T74;
Chris@42 753 T7r = T6V + T6Y;
Chris@42 754 T6Z = T6V - T6Y;
Chris@42 755 T74 = T72 - T73;
Chris@42 756 T7s = T72 + T73;
Chris@42 757 T8m = T6N - T6S;
Chris@42 758 T6T = T6N + T6S;
Chris@42 759 T7k = FNMS(KP414213562, T6Z, T74);
Chris@42 760 T75 = FMA(KP414213562, T74, T6Z);
Chris@42 761 T8s = T7o - T7n;
Chris@42 762 T7p = T7n + T7o;
Chris@42 763 T7z = FMA(KP414213562, T7r, T7s);
Chris@42 764 T7t = FNMS(KP414213562, T7s, T7r);
Chris@42 765 }
Chris@42 766 }
Chris@42 767 {
Chris@42 768 E T7i, T6U, T8t, T8v, T7j, T7g;
Chris@42 769 T7i = FNMS(KP707106781, T6T, T6I);
Chris@42 770 T6U = FMA(KP707106781, T6T, T6I);
Chris@42 771 T8t = FMA(KP707106781, T8s, T8r);
Chris@42 772 T8v = FNMS(KP707106781, T8s, T8r);
Chris@42 773 T7j = FMA(KP414213562, T7a, T7f);
Chris@42 774 T7g = FNMS(KP414213562, T7f, T7a);
Chris@42 775 {
Chris@42 776 E T7q, T7x, T8n, T8o;
Chris@42 777 T7y = FNMS(KP707106781, T7p, T7m);
Chris@42 778 T7q = FMA(KP707106781, T7p, T7m);
Chris@42 779 {
Chris@42 780 E T7l, T8u, T8w, T7h;
Chris@42 781 T7l = T7j - T7k;
Chris@42 782 T8u = T7k + T7j;
Chris@42 783 T8w = T7g - T75;
Chris@42 784 T7h = T75 + T7g;
Chris@42 785 ci[WS(rs, 5)] = FMA(KP923879532, T7l, T7i);
Chris@42 786 cr[WS(rs, 10)] = FNMS(KP923879532, T7l, T7i);
Chris@42 787 ci[WS(rs, 29)] = FMA(KP923879532, T8u, T8t);
Chris@42 788 cr[WS(rs, 18)] = FMS(KP923879532, T8u, T8t);
Chris@42 789 ci[WS(rs, 21)] = FMA(KP923879532, T8w, T8v);
Chris@42 790 cr[WS(rs, 26)] = FMS(KP923879532, T8w, T8v);
Chris@42 791 cr[WS(rs, 2)] = FMA(KP923879532, T7h, T6U);
Chris@42 792 ci[WS(rs, 13)] = FNMS(KP923879532, T7h, T6U);
Chris@42 793 T7x = T7t + T7w;
Chris@42 794 T8q = T7w - T7t;
Chris@42 795 }
Chris@42 796 T8p = FNMS(KP707106781, T8m, T8l);
Chris@42 797 T8n = FMA(KP707106781, T8m, T8l);
Chris@42 798 T8o = T7A - T7z;
Chris@42 799 T7B = T7z + T7A;
Chris@42 800 ci[WS(rs, 1)] = FMA(KP923879532, T7x, T7q);
Chris@42 801 cr[WS(rs, 14)] = FNMS(KP923879532, T7x, T7q);
Chris@42 802 ci[WS(rs, 17)] = FMA(KP923879532, T8o, T8n);
Chris@42 803 cr[WS(rs, 30)] = FMS(KP923879532, T8o, T8n);
Chris@42 804 }
Chris@42 805 }
Chris@42 806 }
Chris@42 807 {
Chris@42 808 E T6o, T5W, T8W, T63, T8V, T8P, T8Q, T6r, T6e, T6w;
Chris@42 809 {
Chris@42 810 E T6q, T6p, T5Z, T62;
Chris@42 811 ci[WS(rs, 25)] = FMA(KP923879532, T8q, T8p);
Chris@42 812 cr[WS(rs, 22)] = FMS(KP923879532, T8q, T8p);
Chris@42 813 cr[WS(rs, 6)] = FMA(KP923879532, T7B, T7y);
Chris@42 814 ci[WS(rs, 9)] = FNMS(KP923879532, T7B, T7y);
Chris@42 815 T6q = FNMS(KP414213562, T5X, T5Y);
Chris@42 816 T5Z = FMA(KP414213562, T5Y, T5X);
Chris@42 817 T62 = FNMS(KP414213562, T61, T60);
Chris@42 818 T6p = FMA(KP414213562, T60, T61);
Chris@42 819 T6o = FNMS(KP707106781, T5V, T5S);
Chris@42 820 T5W = FMA(KP707106781, T5V, T5S);
Chris@42 821 T8W = T5Z - T62;
Chris@42 822 T63 = T5Z + T62;
Chris@42 823 T8V = FNMS(KP707106781, T8O, T8N);
Chris@42 824 T8P = FMA(KP707106781, T8O, T8N);
Chris@42 825 T8Q = T6q + T6p;
Chris@42 826 T6r = T6p - T6q;
Chris@42 827 T6e = FMA(KP707106781, T6d, T6c);
Chris@42 828 T6w = FNMS(KP707106781, T6d, T6c);
Chris@42 829 }
Chris@42 830 {
Chris@42 831 E T6k, T8U, T6z, T6n, T8S, T8T, T8R, T6s;
Chris@42 832 {
Chris@42 833 E T64, T6y, T6l, T6i, T6v, T6m, T6b, T8X, T8Z, T8Y, T6j, T90;
Chris@42 834 {
Chris@42 835 E T6C, T6B, T6x, T6h;
Chris@42 836 T6k = FNMS(KP923879532, T63, T5W);
Chris@42 837 T64 = FMA(KP923879532, T63, T5W);
Chris@42 838 T6x = FNMS(KP707106781, T6g, T6f);
Chris@42 839 T6h = FMA(KP707106781, T6g, T6f);
Chris@42 840 {
Chris@42 841 E T6t, T67, T6u, T6a;
Chris@42 842 T6t = FNMS(KP707106781, T66, T65);
Chris@42 843 T67 = FMA(KP707106781, T66, T65);
Chris@42 844 T6u = FNMS(KP707106781, T69, T68);
Chris@42 845 T6a = FMA(KP707106781, T69, T68);
Chris@42 846 T6y = FMA(KP668178637, T6x, T6w);
Chris@42 847 T6C = FNMS(KP668178637, T6w, T6x);
Chris@42 848 T6l = FMA(KP198912367, T6e, T6h);
Chris@42 849 T6i = FNMS(KP198912367, T6h, T6e);
Chris@42 850 T6v = FNMS(KP668178637, T6u, T6t);
Chris@42 851 T6B = FMA(KP668178637, T6t, T6u);
Chris@42 852 T6m = FNMS(KP198912367, T67, T6a);
Chris@42 853 T6b = FMA(KP198912367, T6a, T67);
Chris@42 854 }
Chris@42 855 T8X = FMA(KP923879532, T8W, T8V);
Chris@42 856 T8Z = FNMS(KP923879532, T8W, T8V);
Chris@42 857 T6D = T6B - T6C;
Chris@42 858 T8Y = T6B + T6C;
Chris@42 859 }
Chris@42 860 T8U = T6i - T6b;
Chris@42 861 T6j = T6b + T6i;
Chris@42 862 T90 = T6y - T6v;
Chris@42 863 T6z = T6v + T6y;
Chris@42 864 ci[WS(rs, 18)] = FNMS(KP831469612, T8Y, T8X);
Chris@42 865 cr[WS(rs, 29)] = -(FMA(KP831469612, T8Y, T8X));
Chris@42 866 cr[WS(rs, 1)] = FMA(KP980785280, T6j, T64);
Chris@42 867 ci[WS(rs, 14)] = FNMS(KP980785280, T6j, T64);
Chris@42 868 cr[WS(rs, 21)] = FMS(KP831469612, T90, T8Z);
Chris@42 869 ci[WS(rs, 26)] = FMA(KP831469612, T90, T8Z);
Chris@42 870 T6n = T6l - T6m;
Chris@42 871 T8S = T6m + T6l;
Chris@42 872 }
Chris@42 873 T6A = FNMS(KP923879532, T6r, T6o);
Chris@42 874 T6s = FMA(KP923879532, T6r, T6o);
Chris@42 875 T8T = FNMS(KP923879532, T8Q, T8P);
Chris@42 876 T8R = FMA(KP923879532, T8Q, T8P);
Chris@42 877 ci[WS(rs, 6)] = FMA(KP980785280, T6n, T6k);
Chris@42 878 cr[WS(rs, 9)] = FNMS(KP980785280, T6n, T6k);
Chris@42 879 ci[WS(rs, 2)] = FMA(KP831469612, T6z, T6s);
Chris@42 880 cr[WS(rs, 13)] = FNMS(KP831469612, T6z, T6s);
Chris@42 881 ci[WS(rs, 30)] = FMA(KP980785280, T8S, T8R);
Chris@42 882 cr[WS(rs, 17)] = FMS(KP980785280, T8S, T8R);
Chris@42 883 ci[WS(rs, 22)] = FMA(KP980785280, T8U, T8T);
Chris@42 884 cr[WS(rs, 25)] = FMS(KP980785280, T8U, T8T);
Chris@42 885 }
Chris@42 886 }
Chris@42 887 }
Chris@42 888 }
Chris@42 889 }
/* Final two stores of the iteration, using the loop-scope temporaries T6D and T6A. */
Chris@42 890 cr[WS(rs, 5)] = FMA(KP831469612, T6D, T6A);
Chris@42 891 ci[WS(rs, 10)] = FNMS(KP831469612, T6D, T6A);
Chris@42 892 }
Chris@42 893 }
Chris@42 894 }
Chris@42 895
/*
 * Twiddle-instruction table referenced by the hc2hc descriptor `desc`.
 * NOTE(review): presumably {TW_FULL, 1, 32} requests the full twiddle set for
 * a radix-32 codelet and {TW_NEXT, 1, 0} ends the list -- confirm against the
 * tw_instr definition, which is not visible in this file.
 */
Chris@42 896 static const tw_instr twinstr[] = {
Chris@42 897 {TW_FULL, 1, 32},
Chris@42 898 {TW_NEXT, 1, 0}
Chris@42 899 };
Chris@42 900
/*
 * Descriptor handed to the planner when this codelet is registered: size 32,
 * name "hf_32", the twiddle table above, and &GENUS (defined elsewhere).
 * The trailing {236, 62, 198, 0} matches the operation counts quoted in the
 * header comment of the FMA variant (236 additions, 62 multiplications,
 * 198 fused multiply/add); the meaning of the final 0 is not shown here.
 */
Chris@42 901 static const hc2hc_desc desc = { 32, "hf_32", twinstr, &GENUS, {236, 62, 198, 0} };
Chris@42 902
/*
 * Registration entry point for the FMA variant: passes hf_32 and its
 * descriptor to X(khc2hc_register) on planner p.  X() is a project macro
 * defined outside this file (presumably a name-mangling prefix -- confirm).
 */
Chris@42 903 void X(codelet_hf_32) (planner *p) {
Chris@42 904 X(khc2hc_register) (p, hf_32, &desc);
Chris@42 905 }
Chris@42 906 #else /* HAVE_FMA */
Chris@42 907
Chris@42 908 /* Generated by: ../../../genfft/gen_hc2hc.native -compact -variables 4 -pipeline-latency 4 -n 32 -dit -name hf_32 -include hf.h */
Chris@42 909
Chris@42 910 /*
Chris@42 911 * This function contains 434 FP additions, 208 FP multiplications,
Chris@42 912 * (or, 340 additions, 114 multiplications, 94 fused multiply/add),
Chris@42 913 * 96 stack variables, 7 constants, and 128 memory accesses
Chris@42 914 */
Chris@42 915 #include "hf.h"
Chris@42 916
/*
 * hf_32, non-FMA variant (selected when HAVE_FMA is not defined).
 *
 * For every m in [mb, me) the body reads the 32 values cr[WS(rs, k)] and the
 * 32 values ci[WS(rs, k)] (k = 0..31), combines each k >= 1 pair with the
 * coefficients W[2k-2] and W[2k-1], runs a fixed network of additions,
 * subtractions and constant multiplications, and overwrites all 64 slots.
 * All loads are finished before the first store, so the update is in place.
 *
 * Parameters:
 *   cr, ci  data arrays indexed through WS(rs, k); after each iteration cr
 *           moves forward by ms and ci moves backward by ms.
 *   W       coefficient table; 62 entries are consumed per iteration and the
 *           pointer is pre-offset by (mb - 1) * 62 before the first one.
 *   rs      stride passed to WS().
 *   mb, me  half-open range of the loop counter m.
 *   ms      per-iteration step for cr (+ms) and ci (-ms).
 *
 * NOTE(review): E, R, INT, stride, DK, WS, FMA, FNMS and
 * MAKE_VOLATILE_STRIDE come from headers outside this chunk.  FMA(a, b, c)
 * and FNMS(a, b, c) presumably expand to a*b + c and c - a*b -- confirm.
 * This code is generator output; statement order is part of the numerical
 * behaviour and must not be rearranged by hand.
 */
Chris@42 917 static void hf_32(R *cr, R *ci, const R *W, stride rs, INT mb, INT me, INT ms)
Chris@42 918 {
/* Numeric constants.  The literals equal, in order: sin(3*pi/16),
 * cos(3*pi/16), cos(pi/16), sin(pi/16), sin(pi/8), cos(pi/8), sqrt(1/2). */
Chris@42 919 DK(KP555570233, +0.555570233019602224742830813948532874374937191);
Chris@42 920 DK(KP831469612, +0.831469612302545237078788377617905756738560812);
Chris@42 921 DK(KP980785280, +0.980785280403230449126182236134239036973933731);
Chris@42 922 DK(KP195090322, +0.195090322016128267848284868477022240927691618);
Chris@42 923 DK(KP382683432, +0.382683432365089771728459984030398866761344562);
Chris@42 924 DK(KP923879532, +0.923879532511286756128183189396788286822416626);
Chris@42 925 DK(KP707106781, +0.707106781186547524400844362104849039284835938);
Chris@42 926 {
Chris@42 927 INT m;
/* One pass per m: cr steps up by ms, ci steps down by ms, W steps by 62. */
Chris@42 928 for (m = mb, W = W + ((mb - 1) * 62); m < me; m = m + 1, cr = cr + ms, ci = ci - ms, W = W + 62, MAKE_VOLATILE_STRIDE(64, rs)) {
/* Intermediates that survive from the load stage into the output stages. */
Chris@42 929 E Tj, T5F, T7C, T7Q, T35, T4T, T78, T7m, T1Q, T61, T5Y, T6J, T3K, T56, T41;
Chris@42 930 E T59, T2B, T67, T6e, T6O, T4b, T5g, T4s, T5d, TG, T7l, T5I, T73, T3a, T4U;
Chris@42 931 E T3f, T4V, T14, T5K, T5N, T6F, T3m, T4Z, T3r, T4Y, T1r, T5P, T5S, T6E, T3x;
Chris@42 932 E T52, T3C, T51, T2d, T5Z, T64, T6K, T3V, T5a, T44, T57, T2Y, T6f, T6a, T6P;
Chris@42 933 E T4m, T5e, T4v, T5h;
/* Load group: indices 0, 16, 8, 24.  Index 0 is used as-is (no W factor);
 * the others are combined with their W pair, then summed/differenced. */
Chris@42 934 {
Chris@42 935 E T1, T76, T6, T75, Tc, T32, Th, T33;
Chris@42 936 T1 = cr[0];
Chris@42 937 T76 = ci[0];
Chris@42 938 {
Chris@42 939 E T3, T5, T2, T4;
Chris@42 940 T3 = cr[WS(rs, 16)];
Chris@42 941 T5 = ci[WS(rs, 16)];
Chris@42 942 T2 = W[30];
Chris@42 943 T4 = W[31];
Chris@42 944 T6 = FMA(T2, T3, T4 * T5);
Chris@42 945 T75 = FNMS(T4, T3, T2 * T5);
Chris@42 946 }
Chris@42 947 {
Chris@42 948 E T9, Tb, T8, Ta;
Chris@42 949 T9 = cr[WS(rs, 8)];
Chris@42 950 Tb = ci[WS(rs, 8)];
Chris@42 951 T8 = W[14];
Chris@42 952 Ta = W[15];
Chris@42 953 Tc = FMA(T8, T9, Ta * Tb);
Chris@42 954 T32 = FNMS(Ta, T9, T8 * Tb);
Chris@42 955 }
Chris@42 956 {
Chris@42 957 E Te, Tg, Td, Tf;
Chris@42 958 Te = cr[WS(rs, 24)];
Chris@42 959 Tg = ci[WS(rs, 24)];
Chris@42 960 Td = W[46];
Chris@42 961 Tf = W[47];
Chris@42 962 Th = FMA(Td, Te, Tf * Tg);
Chris@42 963 T33 = FNMS(Tf, Te, Td * Tg);
Chris@42 964 }
Chris@42 965 {
Chris@42 966 E T7, Ti, T7A, T7B;
Chris@42 967 T7 = T1 + T6;
Chris@42 968 Ti = Tc + Th;
Chris@42 969 Tj = T7 + Ti;
Chris@42 970 T5F = T7 - Ti;
Chris@42 971 T7A = Tc - Th;
Chris@42 972 T7B = T76 - T75;
Chris@42 973 T7C = T7A + T7B;
Chris@42 974 T7Q = T7B - T7A;
Chris@42 975 }
Chris@42 976 {
Chris@42 977 E T31, T34, T74, T77;
Chris@42 978 T31 = T1 - T6;
Chris@42 979 T34 = T32 - T33;
Chris@42 980 T35 = T31 + T34;
Chris@42 981 T4T = T31 - T34;
Chris@42 982 T74 = T32 + T33;
Chris@42 983 T77 = T75 + T76;
Chris@42 984 T78 = T74 + T77;
Chris@42 985 T7m = T77 - T74;
Chris@42 986 }
Chris@42 987 }
/* Load group: indices 1, 25, 17, 9. */
Chris@42 988 {
Chris@42 989 E T1y, T3X, T1O, T3I, T1D, T3Y, T1J, T3H;
Chris@42 990 {
Chris@42 991 E T1v, T1x, T1u, T1w;
Chris@42 992 T1v = cr[WS(rs, 1)];
Chris@42 993 T1x = ci[WS(rs, 1)];
Chris@42 994 T1u = W[0];
Chris@42 995 T1w = W[1];
Chris@42 996 T1y = FMA(T1u, T1v, T1w * T1x);
Chris@42 997 T3X = FNMS(T1w, T1v, T1u * T1x);
Chris@42 998 }
Chris@42 999 {
Chris@42 1000 E T1L, T1N, T1K, T1M;
Chris@42 1001 T1L = cr[WS(rs, 25)];
Chris@42 1002 T1N = ci[WS(rs, 25)];
Chris@42 1003 T1K = W[48];
Chris@42 1004 T1M = W[49];
Chris@42 1005 T1O = FMA(T1K, T1L, T1M * T1N);
Chris@42 1006 T3I = FNMS(T1M, T1L, T1K * T1N);
Chris@42 1007 }
Chris@42 1008 {
Chris@42 1009 E T1A, T1C, T1z, T1B;
Chris@42 1010 T1A = cr[WS(rs, 17)];
Chris@42 1011 T1C = ci[WS(rs, 17)];
Chris@42 1012 T1z = W[32];
Chris@42 1013 T1B = W[33];
Chris@42 1014 T1D = FMA(T1z, T1A, T1B * T1C);
Chris@42 1015 T3Y = FNMS(T1B, T1A, T1z * T1C);
Chris@42 1016 }
Chris@42 1017 {
Chris@42 1018 E T1G, T1I, T1F, T1H;
Chris@42 1019 T1G = cr[WS(rs, 9)];
Chris@42 1020 T1I = ci[WS(rs, 9)];
Chris@42 1021 T1F = W[16];
Chris@42 1022 T1H = W[17];
Chris@42 1023 T1J = FMA(T1F, T1G, T1H * T1I);
Chris@42 1024 T3H = FNMS(T1H, T1G, T1F * T1I);
Chris@42 1025 }
Chris@42 1026 {
Chris@42 1027 E T1E, T1P, T5W, T5X;
Chris@42 1028 T1E = T1y + T1D;
Chris@42 1029 T1P = T1J + T1O;
Chris@42 1030 T1Q = T1E + T1P;
Chris@42 1031 T61 = T1E - T1P;
Chris@42 1032 T5W = T3X + T3Y;
Chris@42 1033 T5X = T3H + T3I;
Chris@42 1034 T5Y = T5W - T5X;
Chris@42 1035 T6J = T5W + T5X;
Chris@42 1036 }
Chris@42 1037 {
Chris@42 1038 E T3G, T3J, T3Z, T40;
Chris@42 1039 T3G = T1y - T1D;
Chris@42 1040 T3J = T3H - T3I;
Chris@42 1041 T3K = T3G + T3J;
Chris@42 1042 T56 = T3G - T3J;
Chris@42 1043 T3Z = T3X - T3Y;
Chris@42 1044 T40 = T1J - T1O;
Chris@42 1045 T41 = T3Z - T40;
Chris@42 1046 T59 = T3Z + T40;
Chris@42 1047 }
Chris@42 1048 }
/* Load group: indices 31, 23, 15, 7. */
Chris@42 1049 {
Chris@42 1050 E T2j, T47, T2z, T4q, T2o, T48, T2u, T4p;
Chris@42 1051 {
Chris@42 1052 E T2g, T2i, T2f, T2h;
Chris@42 1053 T2g = cr[WS(rs, 31)];
Chris@42 1054 T2i = ci[WS(rs, 31)];
Chris@42 1055 T2f = W[60];
Chris@42 1056 T2h = W[61];
Chris@42 1057 T2j = FMA(T2f, T2g, T2h * T2i);
Chris@42 1058 T47 = FNMS(T2h, T2g, T2f * T2i);
Chris@42 1059 }
Chris@42 1060 {
Chris@42 1061 E T2w, T2y, T2v, T2x;
Chris@42 1062 T2w = cr[WS(rs, 23)];
Chris@42 1063 T2y = ci[WS(rs, 23)];
Chris@42 1064 T2v = W[44];
Chris@42 1065 T2x = W[45];
Chris@42 1066 T2z = FMA(T2v, T2w, T2x * T2y);
Chris@42 1067 T4q = FNMS(T2x, T2w, T2v * T2y);
Chris@42 1068 }
Chris@42 1069 {
Chris@42 1070 E T2l, T2n, T2k, T2m;
Chris@42 1071 T2l = cr[WS(rs, 15)];
Chris@42 1072 T2n = ci[WS(rs, 15)];
Chris@42 1073 T2k = W[28];
Chris@42 1074 T2m = W[29];
Chris@42 1075 T2o = FMA(T2k, T2l, T2m * T2n);
Chris@42 1076 T48 = FNMS(T2m, T2l, T2k * T2n);
Chris@42 1077 }
Chris@42 1078 {
Chris@42 1079 E T2r, T2t, T2q, T2s;
Chris@42 1080 T2r = cr[WS(rs, 7)];
Chris@42 1081 T2t = ci[WS(rs, 7)];
Chris@42 1082 T2q = W[12];
Chris@42 1083 T2s = W[13];
Chris@42 1084 T2u = FMA(T2q, T2r, T2s * T2t);
Chris@42 1085 T4p = FNMS(T2s, T2r, T2q * T2t);
Chris@42 1086 }
Chris@42 1087 {
Chris@42 1088 E T2p, T2A, T6c, T6d;
Chris@42 1089 T2p = T2j + T2o;
Chris@42 1090 T2A = T2u + T2z;
Chris@42 1091 T2B = T2p + T2A;
Chris@42 1092 T67 = T2p - T2A;
Chris@42 1093 T6c = T47 + T48;
Chris@42 1094 T6d = T4p + T4q;
Chris@42 1095 T6e = T6c - T6d;
Chris@42 1096 T6O = T6c + T6d;
Chris@42 1097 }
Chris@42 1098 {
Chris@42 1099 E T49, T4a, T4o, T4r;
Chris@42 1100 T49 = T47 - T48;
Chris@42 1101 T4a = T2u - T2z;
Chris@42 1102 T4b = T49 - T4a;
Chris@42 1103 T5g = T49 + T4a;
Chris@42 1104 T4o = T2j - T2o;
Chris@42 1105 T4r = T4p - T4q;
Chris@42 1106 T4s = T4o + T4r;
Chris@42 1107 T5d = T4o - T4r;
Chris@42 1108 }
Chris@42 1109 }
/* Load group: indices 4, 12, 20, 28. */
Chris@42 1110 {
Chris@42 1111 E To, T37, TE, T3d, Tt, T38, Tz, T3c;
Chris@42 1112 {
Chris@42 1113 E Tl, Tn, Tk, Tm;
Chris@42 1114 Tl = cr[WS(rs, 4)];
Chris@42 1115 Tn = ci[WS(rs, 4)];
Chris@42 1116 Tk = W[6];
Chris@42 1117 Tm = W[7];
Chris@42 1118 To = FMA(Tk, Tl, Tm * Tn);
Chris@42 1119 T37 = FNMS(Tm, Tl, Tk * Tn);
Chris@42 1120 }
Chris@42 1121 {
Chris@42 1122 E TB, TD, TA, TC;
Chris@42 1123 TB = cr[WS(rs, 12)];
Chris@42 1124 TD = ci[WS(rs, 12)];
Chris@42 1125 TA = W[22];
Chris@42 1126 TC = W[23];
Chris@42 1127 TE = FMA(TA, TB, TC * TD);
Chris@42 1128 T3d = FNMS(TC, TB, TA * TD);
Chris@42 1129 }
Chris@42 1130 {
Chris@42 1131 E Tq, Ts, Tp, Tr;
Chris@42 1132 Tq = cr[WS(rs, 20)];
Chris@42 1133 Ts = ci[WS(rs, 20)];
Chris@42 1134 Tp = W[38];
Chris@42 1135 Tr = W[39];
Chris@42 1136 Tt = FMA(Tp, Tq, Tr * Ts);
Chris@42 1137 T38 = FNMS(Tr, Tq, Tp * Ts);
Chris@42 1138 }
Chris@42 1139 {
Chris@42 1140 E Tw, Ty, Tv, Tx;
Chris@42 1141 Tw = cr[WS(rs, 28)];
Chris@42 1142 Ty = ci[WS(rs, 28)];
Chris@42 1143 Tv = W[54];
Chris@42 1144 Tx = W[55];
Chris@42 1145 Tz = FMA(Tv, Tw, Tx * Ty);
Chris@42 1146 T3c = FNMS(Tx, Tw, Tv * Ty);
Chris@42 1147 }
Chris@42 1148 {
Chris@42 1149 E Tu, TF, T5G, T5H;
Chris@42 1150 Tu = To + Tt;
Chris@42 1151 TF = Tz + TE;
Chris@42 1152 TG = Tu + TF;
Chris@42 1153 T7l = Tu - TF;
Chris@42 1154 T5G = T3c + T3d;
Chris@42 1155 T5H = T37 + T38;
Chris@42 1156 T5I = T5G - T5H;
Chris@42 1157 T73 = T5H + T5G;
Chris@42 1158 }
Chris@42 1159 {
Chris@42 1160 E T36, T39, T3b, T3e;
Chris@42 1161 T36 = To - Tt;
Chris@42 1162 T39 = T37 - T38;
Chris@42 1163 T3a = T36 + T39;
Chris@42 1164 T4U = T36 - T39;
Chris@42 1165 T3b = Tz - TE;
Chris@42 1166 T3e = T3c - T3d;
Chris@42 1167 T3f = T3b - T3e;
Chris@42 1168 T4V = T3b + T3e;
Chris@42 1169 }
Chris@42 1170 }
/* Load group: indices 2, 26, 18, 10. */
Chris@42 1171 {
Chris@42 1172 E TM, T3n, T12, T3k, TR, T3o, TX, T3j;
Chris@42 1173 {
Chris@42 1174 E TJ, TL, TI, TK;
Chris@42 1175 TJ = cr[WS(rs, 2)];
Chris@42 1176 TL = ci[WS(rs, 2)];
Chris@42 1177 TI = W[2];
Chris@42 1178 TK = W[3];
Chris@42 1179 TM = FMA(TI, TJ, TK * TL);
Chris@42 1180 T3n = FNMS(TK, TJ, TI * TL);
Chris@42 1181 }
Chris@42 1182 {
Chris@42 1183 E TZ, T11, TY, T10;
Chris@42 1184 TZ = cr[WS(rs, 26)];
Chris@42 1185 T11 = ci[WS(rs, 26)];
Chris@42 1186 TY = W[50];
Chris@42 1187 T10 = W[51];
Chris@42 1188 T12 = FMA(TY, TZ, T10 * T11);
Chris@42 1189 T3k = FNMS(T10, TZ, TY * T11);
Chris@42 1190 }
Chris@42 1191 {
Chris@42 1192 E TO, TQ, TN, TP;
Chris@42 1193 TO = cr[WS(rs, 18)];
Chris@42 1194 TQ = ci[WS(rs, 18)];
Chris@42 1195 TN = W[34];
Chris@42 1196 TP = W[35];
Chris@42 1197 TR = FMA(TN, TO, TP * TQ);
Chris@42 1198 T3o = FNMS(TP, TO, TN * TQ);
Chris@42 1199 }
Chris@42 1200 {
Chris@42 1201 E TU, TW, TT, TV;
Chris@42 1202 TU = cr[WS(rs, 10)];
Chris@42 1203 TW = ci[WS(rs, 10)];
Chris@42 1204 TT = W[18];
Chris@42 1205 TV = W[19];
Chris@42 1206 TX = FMA(TT, TU, TV * TW);
Chris@42 1207 T3j = FNMS(TV, TU, TT * TW);
Chris@42 1208 }
Chris@42 1209 {
Chris@42 1210 E TS, T13, T5L, T5M;
Chris@42 1211 TS = TM + TR;
Chris@42 1212 T13 = TX + T12;
Chris@42 1213 T14 = TS + T13;
Chris@42 1214 T5K = TS - T13;
Chris@42 1215 T5L = T3n + T3o;
Chris@42 1216 T5M = T3j + T3k;
Chris@42 1217 T5N = T5L - T5M;
Chris@42 1218 T6F = T5L + T5M;
Chris@42 1219 }
Chris@42 1220 {
Chris@42 1221 E T3i, T3l, T3p, T3q;
Chris@42 1222 T3i = TM - TR;
Chris@42 1223 T3l = T3j - T3k;
Chris@42 1224 T3m = T3i + T3l;
Chris@42 1225 T4Z = T3i - T3l;
Chris@42 1226 T3p = T3n - T3o;
Chris@42 1227 T3q = TX - T12;
Chris@42 1228 T3r = T3p - T3q;
Chris@42 1229 T4Y = T3p + T3q;
Chris@42 1230 }
Chris@42 1231 }
/* Load group: indices 30, 22, 14, 6. */
Chris@42 1232 {
Chris@42 1233 E T19, T3t, T1p, T3A, T1e, T3u, T1k, T3z;
Chris@42 1234 {
Chris@42 1235 E T16, T18, T15, T17;
Chris@42 1236 T16 = cr[WS(rs, 30)];
Chris@42 1237 T18 = ci[WS(rs, 30)];
Chris@42 1238 T15 = W[58];
Chris@42 1239 T17 = W[59];
Chris@42 1240 T19 = FMA(T15, T16, T17 * T18);
Chris@42 1241 T3t = FNMS(T17, T16, T15 * T18);
Chris@42 1242 }
Chris@42 1243 {
Chris@42 1244 E T1m, T1o, T1l, T1n;
Chris@42 1245 T1m = cr[WS(rs, 22)];
Chris@42 1246 T1o = ci[WS(rs, 22)];
Chris@42 1247 T1l = W[42];
Chris@42 1248 T1n = W[43];
Chris@42 1249 T1p = FMA(T1l, T1m, T1n * T1o);
Chris@42 1250 T3A = FNMS(T1n, T1m, T1l * T1o);
Chris@42 1251 }
Chris@42 1252 {
Chris@42 1253 E T1b, T1d, T1a, T1c;
Chris@42 1254 T1b = cr[WS(rs, 14)];
Chris@42 1255 T1d = ci[WS(rs, 14)];
Chris@42 1256 T1a = W[26];
Chris@42 1257 T1c = W[27];
Chris@42 1258 T1e = FMA(T1a, T1b, T1c * T1d);
Chris@42 1259 T3u = FNMS(T1c, T1b, T1a * T1d);
Chris@42 1260 }
Chris@42 1261 {
Chris@42 1262 E T1h, T1j, T1g, T1i;
Chris@42 1263 T1h = cr[WS(rs, 6)];
Chris@42 1264 T1j = ci[WS(rs, 6)];
Chris@42 1265 T1g = W[10];
Chris@42 1266 T1i = W[11];
Chris@42 1267 T1k = FMA(T1g, T1h, T1i * T1j);
Chris@42 1268 T3z = FNMS(T1i, T1h, T1g * T1j);
Chris@42 1269 }
Chris@42 1270 {
Chris@42 1271 E T1f, T1q, T5Q, T5R;
Chris@42 1272 T1f = T19 + T1e;
Chris@42 1273 T1q = T1k + T1p;
Chris@42 1274 T1r = T1f + T1q;
Chris@42 1275 T5P = T1f - T1q;
Chris@42 1276 T5Q = T3t + T3u;
Chris@42 1277 T5R = T3z + T3A;
Chris@42 1278 T5S = T5Q - T5R;
Chris@42 1279 T6E = T5Q + T5R;
Chris@42 1280 }
Chris@42 1281 {
Chris@42 1282 E T3v, T3w, T3y, T3B;
Chris@42 1283 T3v = T3t - T3u;
Chris@42 1284 T3w = T1k - T1p;
Chris@42 1285 T3x = T3v - T3w;
Chris@42 1286 T52 = T3v + T3w;
Chris@42 1287 T3y = T19 - T1e;
Chris@42 1288 T3B = T3z - T3A;
Chris@42 1289 T3C = T3y + T3B;
Chris@42 1290 T51 = T3y - T3B;
Chris@42 1291 }
Chris@42 1292 }
/* Load group: indices 5, 21, 29, 13.  Four of this group's results are
 * scaled by KP707106781 before leaving the block. */
Chris@42 1293 {
Chris@42 1294 E T1V, T3M, T20, T3N, T3L, T3O, T26, T3Q, T2b, T3R, T3S, T3T;
Chris@42 1295 {
Chris@42 1296 E T1S, T1U, T1R, T1T;
Chris@42 1297 T1S = cr[WS(rs, 5)];
Chris@42 1298 T1U = ci[WS(rs, 5)];
Chris@42 1299 T1R = W[8];
Chris@42 1300 T1T = W[9];
Chris@42 1301 T1V = FMA(T1R, T1S, T1T * T1U);
Chris@42 1302 T3M = FNMS(T1T, T1S, T1R * T1U);
Chris@42 1303 }
Chris@42 1304 {
Chris@42 1305 E T1X, T1Z, T1W, T1Y;
Chris@42 1306 T1X = cr[WS(rs, 21)];
Chris@42 1307 T1Z = ci[WS(rs, 21)];
Chris@42 1308 T1W = W[40];
Chris@42 1309 T1Y = W[41];
Chris@42 1310 T20 = FMA(T1W, T1X, T1Y * T1Z);
Chris@42 1311 T3N = FNMS(T1Y, T1X, T1W * T1Z);
Chris@42 1312 }
Chris@42 1313 T3L = T1V - T20;
Chris@42 1314 T3O = T3M - T3N;
Chris@42 1315 {
Chris@42 1316 E T23, T25, T22, T24;
Chris@42 1317 T23 = cr[WS(rs, 29)];
Chris@42 1318 T25 = ci[WS(rs, 29)];
Chris@42 1319 T22 = W[56];
Chris@42 1320 T24 = W[57];
Chris@42 1321 T26 = FMA(T22, T23, T24 * T25);
Chris@42 1322 T3Q = FNMS(T24, T23, T22 * T25);
Chris@42 1323 }
Chris@42 1324 {
Chris@42 1325 E T28, T2a, T27, T29;
Chris@42 1326 T28 = cr[WS(rs, 13)];
Chris@42 1327 T2a = ci[WS(rs, 13)];
Chris@42 1328 T27 = W[24];
Chris@42 1329 T29 = W[25];
Chris@42 1330 T2b = FMA(T27, T28, T29 * T2a);
Chris@42 1331 T3R = FNMS(T29, T28, T27 * T2a);
Chris@42 1332 }
Chris@42 1333 T3S = T3Q - T3R;
Chris@42 1334 T3T = T26 - T2b;
Chris@42 1335 {
Chris@42 1336 E T21, T2c, T62, T63;
Chris@42 1337 T21 = T1V + T20;
Chris@42 1338 T2c = T26 + T2b;
Chris@42 1339 T2d = T21 + T2c;
Chris@42 1340 T5Z = T21 - T2c;
Chris@42 1341 T62 = T3Q + T3R;
Chris@42 1342 T63 = T3M + T3N;
Chris@42 1343 T64 = T62 - T63;
Chris@42 1344 T6K = T63 + T62;
Chris@42 1345 }
Chris@42 1346 {
Chris@42 1347 E T3P, T3U, T42, T43;
Chris@42 1348 T3P = T3L + T3O;
Chris@42 1349 T3U = T3S - T3T;
Chris@42 1350 T3V = KP707106781 * (T3P - T3U);
Chris@42 1351 T5a = KP707106781 * (T3P + T3U);
Chris@42 1352 T42 = T3T + T3S;
Chris@42 1353 T43 = T3L - T3O;
Chris@42 1354 T44 = KP707106781 * (T42 - T43);
Chris@42 1355 T57 = KP707106781 * (T43 + T42);
Chris@42 1356 }
Chris@42 1357 }
/* Load group: indices 3, 19, 27, 11.  As in the previous group, four
 * results are scaled by KP707106781. */
Chris@42 1358 {
Chris@42 1359 E T2G, T4i, T2L, T4j, T4h, T4k, T2R, T4d, T2W, T4e, T4c, T4f;
Chris@42 1360 {
Chris@42 1361 E T2D, T2F, T2C, T2E;
Chris@42 1362 T2D = cr[WS(rs, 3)];
Chris@42 1363 T2F = ci[WS(rs, 3)];
Chris@42 1364 T2C = W[4];
Chris@42 1365 T2E = W[5];
Chris@42 1366 T2G = FMA(T2C, T2D, T2E * T2F);
Chris@42 1367 T4i = FNMS(T2E, T2D, T2C * T2F);
Chris@42 1368 }
Chris@42 1369 {
Chris@42 1370 E T2I, T2K, T2H, T2J;
Chris@42 1371 T2I = cr[WS(rs, 19)];
Chris@42 1372 T2K = ci[WS(rs, 19)];
Chris@42 1373 T2H = W[36];
Chris@42 1374 T2J = W[37];
Chris@42 1375 T2L = FMA(T2H, T2I, T2J * T2K);
Chris@42 1376 T4j = FNMS(T2J, T2I, T2H * T2K);
Chris@42 1377 }
Chris@42 1378 T4h = T2G - T2L;
Chris@42 1379 T4k = T4i - T4j;
Chris@42 1380 {
Chris@42 1381 E T2O, T2Q, T2N, T2P;
Chris@42 1382 T2O = cr[WS(rs, 27)];
Chris@42 1383 T2Q = ci[WS(rs, 27)];
Chris@42 1384 T2N = W[52];
Chris@42 1385 T2P = W[53];
Chris@42 1386 T2R = FMA(T2N, T2O, T2P * T2Q);
Chris@42 1387 T4d = FNMS(T2P, T2O, T2N * T2Q);
Chris@42 1388 }
Chris@42 1389 {
Chris@42 1390 E T2T, T2V, T2S, T2U;
Chris@42 1391 T2T = cr[WS(rs, 11)];
Chris@42 1392 T2V = ci[WS(rs, 11)];
Chris@42 1393 T2S = W[20];
Chris@42 1394 T2U = W[21];
Chris@42 1395 T2W = FMA(T2S, T2T, T2U * T2V);
Chris@42 1396 T4e = FNMS(T2U, T2T, T2S * T2V);
Chris@42 1397 }
Chris@42 1398 T4c = T2R - T2W;
Chris@42 1399 T4f = T4d - T4e;
Chris@42 1400 {
Chris@42 1401 E T2M, T2X, T68, T69;
Chris@42 1402 T2M = T2G + T2L;
Chris@42 1403 T2X = T2R + T2W;
Chris@42 1404 T2Y = T2M + T2X;
Chris@42 1405 T6f = T2M - T2X;
Chris@42 1406 T68 = T4d + T4e;
Chris@42 1407 T69 = T4i + T4j;
Chris@42 1408 T6a = T68 - T69;
Chris@42 1409 T6P = T69 + T68;
Chris@42 1410 }
Chris@42 1411 {
Chris@42 1412 E T4g, T4l, T4t, T4u;
Chris@42 1413 T4g = T4c + T4f;
Chris@42 1414 T4l = T4h - T4k;
Chris@42 1415 T4m = KP707106781 * (T4g - T4l);
Chris@42 1416 T5e = KP707106781 * (T4l + T4g);
Chris@42 1417 T4t = T4h + T4k;
Chris@42 1418 T4u = T4f - T4c;
Chris@42 1419 T4v = KP707106781 * (T4t - T4u);
Chris@42 1420 T5h = KP707106781 * (T4t + T4u);
Chris@42 1421 }
Chris@42 1422 }
/* Output group 1 (pure sums/differences, no constant multiplies):
 * writes cr[0], cr[8], cr[16], cr[24] and ci[7], ci[15], ci[23], ci[31]. */
Chris@42 1423 {
Chris@42 1424 E T1t, T6X, T7a, T7c, T30, T7b, T70, T71;
Chris@42 1425 {
Chris@42 1426 E TH, T1s, T72, T79;
Chris@42 1427 TH = Tj + TG;
Chris@42 1428 T1s = T14 + T1r;
Chris@42 1429 T1t = TH + T1s;
Chris@42 1430 T6X = TH - T1s;
Chris@42 1431 T72 = T6F + T6E;
Chris@42 1432 T79 = T73 + T78;
Chris@42 1433 T7a = T72 + T79;
Chris@42 1434 T7c = T79 - T72;
Chris@42 1435 }
Chris@42 1436 {
Chris@42 1437 E T2e, T2Z, T6Y, T6Z;
Chris@42 1438 T2e = T1Q + T2d;
Chris@42 1439 T2Z = T2B + T2Y;
Chris@42 1440 T30 = T2e + T2Z;
Chris@42 1441 T7b = T2Z - T2e;
Chris@42 1442 T6Y = T6O + T6P;
Chris@42 1443 T6Z = T6J + T6K;
Chris@42 1444 T70 = T6Y - T6Z;
Chris@42 1445 T71 = T6Z + T6Y;
Chris@42 1446 }
Chris@42 1447 ci[WS(rs, 15)] = T1t - T30;
Chris@42 1448 cr[WS(rs, 24)] = T7b - T7c;
Chris@42 1449 ci[WS(rs, 23)] = T7b + T7c;
Chris@42 1450 cr[0] = T1t + T30;
Chris@42 1451 cr[WS(rs, 8)] = T6X - T70;
Chris@42 1452 cr[WS(rs, 16)] = T71 - T7a;
Chris@42 1453 ci[WS(rs, 31)] = T71 + T7a;
Chris@42 1454 ci[WS(rs, 7)] = T6X + T70;
Chris@42 1455 }
/* Output group 2: writes cr[3], cr[7], ..., cr[31] (step 4) and
 * ci[0], ci[4], ..., ci[28] (step 4). */
Chris@42 1456 {
Chris@42 1457 E T4X, T5p, T7D, T7J, T54, T7y, T5z, T5D, T5c, T5m, T5s, T7I, T5w, T5C, T5j;
Chris@42 1458 E T5n, T4W, T7z;
Chris@42 1459 T4W = KP707106781 * (T4U + T4V);
Chris@42 1460 T4X = T4T - T4W;
Chris@42 1461 T5p = T4T + T4W;
Chris@42 1462 T7z = KP707106781 * (T3a - T3f);
Chris@42 1463 T7D = T7z + T7C;
Chris@42 1464 T7J = T7C - T7z;
Chris@42 1465 {
Chris@42 1466 E T50, T53, T5x, T5y;
Chris@42 1467 T50 = FMA(KP923879532, T4Y, KP382683432 * T4Z);
Chris@42 1468 T53 = FNMS(KP923879532, T52, KP382683432 * T51);
Chris@42 1469 T54 = T50 + T53;
Chris@42 1470 T7y = T50 - T53;
Chris@42 1471 T5x = T5d + T5e;
Chris@42 1472 T5y = T5g + T5h;
Chris@42 1473 T5z = FNMS(KP980785280, T5y, KP195090322 * T5x);
Chris@42 1474 T5D = FMA(KP980785280, T5x, KP195090322 * T5y);
Chris@42 1475 }
Chris@42 1476 {
Chris@42 1477 E T58, T5b, T5q, T5r;
Chris@42 1478 T58 = T56 - T57;
Chris@42 1479 T5b = T59 - T5a;
Chris@42 1480 T5c = FMA(KP831469612, T58, KP555570233 * T5b);
Chris@42 1481 T5m = FNMS(KP831469612, T5b, KP555570233 * T58);
Chris@42 1482 T5q = FNMS(KP382683432, T4Y, KP923879532 * T4Z);
Chris@42 1483 T5r = FMA(KP382683432, T52, KP923879532 * T51);
Chris@42 1484 T5s = T5q + T5r;
Chris@42 1485 T7I = T5r - T5q;
Chris@42 1486 }
Chris@42 1487 {
Chris@42 1488 E T5u, T5v, T5f, T5i;
Chris@42 1489 T5u = T56 + T57;
Chris@42 1490 T5v = T59 + T5a;
Chris@42 1491 T5w = FMA(KP195090322, T5u, KP980785280 * T5v);
Chris@42 1492 T5C = FNMS(KP195090322, T5v, KP980785280 * T5u);
Chris@42 1493 T5f = T5d - T5e;
Chris@42 1494 T5i = T5g - T5h;
Chris@42 1495 T5j = FNMS(KP555570233, T5i, KP831469612 * T5f);
Chris@42 1496 T5n = FMA(KP555570233, T5f, KP831469612 * T5i);
Chris@42 1497 }
Chris@42 1498 {
Chris@42 1499 E T55, T5k, T7H, T7K;
Chris@42 1500 T55 = T4X + T54;
Chris@42 1501 T5k = T5c + T5j;
Chris@42 1502 ci[WS(rs, 12)] = T55 - T5k;
Chris@42 1503 cr[WS(rs, 3)] = T55 + T5k;
Chris@42 1504 T7H = T5n - T5m;
Chris@42 1505 T7K = T7I + T7J;
Chris@42 1506 cr[WS(rs, 19)] = T7H - T7K;
Chris@42 1507 ci[WS(rs, 28)] = T7H + T7K;
Chris@42 1508 }
Chris@42 1509 {
Chris@42 1510 E T7L, T7M, T5l, T5o;
Chris@42 1511 T7L = T5j - T5c;
Chris@42 1512 T7M = T7J - T7I;
Chris@42 1513 cr[WS(rs, 27)] = T7L - T7M;
Chris@42 1514 ci[WS(rs, 20)] = T7L + T7M;
Chris@42 1515 T5l = T4X - T54;
Chris@42 1516 T5o = T5m + T5n;
Chris@42 1517 cr[WS(rs, 11)] = T5l - T5o;
Chris@42 1518 ci[WS(rs, 4)] = T5l + T5o;
Chris@42 1519 }
Chris@42 1520 {
Chris@42 1521 E T5t, T5A, T7x, T7E;
Chris@42 1522 T5t = T5p - T5s;
Chris@42 1523 T5A = T5w + T5z;
Chris@42 1524 ci[WS(rs, 8)] = T5t - T5A;
Chris@42 1525 cr[WS(rs, 7)] = T5t + T5A;
Chris@42 1526 T7x = T5z - T5w;
Chris@42 1527 T7E = T7y + T7D;
Chris@42 1528 cr[WS(rs, 31)] = T7x - T7E;
Chris@42 1529 ci[WS(rs, 16)] = T7x + T7E;
Chris@42 1530 }
Chris@42 1531 {
Chris@42 1532 E T7F, T7G, T5B, T5E;
Chris@42 1533 T7F = T5D - T5C;
Chris@42 1534 T7G = T7D - T7y;
Chris@42 1535 cr[WS(rs, 23)] = T7F - T7G;
Chris@42 1536 ci[WS(rs, 24)] = T7F + T7G;
Chris@42 1537 T5B = T5p + T5s;
Chris@42 1538 T5E = T5C + T5D;
Chris@42 1539 cr[WS(rs, 15)] = T5B - T5E;
Chris@42 1540 ci[0] = T5B + T5E;
Chris@42 1541 }
Chris@42 1542 }
/* Output group 3 (only KP707106781 is used):
 * writes cr[4], cr[12], cr[20], cr[28] and ci[3], ci[11], ci[19], ci[27]. */
Chris@42 1543 {
Chris@42 1544 E T6H, T6T, T7g, T7i, T6M, T6U, T6R, T6V;
Chris@42 1545 {
Chris@42 1546 E T6D, T6G, T7e, T7f;
Chris@42 1547 T6D = Tj - TG;
Chris@42 1548 T6G = T6E - T6F;
Chris@42 1549 T6H = T6D - T6G;
Chris@42 1550 T6T = T6D + T6G;
Chris@42 1551 T7e = T14 - T1r;
Chris@42 1552 T7f = T78 - T73;
Chris@42 1553 T7g = T7e + T7f;
Chris@42 1554 T7i = T7f - T7e;
Chris@42 1555 }
Chris@42 1556 {
Chris@42 1557 E T6I, T6L, T6N, T6Q;
Chris@42 1558 T6I = T1Q - T2d;
Chris@42 1559 T6L = T6J - T6K;
Chris@42 1560 T6M = T6I + T6L;
Chris@42 1561 T6U = T6I - T6L;
Chris@42 1562 T6N = T2B - T2Y;
Chris@42 1563 T6Q = T6O - T6P;
Chris@42 1564 T6R = T6N - T6Q;
Chris@42 1565 T6V = T6N + T6Q;
Chris@42 1566 }
Chris@42 1567 {
Chris@42 1568 E T6S, T7h, T6W, T7d;
Chris@42 1569 T6S = KP707106781 * (T6M + T6R);
Chris@42 1570 ci[WS(rs, 11)] = T6H - T6S;
Chris@42 1571 cr[WS(rs, 4)] = T6H + T6S;
Chris@42 1572 T7h = KP707106781 * (T6V - T6U);
Chris@42 1573 cr[WS(rs, 20)] = T7h - T7i;
Chris@42 1574 ci[WS(rs, 27)] = T7h + T7i;
Chris@42 1575 T6W = KP707106781 * (T6U + T6V);
Chris@42 1576 cr[WS(rs, 12)] = T6T - T6W;
Chris@42 1577 ci[WS(rs, 3)] = T6T + T6W;
Chris@42 1578 T7d = KP707106781 * (T6R - T6M);
Chris@42 1579 cr[WS(rs, 28)] = T7d - T7g;
Chris@42 1580 ci[WS(rs, 19)] = T7d + T7g;
Chris@42 1581 }
Chris@42 1582 }
/* Output group 4: writes cr[2], cr[6], ..., cr[30] (step 4) and
 * ci[1], ci[5], ..., ci[29] (step 4). */
Chris@42 1583 {
Chris@42 1584 E T5J, T7n, T7t, T6n, T5U, T7k, T6x, T6B, T6q, T7s, T66, T6k, T6u, T6A, T6h;
Chris@42 1585 E T6l;
Chris@42 1586 {
Chris@42 1587 E T5O, T5T, T60, T65;
Chris@42 1588 T5J = T5F - T5I;
Chris@42 1589 T7n = T7l + T7m;
Chris@42 1590 T7t = T7m - T7l;
Chris@42 1591 T6n = T5F + T5I;
Chris@42 1592 T5O = T5K + T5N;
Chris@42 1593 T5T = T5P - T5S;
Chris@42 1594 T5U = KP707106781 * (T5O + T5T);
Chris@42 1595 T7k = KP707106781 * (T5O - T5T);
Chris@42 1596 {
Chris@42 1597 E T6v, T6w, T6o, T6p;
Chris@42 1598 T6v = T6e + T6f;
Chris@42 1599 T6w = T67 + T6a;
Chris@42 1600 T6x = FMA(KP382683432, T6v, KP923879532 * T6w);
Chris@42 1601 T6B = FNMS(KP923879532, T6v, KP382683432 * T6w);
Chris@42 1602 T6o = T5K - T5N;
Chris@42 1603 T6p = T5P + T5S;
Chris@42 1604 T6q = KP707106781 * (T6o + T6p);
Chris@42 1605 T7s = KP707106781 * (T6p - T6o);
Chris@42 1606 }
Chris@42 1607 T60 = T5Y - T5Z;
Chris@42 1608 T65 = T61 - T64;
Chris@42 1609 T66 = FMA(KP382683432, T60, KP923879532 * T65);
Chris@42 1610 T6k = FNMS(KP923879532, T60, KP382683432 * T65);
Chris@42 1611 {
Chris@42 1612 E T6s, T6t, T6b, T6g;
Chris@42 1613 T6s = T61 + T64;
Chris@42 1614 T6t = T5Y + T5Z;
Chris@42 1615 T6u = FNMS(KP382683432, T6t, KP923879532 * T6s);
Chris@42 1616 T6A = FMA(KP923879532, T6t, KP382683432 * T6s);
Chris@42 1617 T6b = T67 - T6a;
Chris@42 1618 T6g = T6e - T6f;
Chris@42 1619 T6h = FNMS(KP382683432, T6g, KP923879532 * T6b);
Chris@42 1620 T6l = FMA(KP923879532, T6g, KP382683432 * T6b);
Chris@42 1621 }
Chris@42 1622 }
Chris@42 1623 {
Chris@42 1624 E T5V, T6i, T7r, T7u;
Chris@42 1625 T5V = T5J + T5U;
Chris@42 1626 T6i = T66 + T6h;
Chris@42 1627 ci[WS(rs, 13)] = T5V - T6i;
Chris@42 1628 cr[WS(rs, 2)] = T5V + T6i;
Chris@42 1629 T7r = T6l - T6k;
Chris@42 1630 T7u = T7s + T7t;
Chris@42 1631 cr[WS(rs, 18)] = T7r - T7u;
Chris@42 1632 ci[WS(rs, 29)] = T7r + T7u;
Chris@42 1633 }
Chris@42 1634 {
Chris@42 1635 E T7v, T7w, T6j, T6m;
Chris@42 1636 T7v = T6h - T66;
Chris@42 1637 T7w = T7t - T7s;
Chris@42 1638 cr[WS(rs, 26)] = T7v - T7w;
Chris@42 1639 ci[WS(rs, 21)] = T7v + T7w;
Chris@42 1640 T6j = T5J - T5U;
Chris@42 1641 T6m = T6k + T6l;
Chris@42 1642 cr[WS(rs, 10)] = T6j - T6m;
Chris@42 1643 ci[WS(rs, 5)] = T6j + T6m;
Chris@42 1644 }
Chris@42 1645 {
Chris@42 1646 E T6r, T6y, T7j, T7o;
Chris@42 1647 T6r = T6n + T6q;
Chris@42 1648 T6y = T6u + T6x;
Chris@42 1649 cr[WS(rs, 14)] = T6r - T6y;
Chris@42 1650 ci[WS(rs, 1)] = T6r + T6y;
Chris@42 1651 T7j = T6B - T6A;
Chris@42 1652 T7o = T7k + T7n;
Chris@42 1653 cr[WS(rs, 30)] = T7j - T7o;
Chris@42 1654 ci[WS(rs, 17)] = T7j + T7o;
Chris@42 1655 }
Chris@42 1656 {
Chris@42 1657 E T7p, T7q, T6z, T6C;
Chris@42 1658 T7p = T6x - T6u;
Chris@42 1659 T7q = T7n - T7k;
Chris@42 1660 cr[WS(rs, 22)] = T7p - T7q;
Chris@42 1661 ci[WS(rs, 25)] = T7p + T7q;
Chris@42 1662 T6z = T6n - T6q;
Chris@42 1663 T6C = T6A + T6B;
Chris@42 1664 ci[WS(rs, 9)] = T6z - T6C;
Chris@42 1665 cr[WS(rs, 6)] = T6z + T6C;
Chris@42 1666 }
Chris@42 1667 }
/* Output group 5: writes cr[1], cr[5], ..., cr[29] (step 4) and
 * ci[2], ci[6], ..., ci[30] (step 4). */
Chris@42 1668 {
Chris@42 1669 E T3h, T4D, T7R, T7X, T3E, T7O, T4N, T4R, T46, T4A, T4G, T7W, T4K, T4Q, T4x;
Chris@42 1670 E T4B, T3g, T7P;
Chris@42 1671 T3g = KP707106781 * (T3a + T3f);
Chris@42 1672 T3h = T35 - T3g;
Chris@42 1673 T4D = T35 + T3g;
Chris@42 1674 T7P = KP707106781 * (T4V - T4U);
Chris@42 1675 T7R = T7P + T7Q;
Chris@42 1676 T7X = T7Q - T7P;
Chris@42 1677 {
Chris@42 1678 E T3s, T3D, T4L, T4M;
Chris@42 1679 T3s = FNMS(KP923879532, T3r, KP382683432 * T3m);
Chris@42 1680 T3D = FMA(KP923879532, T3x, KP382683432 * T3C);
Chris@42 1681 T3E = T3s + T3D;
Chris@42 1682 T7O = T3D - T3s;
Chris@42 1683 T4L = T4s + T4v;
Chris@42 1684 T4M = T4b + T4m;
Chris@42 1685 T4N = FNMS(KP195090322, T4M, KP980785280 * T4L);
Chris@42 1686 T4R = FMA(KP980785280, T4M, KP195090322 * T4L);
Chris@42 1687 }
Chris@42 1688 {
Chris@42 1689 E T3W, T45, T4E, T4F;
Chris@42 1690 T3W = T3K - T3V;
Chris@42 1691 T45 = T41 - T44;
Chris@42 1692 T46 = FNMS(KP555570233, T45, KP831469612 * T3W);
Chris@42 1693 T4A = FMA(KP831469612, T45, KP555570233 * T3W);
Chris@42 1694 T4E = FMA(KP382683432, T3r, KP923879532 * T3m);
Chris@42 1695 T4F = FNMS(KP382683432, T3x, KP923879532 * T3C);
Chris@42 1696 T4G = T4E + T4F;
Chris@42 1697 T7W = T4E - T4F;
Chris@42 1698 }
Chris@42 1699 {
Chris@42 1700 E T4I, T4J, T4n, T4w;
Chris@42 1701 T4I = T41 + T44;
Chris@42 1702 T4J = T3K + T3V;
Chris@42 1703 T4K = FMA(KP195090322, T4I, KP980785280 * T4J);
Chris@42 1704 T4Q = FNMS(KP980785280, T4I, KP195090322 * T4J);
Chris@42 1705 T4n = T4b - T4m;
Chris@42 1706 T4w = T4s - T4v;
Chris@42 1707 T4x = FMA(KP555570233, T4n, KP831469612 * T4w);
Chris@42 1708 T4B = FNMS(KP831469612, T4n, KP555570233 * T4w);
Chris@42 1709 }
Chris@42 1710 {
Chris@42 1711 E T3F, T4y, T7V, T7Y;
Chris@42 1712 T3F = T3h + T3E;
Chris@42 1713 T4y = T46 + T4x;
Chris@42 1714 cr[WS(rs, 13)] = T3F - T4y;
Chris@42 1715 ci[WS(rs, 2)] = T3F + T4y;
Chris@42 1716 T7V = T4B - T4A;
Chris@42 1717 T7Y = T7W + T7X;
Chris@42 1718 cr[WS(rs, 29)] = T7V - T7Y;
Chris@42 1719 ci[WS(rs, 18)] = T7V + T7Y;
Chris@42 1720 }
Chris@42 1721 {
Chris@42 1722 E T7Z, T80, T4z, T4C;
Chris@42 1723 T7Z = T4x - T46;
Chris@42 1724 T80 = T7X - T7W;
Chris@42 1725 cr[WS(rs, 21)] = T7Z - T80;
Chris@42 1726 ci[WS(rs, 26)] = T7Z + T80;
Chris@42 1727 T4z = T3h - T3E;
Chris@42 1728 T4C = T4A + T4B;
Chris@42 1729 ci[WS(rs, 10)] = T4z - T4C;
Chris@42 1730 cr[WS(rs, 5)] = T4z + T4C;
Chris@42 1731 }
Chris@42 1732 {
Chris@42 1733 E T4H, T4O, T7N, T7S;
Chris@42 1734 T4H = T4D + T4G;
Chris@42 1735 T4O = T4K + T4N;
Chris@42 1736 ci[WS(rs, 14)] = T4H - T4O;
Chris@42 1737 cr[WS(rs, 1)] = T4H + T4O;
Chris@42 1738 T7N = T4R - T4Q;
Chris@42 1739 T7S = T7O + T7R;
Chris@42 1740 cr[WS(rs, 17)] = T7N - T7S;
Chris@42 1741 ci[WS(rs, 30)] = T7N + T7S;
Chris@42 1742 }
Chris@42 1743 {
Chris@42 1744 E T7T, T7U, T4P, T4S;
Chris@42 1745 T7T = T4N - T4K;
Chris@42 1746 T7U = T7R - T7O;
Chris@42 1747 cr[WS(rs, 25)] = T7T - T7U;
Chris@42 1748 ci[WS(rs, 22)] = T7T + T7U;
Chris@42 1749 T4P = T4D - T4G;
Chris@42 1750 T4S = T4Q + T4R;
Chris@42 1751 cr[WS(rs, 9)] = T4P - T4S;
Chris@42 1752 ci[WS(rs, 6)] = T4P + T4S;
Chris@42 1753 }
Chris@42 1754 }
Chris@42 1755 }
Chris@42 1756 }
Chris@42 1757 }
Chris@42 1758
/*
 * Twiddle-instruction table for the non-FMA build; it is the third
 * initializer of the hf_32 descriptor `desc` defined right after it.
 * NOTE(review): tw_instr, TW_FULL and TW_NEXT are declared outside this
 * chunk.  The 32 in the first entry equals the codelet size stored in
 * `desc`; confirm the exact field meanings against the FFTW headers.
 */
Chris@42 1759 static const tw_instr twinstr[] = {
Chris@42 1760 {TW_FULL, 1, 32},
Chris@42 1761 {TW_NEXT, 1, 0}
Chris@42 1762 };
Chris@42 1763
/*
 * Descriptor for the non-FMA build of hf_32: size 32, the name string
 * "hf_32", the twinstr table, &GENUS, and the counts {340, 114, 94, 0}.
 * The first three counts repeat the "340 additions, 114 multiplications,
 * 94 fused multiply/add" tally in this variant's header comment.
 * NOTE(review): hc2hc_desc and GENUS are declared outside this chunk; the
 * meaning of the trailing 0 is not visible here.
 */
Chris@42 1764 static const hc2hc_desc desc = { 32, "hf_32", twinstr, &GENUS, {340, 114, 94, 0} };
Chris@42 1765
/*
 * Registration entry point (non-FMA build): hands the hf_32 kernel and its
 * descriptor `desc` to X(khc2hc_register) together with the planner p.
 * NOTE(review): X() and planner are defined outside this chunk; X()
 * presumably decorates the symbol name -- confirm.
 */
Chris@42 1766 void X(codelet_hf_32) (planner *p) {
Chris@42 1767 X(khc2hc_register) (p, hf_32, &desc);
Chris@42 1768 }
Chris@42 1769 #endif /* HAVE_FMA */