annotate src/fftw-3.3.3/dft/scalar/codelets/t1_32.c @ 83:ae30d91d2ffe

Replace these with versions built using an older toolset (so as to avoid ABI incompatibilities when linking on Ubuntu 14.04 for packaging purposes)
author Chris Cannam
date Fri, 07 Feb 2020 11:51:13 +0000
parents 37bf6b4a2645
children
rev   line source
Chris@10 1 /*
Chris@10 2 * Copyright (c) 2003, 2007-11 Matteo Frigo
Chris@10 3 * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
Chris@10 4 *
Chris@10 5 * This program is free software; you can redistribute it and/or modify
Chris@10 6 * it under the terms of the GNU General Public License as published by
Chris@10 7 * the Free Software Foundation; either version 2 of the License, or
Chris@10 8 * (at your option) any later version.
Chris@10 9 *
Chris@10 10 * This program is distributed in the hope that it will be useful,
Chris@10 11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
Chris@10 12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
Chris@10 13 * GNU General Public License for more details.
Chris@10 14 *
Chris@10 15 * You should have received a copy of the GNU General Public License
Chris@10 16 * along with this program; if not, write to the Free Software
Chris@10 17 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
Chris@10 18 *
Chris@10 19 */
Chris@10 20
Chris@10 21 /* This file was automatically generated --- DO NOT EDIT */
Chris@10 22 /* Generated on Sun Nov 25 07:35:51 EST 2012 */
Chris@10 23
Chris@10 24 #include "codelet-dft.h"
Chris@10 25
Chris@10 26 #ifdef HAVE_FMA
Chris@10 27
Chris@10 28 /* Generated by: ../../../genfft/gen_twiddle.native -fma -reorder-insns -schedule-for-pipeline -compact -variables 4 -pipeline-latency 4 -n 32 -name t1_32 -include t.h */
Chris@10 29
Chris@10 30 /*
Chris@10 31 * This function contains 434 FP additions, 260 FP multiplications,
Chris@10 32 * (or, 236 additions, 62 multiplications, 198 fused multiply/add),
Chris@10 33 * 135 stack variables, 7 constants, and 128 memory accesses
Chris@10 34 */
Chris@10 35 #include "t.h"
Chris@10 36
/*
 * t1_32 (HAVE_FMA variant): in-place 32-point twiddle codelet emitted by
 * gen_twiddle (see the "Generated by" line above).
 *
 * For every m in [mb, me) one loop iteration
 *   1. reads the 32 complex inputs (ri[WS(rs, k)], ii[WS(rs, k)]), k = 0..31;
 *   2. leaves element 0 untouched and combines each element k >= 1 with the
 *      twiddle pair (W[2k-2], W[2k-1]) as
 *          re = W[2k-2] * ri + W[2k-1] * ii
 *          im = W[2k-2] * ii - W[2k-1] * ri
 *      (assuming FMA(a, b, c) = a * b + c and FNMS(a, b, c) = c - a * b; both
 *      macros are defined outside this file -- confirm);
 *   3. runs the generated add / multiply-add network on those values;
 *   4. stores the 32 results back to the same ri / ii locations.
 *
 * Parameters:
 *   ri, ii - real and imaginary data; read and overwritten, each advanced by
 *            ms after every iteration
 *   W      - twiddle table; offset by mb * 62 on entry and advanced by 62
 *            entries per iteration (W[0] .. W[61] are read each time)
 *   rs     - stride handed to WS() to address the 32 elements of a transform
 *   mb, me - half-open range of iterations to execute
 *   ms     - pointer increment applied to ri and ii between iterations
 *
 * The DK constants are numerically: KP707106781 = sqrt(1/2),
 * KP414213562 = tan(pi/8), KP923879532 = cos(pi/8), KP198912367 = tan(pi/16),
 * KP668178637 = tan(3 pi/16), KP831469612 = cos(3 pi/16),
 * KP980785280 = cos(pi/16).
 *
 * NOTE(review): MAKE_VOLATILE_STRIDE(64, rs) in the loop header is an opaque
 * project macro; its effect cannot be determined from this file.
 *
 * The statement order below is the generator's schedule (-reorder-insns
 * -schedule-for-pipeline); loads, arithmetic and stores are deliberately
 * interleaved.
 */
Chris@10 37 static void t1_32(R *ri, R *ii, const R *W, stride rs, INT mb, INT me, INT ms)
Chris@10 38 {
Chris@10 39 DK(KP980785280, +0.980785280403230449126182236134239036973933731);
Chris@10 40 DK(KP831469612, +0.831469612302545237078788377617905756738560812);
Chris@10 41 DK(KP668178637, +0.668178637919298919997757686523080761552472251);
Chris@10 42 DK(KP198912367, +0.198912367379658006911597622644676228597850501);
Chris@10 43 DK(KP923879532, +0.923879532511286756128183189396788286822416626);
Chris@10 44 DK(KP414213562, +0.414213562373095048801688724209698078569671875);
Chris@10 45 DK(KP707106781, +0.707106781186547524400844362104849039284835938);
Chris@10 46 {
Chris@10 47 INT m;
Chris@10 48 for (m = mb, W = W + (mb * 62); m < me; m = m + 1, ri = ri + ms, ii = ii + ms, W = W + 62, MAKE_VOLATILE_STRIDE(64, rs)) {
/* T90 and T8Z are declared at loop-body scope because they are assigned deep
 * inside the nested blocks but consumed by the last two stores of the
 * iteration, after those blocks have closed. */
Chris@10 49 E T90, T8Z;
Chris@10 50 {
Chris@10 51 E T8x, T87, T8, T3w, T83, T3B, T8y, Tl, T6F, Tz, T3J, T5T, T6G, TM, T3Q;
Chris@10 52 E T5U, T46, T5Y, T7D, T6L, T5X, T3Z, T6M, T1f, T7E, T6R, T60, T4e, T6O, T1G;
Chris@10 53 E T61, T4l, T78, T7N, T54, T6f, T32, T7b, T6c, T5r, T6X, T7I, T4v, T68, T29;
Chris@10 54 E T70, T65, T4S, T5s, T5b, T7O, T7e, T79, T3t, T5t, T5i, T4H, T2y, T4A, T71;
Chris@10 55 E T2m, T4B, T4F, T2s;
Chris@10 56 {
Chris@10 57 E T44, T1d, T3X, T6J, T11, T40, T42, T17, T5h, T5c;
Chris@10 58 {
Chris@10 59 E Ta, Td, Tg, T3x, Tb, Tj, Tf, Tc, Ti;
Chris@10 60 {
Chris@10 61 E T1, T86, T3, T6, T2, T5;
/* Element 0 is used as loaded; element 16 is combined with W[30], W[31]. */
Chris@10 62 T1 = ri[0];
Chris@10 63 T86 = ii[0];
Chris@10 64 T3 = ri[WS(rs, 16)];
Chris@10 65 T6 = ii[WS(rs, 16)];
Chris@10 66 T2 = W[30];
Chris@10 67 T5 = W[31];
Chris@10 68 {
Chris@10 69 E T84, T4, T9, T85, T7;
Chris@10 70 Ta = ri[WS(rs, 8)];
Chris@10 71 Td = ii[WS(rs, 8)];
Chris@10 72 T84 = T2 * T6;
Chris@10 73 T4 = T2 * T3;
Chris@10 74 T9 = W[14];
Chris@10 75 Tg = ri[WS(rs, 24)];
Chris@10 76 T85 = FNMS(T5, T3, T84);
Chris@10 77 T7 = FMA(T5, T6, T4);
Chris@10 78 T3x = T9 * Td;
Chris@10 79 Tb = T9 * Ta;
Chris@10 80 T8x = T86 - T85;
Chris@10 81 T87 = T85 + T86;
Chris@10 82 T8 = T1 + T7;
Chris@10 83 T3w = T1 - T7;
Chris@10 84 Tj = ii[WS(rs, 24)];
Chris@10 85 Tf = W[46];
Chris@10 86 }
Chris@10 87 Tc = W[15];
Chris@10 88 Ti = W[47];
Chris@10 89 }
Chris@10 90 {
Chris@10 91 E Tu, Tx, T3F, Ts, Tw, T3G, Tv;
Chris@10 92 {
Chris@10 93 E To, Tr, Tp, T3E, Tq, Tt;
Chris@10 94 {
Chris@10 95 E T3y, Te, T3A, Tk, T3z, Th, Tn;
Chris@10 96 To = ri[WS(rs, 4)];
Chris@10 97 T3z = Tf * Tj;
Chris@10 98 Th = Tf * Tg;
Chris@10 99 T3y = FNMS(Tc, Ta, T3x);
Chris@10 100 Te = FMA(Tc, Td, Tb);
Chris@10 101 T3A = FNMS(Ti, Tg, T3z);
Chris@10 102 Tk = FMA(Ti, Tj, Th);
Chris@10 103 Tr = ii[WS(rs, 4)];
Chris@10 104 Tn = W[6];
Chris@10 105 T83 = T3y + T3A;
Chris@10 106 T3B = T3y - T3A;
Chris@10 107 T8y = Te - Tk;
Chris@10 108 Tl = Te + Tk;
Chris@10 109 Tp = Tn * To;
Chris@10 110 T3E = Tn * Tr;
Chris@10 111 }
Chris@10 112 Tq = W[7];
Chris@10 113 Tu = ri[WS(rs, 20)];
Chris@10 114 Tx = ii[WS(rs, 20)];
Chris@10 115 Tt = W[38];
Chris@10 116 T3F = FNMS(Tq, To, T3E);
Chris@10 117 Ts = FMA(Tq, Tr, Tp);
Chris@10 118 Tw = W[39];
Chris@10 119 T3G = Tt * Tx;
Chris@10 120 Tv = Tt * Tu;
Chris@10 121 }
Chris@10 122 {
Chris@10 123 E T3M, TF, TH, TK, TG, TJ, TE, TD, TC;
Chris@10 124 {
Chris@10 125 E TB, T3H, Ty, TA, T3I, T3D, T3L;
Chris@10 126 TB = ri[WS(rs, 28)];
Chris@10 127 TE = ii[WS(rs, 28)];
Chris@10 128 T3H = FNMS(Tw, Tu, T3G);
Chris@10 129 Ty = FMA(Tw, Tx, Tv);
Chris@10 130 TA = W[54];
Chris@10 131 TD = W[55];
Chris@10 132 T6F = T3F + T3H;
Chris@10 133 T3I = T3F - T3H;
Chris@10 134 Tz = Ts + Ty;
Chris@10 135 T3D = Ts - Ty;
Chris@10 136 T3L = TA * TE;
Chris@10 137 TC = TA * TB;
Chris@10 138 T3J = T3D + T3I;
Chris@10 139 T5T = T3I - T3D;
Chris@10 140 T3M = FNMS(TD, TB, T3L);
Chris@10 141 }
Chris@10 142 TF = FMA(TD, TE, TC);
Chris@10 143 TH = ri[WS(rs, 12)];
Chris@10 144 TK = ii[WS(rs, 12)];
Chris@10 145 TG = W[22];
Chris@10 146 TJ = W[23];
Chris@10 147 {
Chris@10 148 E TU, T3U, T13, T16, T3W, T10, T12, T15, T41, T14;
Chris@10 149 {
Chris@10 150 E T19, T1c, T18, T1b, T3P, T3K;
Chris@10 151 {
Chris@10 152 E TQ, TT, T3N, TI, TP, TS;
Chris@10 153 TQ = ri[WS(rs, 2)];
Chris@10 154 TT = ii[WS(rs, 2)];
Chris@10 155 T3N = TG * TK;
Chris@10 156 TI = TG * TH;
Chris@10 157 TP = W[2];
Chris@10 158 TS = W[3];
Chris@10 159 {
Chris@10 160 E T3O, TL, T3T, TR;
Chris@10 161 T3O = FNMS(TJ, TH, T3N);
Chris@10 162 TL = FMA(TJ, TK, TI);
Chris@10 163 T3T = TP * TT;
Chris@10 164 TR = TP * TQ;
Chris@10 165 T6G = T3M + T3O;
Chris@10 166 T3P = T3M - T3O;
Chris@10 167 TM = TF + TL;
Chris@10 168 T3K = TF - TL;
Chris@10 169 TU = FMA(TS, TT, TR);
Chris@10 170 T3U = FNMS(TS, TQ, T3T);
Chris@10 171 }
Chris@10 172 }
Chris@10 173 T3Q = T3K - T3P;
Chris@10 174 T5U = T3K + T3P;
Chris@10 175 T19 = ri[WS(rs, 26)];
Chris@10 176 T1c = ii[WS(rs, 26)];
Chris@10 177 T18 = W[50];
Chris@10 178 T1b = W[51];
Chris@10 179 {
Chris@10 180 E TW, TZ, TY, T3V, TX, T43, T1a, TV;
Chris@10 181 TW = ri[WS(rs, 18)];
Chris@10 182 TZ = ii[WS(rs, 18)];
Chris@10 183 T43 = T18 * T1c;
Chris@10 184 T1a = T18 * T19;
Chris@10 185 TV = W[34];
Chris@10 186 TY = W[35];
Chris@10 187 T44 = FNMS(T1b, T19, T43);
Chris@10 188 T1d = FMA(T1b, T1c, T1a);
Chris@10 189 T3V = TV * TZ;
Chris@10 190 TX = TV * TW;
Chris@10 191 T13 = ri[WS(rs, 10)];
Chris@10 192 T16 = ii[WS(rs, 10)];
Chris@10 193 T3W = FNMS(TY, TW, T3V);
Chris@10 194 T10 = FMA(TY, TZ, TX);
Chris@10 195 T12 = W[18];
Chris@10 196 T15 = W[19];
Chris@10 197 }
Chris@10 198 }
Chris@10 199 T3X = T3U - T3W;
Chris@10 200 T6J = T3U + T3W;
Chris@10 201 T11 = TU + T10;
Chris@10 202 T40 = TU - T10;
Chris@10 203 T41 = T12 * T16;
Chris@10 204 T14 = T12 * T13;
Chris@10 205 T42 = FNMS(T15, T13, T41);
Chris@10 206 T17 = FMA(T15, T16, T14);
Chris@10 207 }
Chris@10 208 }
Chris@10 209 }
Chris@10 210 }
Chris@10 211 {
Chris@10 212 E T49, T1l, T4j, T1E, T1u, T1x, T1w, T4b, T1r, T4g, T1v;
Chris@10 213 {
Chris@10 214 E T1A, T1D, T1C, T4i, T1B;
Chris@10 215 {
Chris@10 216 E T1h, T1k, T1g, T1j, T48, T1i, T1z;
Chris@10 217 T1h = ri[WS(rs, 30)];
Chris@10 218 T1k = ii[WS(rs, 30)];
Chris@10 219 {
Chris@10 220 E T6K, T45, T1e, T3Y;
Chris@10 221 T6K = T42 + T44;
Chris@10 222 T45 = T42 - T44;
Chris@10 223 T1e = T17 + T1d;
Chris@10 224 T3Y = T17 - T1d;
Chris@10 225 T46 = T40 + T45;
Chris@10 226 T5Y = T40 - T45;
Chris@10 227 T7D = T6J + T6K;
Chris@10 228 T6L = T6J - T6K;
Chris@10 229 T5X = T3X + T3Y;
Chris@10 230 T3Z = T3X - T3Y;
Chris@10 231 T6M = T11 - T1e;
Chris@10 232 T1f = T11 + T1e;
Chris@10 233 T1g = W[58];
Chris@10 234 }
Chris@10 235 T1j = W[59];
Chris@10 236 T1A = ri[WS(rs, 22)];
Chris@10 237 T1D = ii[WS(rs, 22)];
Chris@10 238 T48 = T1g * T1k;
Chris@10 239 T1i = T1g * T1h;
Chris@10 240 T1z = W[42];
Chris@10 241 T1C = W[43];
Chris@10 242 T49 = FNMS(T1j, T1h, T48);
Chris@10 243 T1l = FMA(T1j, T1k, T1i);
Chris@10 244 T4i = T1z * T1D;
Chris@10 245 T1B = T1z * T1A;
Chris@10 246 }
Chris@10 247 {
Chris@10 248 E T1n, T1q, T1m, T1p, T4a, T1o, T1t;
Chris@10 249 T1n = ri[WS(rs, 14)];
Chris@10 250 T1q = ii[WS(rs, 14)];
Chris@10 251 T4j = FNMS(T1C, T1A, T4i);
Chris@10 252 T1E = FMA(T1C, T1D, T1B);
Chris@10 253 T1m = W[26];
Chris@10 254 T1p = W[27];
Chris@10 255 T1u = ri[WS(rs, 6)];
Chris@10 256 T1x = ii[WS(rs, 6)];
Chris@10 257 T4a = T1m * T1q;
Chris@10 258 T1o = T1m * T1n;
Chris@10 259 T1t = W[10];
Chris@10 260 T1w = W[11];
Chris@10 261 T4b = FNMS(T1p, T1n, T4a);
Chris@10 262 T1r = FMA(T1p, T1q, T1o);
Chris@10 263 T4g = T1t * T1x;
Chris@10 264 T1v = T1t * T1u;
Chris@10 265 }
Chris@10 266 }
Chris@10 267 {
Chris@10 268 E T4c, T6P, T1s, T4f, T4h, T1y;
Chris@10 269 T4c = T49 - T4b;
Chris@10 270 T6P = T49 + T4b;
Chris@10 271 T1s = T1l + T1r;
Chris@10 272 T4f = T1l - T1r;
Chris@10 273 T4h = FNMS(T1w, T1u, T4g);
Chris@10 274 T1y = FMA(T1w, T1x, T1v);
Chris@10 275 {
Chris@10 276 E T4k, T6Q, T4d, T1F;
Chris@10 277 T4k = T4h - T4j;
Chris@10 278 T6Q = T4h + T4j;
Chris@10 279 T4d = T1y - T1E;
Chris@10 280 T1F = T1y + T1E;
Chris@10 281 T7E = T6P + T6Q;
Chris@10 282 T6R = T6P - T6Q;
Chris@10 283 T60 = T4c + T4d;
Chris@10 284 T4e = T4c - T4d;
Chris@10 285 T6O = T1s - T1F;
Chris@10 286 T1G = T1s + T1F;
Chris@10 287 T61 = T4f - T4k;
Chris@10 288 T4l = T4f + T4k;
Chris@10 289 }
Chris@10 290 }
Chris@10 291 }
Chris@10 292 {
Chris@10 293 E T4Z, T2H, T5p, T30, T2Q, T2T, T2S, T51, T2N, T5m, T2R;
Chris@10 294 {
Chris@10 295 E T2W, T2Z, T2Y, T5o, T2X;
Chris@10 296 {
Chris@10 297 E T2D, T2G, T2C, T2F, T4Y, T2E, T2V;
Chris@10 298 T2D = ri[WS(rs, 31)];
Chris@10 299 T2G = ii[WS(rs, 31)];
Chris@10 300 T2C = W[60];
Chris@10 301 T2F = W[61];
Chris@10 302 T2W = ri[WS(rs, 23)];
Chris@10 303 T2Z = ii[WS(rs, 23)];
Chris@10 304 T4Y = T2C * T2G;
Chris@10 305 T2E = T2C * T2D;
Chris@10 306 T2V = W[44];
Chris@10 307 T2Y = W[45];
Chris@10 308 T4Z = FNMS(T2F, T2D, T4Y);
Chris@10 309 T2H = FMA(T2F, T2G, T2E);
Chris@10 310 T5o = T2V * T2Z;
Chris@10 311 T2X = T2V * T2W;
Chris@10 312 }
Chris@10 313 {
Chris@10 314 E T2J, T2M, T2I, T2L, T50, T2K, T2P;
Chris@10 315 T2J = ri[WS(rs, 15)];
Chris@10 316 T2M = ii[WS(rs, 15)];
Chris@10 317 T5p = FNMS(T2Y, T2W, T5o);
Chris@10 318 T30 = FMA(T2Y, T2Z, T2X);
Chris@10 319 T2I = W[28];
Chris@10 320 T2L = W[29];
Chris@10 321 T2Q = ri[WS(rs, 7)];
Chris@10 322 T2T = ii[WS(rs, 7)];
Chris@10 323 T50 = T2I * T2M;
Chris@10 324 T2K = T2I * T2J;
Chris@10 325 T2P = W[12];
Chris@10 326 T2S = W[13];
Chris@10 327 T51 = FNMS(T2L, T2J, T50);
Chris@10 328 T2N = FMA(T2L, T2M, T2K);
Chris@10 329 T5m = T2P * T2T;
Chris@10 330 T2R = T2P * T2Q;
Chris@10 331 }
Chris@10 332 }
Chris@10 333 {
Chris@10 334 E T52, T76, T2O, T5l, T5n, T2U;
Chris@10 335 T52 = T4Z - T51;
Chris@10 336 T76 = T4Z + T51;
Chris@10 337 T2O = T2H + T2N;
Chris@10 338 T5l = T2H - T2N;
Chris@10 339 T5n = FNMS(T2S, T2Q, T5m);
Chris@10 340 T2U = FMA(T2S, T2T, T2R);
Chris@10 341 {
Chris@10 342 E T5q, T77, T53, T31;
Chris@10 343 T5q = T5n - T5p;
Chris@10 344 T77 = T5n + T5p;
Chris@10 345 T53 = T2U - T30;
Chris@10 346 T31 = T2U + T30;
Chris@10 347 T78 = T76 - T77;
Chris@10 348 T7N = T76 + T77;
Chris@10 349 T54 = T52 - T53;
Chris@10 350 T6f = T52 + T53;
Chris@10 351 T32 = T2O + T31;
Chris@10 352 T7b = T2O - T31;
Chris@10 353 T6c = T5l - T5q;
Chris@10 354 T5r = T5l + T5q;
Chris@10 355 }
Chris@10 356 }
Chris@10 357 }
Chris@10 358 {
Chris@10 359 E T4q, T1O, T4Q, T27, T1X, T20, T1Z, T4s, T1U, T4N, T1Y;
Chris@10 360 {
Chris@10 361 E T23, T26, T25, T4P, T24;
Chris@10 362 {
Chris@10 363 E T1K, T1N, T1J, T1M, T4p, T1L, T22;
Chris@10 364 T1K = ri[WS(rs, 1)];
Chris@10 365 T1N = ii[WS(rs, 1)];
Chris@10 366 T1J = W[0];
Chris@10 367 T1M = W[1];
Chris@10 368 T23 = ri[WS(rs, 25)];
Chris@10 369 T26 = ii[WS(rs, 25)];
Chris@10 370 T4p = T1J * T1N;
Chris@10 371 T1L = T1J * T1K;
Chris@10 372 T22 = W[48];
Chris@10 373 T25 = W[49];
Chris@10 374 T4q = FNMS(T1M, T1K, T4p);
Chris@10 375 T1O = FMA(T1M, T1N, T1L);
Chris@10 376 T4P = T22 * T26;
Chris@10 377 T24 = T22 * T23;
Chris@10 378 }
Chris@10 379 {
Chris@10 380 E T1Q, T1T, T1P, T1S, T4r, T1R, T1W;
Chris@10 381 T1Q = ri[WS(rs, 17)];
Chris@10 382 T1T = ii[WS(rs, 17)];
Chris@10 383 T4Q = FNMS(T25, T23, T4P);
Chris@10 384 T27 = FMA(T25, T26, T24);
Chris@10 385 T1P = W[32];
Chris@10 386 T1S = W[33];
Chris@10 387 T1X = ri[WS(rs, 9)];
Chris@10 388 T20 = ii[WS(rs, 9)];
Chris@10 389 T4r = T1P * T1T;
Chris@10 390 T1R = T1P * T1Q;
Chris@10 391 T1W = W[16];
Chris@10 392 T1Z = W[17];
Chris@10 393 T4s = FNMS(T1S, T1Q, T4r);
Chris@10 394 T1U = FMA(T1S, T1T, T1R);
Chris@10 395 T4N = T1W * T20;
Chris@10 396 T1Y = T1W * T1X;
Chris@10 397 }
Chris@10 398 }
Chris@10 399 {
Chris@10 400 E T4t, T6V, T1V, T4M, T4O, T21;
Chris@10 401 T4t = T4q - T4s;
Chris@10 402 T6V = T4q + T4s;
Chris@10 403 T1V = T1O + T1U;
Chris@10 404 T4M = T1O - T1U;
Chris@10 405 T4O = FNMS(T1Z, T1X, T4N);
Chris@10 406 T21 = FMA(T1Z, T20, T1Y);
Chris@10 407 {
Chris@10 408 E T4R, T6W, T4u, T28;
Chris@10 409 T4R = T4O - T4Q;
Chris@10 410 T6W = T4O + T4Q;
Chris@10 411 T4u = T21 - T27;
Chris@10 412 T28 = T21 + T27;
Chris@10 413 T6X = T6V - T6W;
Chris@10 414 T7I = T6V + T6W;
Chris@10 415 T4v = T4t - T4u;
Chris@10 416 T68 = T4t + T4u;
Chris@10 417 T29 = T1V + T28;
Chris@10 418 T70 = T1V - T28;
Chris@10 419 T65 = T4M - T4R;
Chris@10 420 T4S = T4M + T4R;
Chris@10 421 }
Chris@10 422 }
Chris@10 423 }
Chris@10 424 {
Chris@10 425 E T56, T38, T5g, T3r, T3h, T3k, T3j, T58, T3e, T5d, T3i;
Chris@10 426 {
Chris@10 427 E T3n, T3q, T3p, T5f, T3o;
Chris@10 428 {
Chris@10 429 E T34, T37, T33, T36, T55, T35, T3m;
Chris@10 430 T34 = ri[WS(rs, 3)];
Chris@10 431 T37 = ii[WS(rs, 3)];
Chris@10 432 T33 = W[4];
Chris@10 433 T36 = W[5];
Chris@10 434 T3n = ri[WS(rs, 11)];
Chris@10 435 T3q = ii[WS(rs, 11)];
Chris@10 436 T55 = T33 * T37;
Chris@10 437 T35 = T33 * T34;
Chris@10 438 T3m = W[20];
Chris@10 439 T3p = W[21];
Chris@10 440 T56 = FNMS(T36, T34, T55);
Chris@10 441 T38 = FMA(T36, T37, T35);
Chris@10 442 T5f = T3m * T3q;
Chris@10 443 T3o = T3m * T3n;
Chris@10 444 }
Chris@10 445 {
Chris@10 446 E T3a, T3d, T39, T3c, T57, T3b, T3g;
Chris@10 447 T3a = ri[WS(rs, 19)];
Chris@10 448 T3d = ii[WS(rs, 19)];
Chris@10 449 T5g = FNMS(T3p, T3n, T5f);
Chris@10 450 T3r = FMA(T3p, T3q, T3o);
Chris@10 451 T39 = W[36];
Chris@10 452 T3c = W[37];
Chris@10 453 T3h = ri[WS(rs, 27)];
Chris@10 454 T3k = ii[WS(rs, 27)];
Chris@10 455 T57 = T39 * T3d;
Chris@10 456 T3b = T39 * T3a;
Chris@10 457 T3g = W[52];
Chris@10 458 T3j = W[53];
Chris@10 459 T58 = FNMS(T3c, T3a, T57);
Chris@10 460 T3e = FMA(T3c, T3d, T3b);
Chris@10 461 T5d = T3g * T3k;
Chris@10 462 T3i = T3g * T3h;
Chris@10 463 }
Chris@10 464 }
Chris@10 465 {
Chris@10 466 E T59, T7c, T3f, T5a, T5e, T3l, T7d, T3s;
Chris@10 467 T59 = T56 - T58;
Chris@10 468 T7c = T56 + T58;
Chris@10 469 T3f = T38 + T3e;
Chris@10 470 T5a = T38 - T3e;
Chris@10 471 T5e = FNMS(T3j, T3h, T5d);
Chris@10 472 T3l = FMA(T3j, T3k, T3i);
Chris@10 473 T5h = T5e - T5g;
Chris@10 474 T7d = T5e + T5g;
Chris@10 475 T3s = T3l + T3r;
Chris@10 476 T5c = T3l - T3r;
Chris@10 477 T5s = T5a + T59;
Chris@10 478 T5b = T59 - T5a;
Chris@10 479 T7O = T7c + T7d;
Chris@10 480 T7e = T7c - T7d;
Chris@10 481 T79 = T3s - T3f;
Chris@10 482 T3t = T3f + T3s;
Chris@10 483 }
Chris@10 484 }
Chris@10 485 {
Chris@10 486 E T4x, T2f, T2o, T2r, T4z, T2l, T2n, T2q, T4E, T2p;
Chris@10 487 {
Chris@10 488 E T2u, T2x, T2t, T2w;
Chris@10 489 {
Chris@10 490 E T2b, T2e, T2d, T4w, T2c, T2a;
Chris@10 491 T2b = ri[WS(rs, 5)];
Chris@10 492 T2e = ii[WS(rs, 5)];
Chris@10 493 T2a = W[8];
Chris@10 494 T5t = T5c - T5h;
Chris@10 495 T5i = T5c + T5h;
Chris@10 496 T2d = W[9];
Chris@10 497 T4w = T2a * T2e;
Chris@10 498 T2c = T2a * T2b;
Chris@10 499 T2u = ri[WS(rs, 13)];
Chris@10 500 T2x = ii[WS(rs, 13)];
Chris@10 501 T4x = FNMS(T2d, T2b, T4w);
Chris@10 502 T2f = FMA(T2d, T2e, T2c);
Chris@10 503 T2t = W[24];
Chris@10 504 T2w = W[25];
Chris@10 505 }
Chris@10 506 {
Chris@10 507 E T2h, T2k, T2j, T4y, T2i, T4G, T2v, T2g;
Chris@10 508 T2h = ri[WS(rs, 21)];
Chris@10 509 T2k = ii[WS(rs, 21)];
Chris@10 510 T4G = T2t * T2x;
Chris@10 511 T2v = T2t * T2u;
Chris@10 512 T2g = W[40];
Chris@10 513 T2j = W[41];
Chris@10 514 T4H = FNMS(T2w, T2u, T4G);
Chris@10 515 T2y = FMA(T2w, T2x, T2v);
Chris@10 516 T4y = T2g * T2k;
Chris@10 517 T2i = T2g * T2h;
Chris@10 518 T2o = ri[WS(rs, 29)];
Chris@10 519 T2r = ii[WS(rs, 29)];
Chris@10 520 T4z = FNMS(T2j, T2h, T4y);
Chris@10 521 T2l = FMA(T2j, T2k, T2i);
Chris@10 522 T2n = W[56];
Chris@10 523 T2q = W[57];
Chris@10 524 }
Chris@10 525 }
Chris@10 526 T4A = T4x - T4z;
Chris@10 527 T71 = T4x + T4z;
Chris@10 528 T2m = T2f + T2l;
Chris@10 529 T4B = T2f - T2l;
Chris@10 530 T4E = T2n * T2r;
Chris@10 531 T2p = T2n * T2o;
Chris@10 532 T4F = FNMS(T2q, T2o, T4E);
Chris@10 533 T2s = FMA(T2q, T2r, T2p);
Chris@10 534 }
Chris@10 535 }
Chris@10 536 {
Chris@10 537 E T4T, T4C, T4J, T4U, T7y, T8q, T8p, T7B;
Chris@10 538 {
Chris@10 539 E T6E, T8j, T73, T6Y, T6H, T8k, T8i, T8h;
Chris@10 540 {
Chris@10 541 E T7C, TO, T80, T7Z, T8e, T89, T8d, T1H, T8b, T3v, T7T, T7L, T7U, T7Q, T2A;
Chris@10 542 E T7K, T7P, T7W, T1I;
Chris@10 543 {
Chris@10 544 E T7X, T7Y, T7J, T82, T88;
Chris@10 545 {
Chris@10 546 E Tm, T4I, T72, T4D, T2z, TN;
Chris@10 547 T6E = T8 - Tl;
Chris@10 548 Tm = T8 + Tl;
Chris@10 549 T4T = T4B + T4A;
Chris@10 550 T4C = T4A - T4B;
Chris@10 551 T4I = T4F - T4H;
Chris@10 552 T72 = T4F + T4H;
Chris@10 553 T4D = T2s - T2y;
Chris@10 554 T2z = T2s + T2y;
Chris@10 555 TN = Tz + TM;
Chris@10 556 T8j = TM - Tz;
Chris@10 557 T73 = T71 - T72;
Chris@10 558 T7J = T71 + T72;
Chris@10 559 T4J = T4D + T4I;
Chris@10 560 T4U = T4D - T4I;
Chris@10 561 T2A = T2m + T2z;
Chris@10 562 T6Y = T2z - T2m;
Chris@10 563 T7C = Tm - TN;
Chris@10 564 TO = Tm + TN;
Chris@10 565 }
Chris@10 566 T7K = T7I - T7J;
Chris@10 567 T7X = T7I + T7J;
Chris@10 568 T7Y = T7N + T7O;
Chris@10 569 T7P = T7N - T7O;
Chris@10 570 T6H = T6F - T6G;
Chris@10 571 T82 = T6F + T6G;
Chris@10 572 T88 = T83 + T87;
Chris@10 573 T8k = T87 - T83;
Chris@10 574 T80 = T7X + T7Y;
Chris@10 575 T7Z = T7X - T7Y;
Chris@10 576 T8e = T88 - T82;
Chris@10 577 T89 = T82 + T88;
Chris@10 578 }
Chris@10 579 {
Chris@10 580 E T7H, T7M, T2B, T3u;
Chris@10 581 T7H = T29 - T2A;
Chris@10 582 T2B = T29 + T2A;
Chris@10 583 T3u = T32 + T3t;
Chris@10 584 T7M = T32 - T3t;
Chris@10 585 T8d = T1G - T1f;
Chris@10 586 T1H = T1f + T1G;
Chris@10 587 T8b = T3u - T2B;
Chris@10 588 T3v = T2B + T3u;
Chris@10 589 T7T = T7K - T7H;
Chris@10 590 T7L = T7H + T7K;
Chris@10 591 T7U = T7M + T7P;
Chris@10 592 T7Q = T7M - T7P;
Chris@10 593 }
Chris@10 594 T7W = TO - T1H;
Chris@10 595 T1I = TO + T1H;
Chris@10 596 {
Chris@10 597 E T7S, T8f, T8g, T7V;
Chris@10 598 {
Chris@10 599 E T7R, T8c, T8a, T7G, T81, T7F;
Chris@10 600 T8i = T7Q - T7L;
Chris@10 601 T7R = T7L + T7Q;
Chris@10 602 T81 = T7D + T7E;
Chris@10 603 T7F = T7D - T7E;
Chris@10 604 ri[0] = T1I + T3v;
Chris@10 605 ri[WS(rs, 16)] = T1I - T3v;
Chris@10 606 ri[WS(rs, 8)] = T7W + T7Z;
Chris@10 607 ri[WS(rs, 24)] = T7W - T7Z;
Chris@10 608 T8c = T89 - T81;
Chris@10 609 T8a = T81 + T89;
Chris@10 610 T7G = T7C + T7F;
Chris@10 611 T7S = T7C - T7F;
Chris@10 612 T8h = T8e - T8d;
Chris@10 613 T8f = T8d + T8e;
Chris@10 614 ii[WS(rs, 24)] = T8c - T8b;
Chris@10 615 ii[WS(rs, 8)] = T8b + T8c;
Chris@10 616 ii[WS(rs, 16)] = T8a - T80;
Chris@10 617 ii[0] = T80 + T8a;
Chris@10 618 ri[WS(rs, 4)] = FMA(KP707106781, T7R, T7G);
Chris@10 619 ri[WS(rs, 20)] = FNMS(KP707106781, T7R, T7G);
Chris@10 620 T8g = T7T + T7U;
Chris@10 621 T7V = T7T - T7U;
Chris@10 622 }
Chris@10 623 ii[WS(rs, 20)] = FNMS(KP707106781, T8g, T8f);
Chris@10 624 ii[WS(rs, 4)] = FMA(KP707106781, T8g, T8f);
Chris@10 625 ri[WS(rs, 12)] = FMA(KP707106781, T7V, T7S);
Chris@10 626 ri[WS(rs, 28)] = FNMS(KP707106781, T7V, T7S);
Chris@10 627 }
Chris@10 628 }
Chris@10 629 {
Chris@10 630 E T7f, T7m, T6I, T7a, T7A, T7w, T8r, T8l, T8m, T6T, T7j, T75, T8s, T7p, T7z;
Chris@10 631 E T7t;
Chris@10 632 {
Chris@10 633 E T7n, T6N, T6S, T7o, T7u, T7v;
Chris@10 634 T7f = T7b - T7e;
Chris@10 635 T7u = T7b + T7e;
Chris@10 636 ii[WS(rs, 28)] = FNMS(KP707106781, T8i, T8h);
Chris@10 637 ii[WS(rs, 12)] = FMA(KP707106781, T8i, T8h);
Chris@10 638 T7m = T6E + T6H;
Chris@10 639 T6I = T6E - T6H;
Chris@10 640 T7v = T78 + T79;
Chris@10 641 T7a = T78 - T79;
Chris@10 642 T7n = T6M + T6L;
Chris@10 643 T6N = T6L - T6M;
Chris@10 644 T7A = FMA(KP414213562, T7u, T7v);
Chris@10 645 T7w = FNMS(KP414213562, T7v, T7u);
Chris@10 646 T8r = T8k - T8j;
Chris@10 647 T8l = T8j + T8k;
Chris@10 648 T6S = T6O + T6R;
Chris@10 649 T7o = T6O - T6R;
Chris@10 650 {
Chris@10 651 E T7s, T7r, T6Z, T74;
Chris@10 652 T7s = T6X + T6Y;
Chris@10 653 T6Z = T6X - T6Y;
Chris@10 654 T74 = T70 - T73;
Chris@10 655 T7r = T70 + T73;
Chris@10 656 T8m = T6N + T6S;
Chris@10 657 T6T = T6N - T6S;
Chris@10 658 T7j = FNMS(KP414213562, T6Z, T74);
Chris@10 659 T75 = FMA(KP414213562, T74, T6Z);
Chris@10 660 T8s = T7o - T7n;
Chris@10 661 T7p = T7n + T7o;
Chris@10 662 T7z = FNMS(KP414213562, T7r, T7s);
Chris@10 663 T7t = FMA(KP414213562, T7s, T7r);
Chris@10 664 }
Chris@10 665 }
Chris@10 666 {
Chris@10 667 E T7i, T6U, T8t, T8v, T7k, T7g;
Chris@10 668 T7i = FNMS(KP707106781, T6T, T6I);
Chris@10 669 T6U = FMA(KP707106781, T6T, T6I);
Chris@10 670 T8t = FMA(KP707106781, T8s, T8r);
Chris@10 671 T8v = FNMS(KP707106781, T8s, T8r);
Chris@10 672 T7k = FMA(KP414213562, T7a, T7f);
Chris@10 673 T7g = FNMS(KP414213562, T7f, T7a);
Chris@10 674 {
Chris@10 675 E T7q, T7x, T8n, T8o;
Chris@10 676 T7y = FNMS(KP707106781, T7p, T7m);
Chris@10 677 T7q = FMA(KP707106781, T7p, T7m);
Chris@10 678 {
Chris@10 679 E T7l, T8u, T8w, T7h;
Chris@10 680 T7l = T7j + T7k;
Chris@10 681 T8u = T7k - T7j;
Chris@10 682 T8w = T75 + T7g;
Chris@10 683 T7h = T75 - T7g;
Chris@10 684 ri[WS(rs, 30)] = FMA(KP923879532, T7l, T7i);
Chris@10 685 ri[WS(rs, 14)] = FNMS(KP923879532, T7l, T7i);
Chris@10 686 ii[WS(rs, 22)] = FNMS(KP923879532, T8u, T8t);
Chris@10 687 ii[WS(rs, 6)] = FMA(KP923879532, T8u, T8t);
Chris@10 688 ii[WS(rs, 30)] = FMA(KP923879532, T8w, T8v);
Chris@10 689 ii[WS(rs, 14)] = FNMS(KP923879532, T8w, T8v);
Chris@10 690 ri[WS(rs, 6)] = FMA(KP923879532, T7h, T6U);
Chris@10 691 ri[WS(rs, 22)] = FNMS(KP923879532, T7h, T6U);
Chris@10 692 T7x = T7t + T7w;
Chris@10 693 T8q = T7w - T7t;
Chris@10 694 }
Chris@10 695 T8p = FNMS(KP707106781, T8m, T8l);
Chris@10 696 T8n = FMA(KP707106781, T8m, T8l);
Chris@10 697 T8o = T7z + T7A;
Chris@10 698 T7B = T7z - T7A;
Chris@10 699 ri[WS(rs, 2)] = FMA(KP923879532, T7x, T7q);
Chris@10 700 ri[WS(rs, 18)] = FNMS(KP923879532, T7x, T7q);
Chris@10 701 ii[WS(rs, 18)] = FNMS(KP923879532, T8o, T8n);
Chris@10 702 ii[WS(rs, 2)] = FMA(KP923879532, T8o, T8n);
Chris@10 703 }
Chris@10 704 }
Chris@10 705 }
Chris@10 706 }
Chris@10 707 {
Chris@10 708 E T5S, T8O, T8N, T5V, T6d, T6g, T66, T69, T8G, T8F;
Chris@10 709 {
Chris@10 710 E T5C, T3S, T8C, T4n, T8H, T8B, T8I, T5F, T5k, T5L, T5u, T4K, T4V;
Chris@10 711 {
Chris@10 712 E T5D, T5E, T8z, T8A, T5j;
Chris@10 713 {
Chris@10 714 E T3C, T3R, T47, T4m;
Chris@10 715 T5S = T3w - T3B;
Chris@10 716 T3C = T3w + T3B;
Chris@10 717 ri[WS(rs, 10)] = FMA(KP923879532, T7B, T7y);
Chris@10 718 ri[WS(rs, 26)] = FNMS(KP923879532, T7B, T7y);
Chris@10 719 ii[WS(rs, 26)] = FNMS(KP923879532, T8q, T8p);
Chris@10 720 ii[WS(rs, 10)] = FMA(KP923879532, T8q, T8p);
Chris@10 721 T3R = T3J + T3Q;
Chris@10 722 T8O = T3Q - T3J;
Chris@10 723 T5D = FMA(KP414213562, T3Z, T46);
Chris@10 724 T47 = FNMS(KP414213562, T46, T3Z);
Chris@10 725 T4m = FMA(KP414213562, T4l, T4e);
Chris@10 726 T5E = FNMS(KP414213562, T4e, T4l);
Chris@10 727 T8N = T8y + T8x;
Chris@10 728 T8z = T8x - T8y;
Chris@10 729 T5C = FMA(KP707106781, T3R, T3C);
Chris@10 730 T3S = FNMS(KP707106781, T3R, T3C);
Chris@10 731 T8C = T47 + T4m;
Chris@10 732 T4n = T47 - T4m;
Chris@10 733 T8A = T5T + T5U;
Chris@10 734 T5V = T5T - T5U;
Chris@10 735 }
Chris@10 736 T6d = T5i - T5b;
Chris@10 737 T5j = T5b + T5i;
Chris@10 738 T8H = FNMS(KP707106781, T8A, T8z);
Chris@10 739 T8B = FMA(KP707106781, T8A, T8z);
Chris@10 740 T8I = T5E - T5D;
Chris@10 741 T5F = T5D + T5E;
Chris@10 742 T5k = FNMS(KP707106781, T5j, T54);
Chris@10 743 T5L = FMA(KP707106781, T5j, T54);
Chris@10 744 T5u = T5s + T5t;
Chris@10 745 T6g = T5s - T5t;
Chris@10 746 T66 = T4J - T4C;
Chris@10 747 T4K = T4C + T4J;
Chris@10 748 T4V = T4T + T4U;
Chris@10 749 T69 = T4T - T4U;
Chris@10 750 }
Chris@10 751 {
Chris@10 752 E T5M, T5Q, T5J, T5P, T8L, T8M;
Chris@10 753 {
Chris@10 754 E T5y, T4o, T5A, T5w, T5z, T4X, T8J, T5K, T5v, T8K, T5B, T5x;
Chris@10 755 T5y = FNMS(KP923879532, T4n, T3S);
Chris@10 756 T4o = FMA(KP923879532, T4n, T3S);
Chris@10 757 T5K = FMA(KP707106781, T5u, T5r);
Chris@10 758 T5v = FNMS(KP707106781, T5u, T5r);
Chris@10 759 {
Chris@10 760 E T5I, T4L, T5H, T4W;
Chris@10 761 T5I = FMA(KP707106781, T4K, T4v);
Chris@10 762 T4L = FNMS(KP707106781, T4K, T4v);
Chris@10 763 T5H = FMA(KP707106781, T4V, T4S);
Chris@10 764 T4W = FNMS(KP707106781, T4V, T4S);
Chris@10 765 T5M = FNMS(KP198912367, T5L, T5K);
Chris@10 766 T5Q = FMA(KP198912367, T5K, T5L);
Chris@10 767 T5A = FMA(KP668178637, T5k, T5v);
Chris@10 768 T5w = FNMS(KP668178637, T5v, T5k);
Chris@10 769 T5J = FMA(KP198912367, T5I, T5H);
Chris@10 770 T5P = FNMS(KP198912367, T5H, T5I);
Chris@10 771 T5z = FNMS(KP668178637, T4L, T4W);
Chris@10 772 T4X = FMA(KP668178637, T4W, T4L);
Chris@10 773 }
Chris@10 774 T8J = FMA(KP923879532, T8I, T8H);
Chris@10 775 T8L = FNMS(KP923879532, T8I, T8H);
Chris@10 776 T8K = T5A - T5z;
Chris@10 777 T5B = T5z + T5A;
Chris@10 778 T8M = T4X + T5w;
Chris@10 779 T5x = T4X - T5w;
Chris@10 780 ii[WS(rs, 21)] = FNMS(KP831469612, T8K, T8J);
Chris@10 781 ii[WS(rs, 5)] = FMA(KP831469612, T8K, T8J);
Chris@10 782 ri[WS(rs, 5)] = FMA(KP831469612, T5x, T4o);
Chris@10 783 ri[WS(rs, 21)] = FNMS(KP831469612, T5x, T4o);
Chris@10 784 ri[WS(rs, 29)] = FMA(KP831469612, T5B, T5y);
Chris@10 785 ri[WS(rs, 13)] = FNMS(KP831469612, T5B, T5y);
Chris@10 786 }
Chris@10 787 {
Chris@10 788 E T5O, T8D, T8E, T5R, T5G, T5N;
Chris@10 789 T5O = FNMS(KP923879532, T5F, T5C);
Chris@10 790 T5G = FMA(KP923879532, T5F, T5C);
Chris@10 791 T5N = T5J + T5M;
Chris@10 792 T8G = T5M - T5J;
Chris@10 793 T8F = FNMS(KP923879532, T8C, T8B);
Chris@10 794 T8D = FMA(KP923879532, T8C, T8B);
Chris@10 795 ii[WS(rs, 29)] = FMA(KP831469612, T8M, T8L);
Chris@10 796 ii[WS(rs, 13)] = FNMS(KP831469612, T8M, T8L);
Chris@10 797 ri[WS(rs, 1)] = FMA(KP980785280, T5N, T5G);
Chris@10 798 ri[WS(rs, 17)] = FNMS(KP980785280, T5N, T5G);
Chris@10 799 T8E = T5P + T5Q;
Chris@10 800 T5R = T5P - T5Q;
Chris@10 801 ii[WS(rs, 17)] = FNMS(KP980785280, T8E, T8D);
Chris@10 802 ii[WS(rs, 1)] = FMA(KP980785280, T8E, T8D);
Chris@10 803 ri[WS(rs, 9)] = FMA(KP980785280, T5R, T5O);
Chris@10 804 ri[WS(rs, 25)] = FNMS(KP980785280, T5R, T5O);
Chris@10 805 }
Chris@10 806 }
Chris@10 807 }
Chris@10 808 {
Chris@10 809 E T6o, T5W, T8W, T63, T8V, T8P, T8Q, T6r, T67, T6u, T6y, T6C, T6m, T6i;
Chris@10 810 {
Chris@10 811 E T6p, T5Z, T62, T6q;
Chris@10 812 T6p = FNMS(KP414213562, T5X, T5Y);
Chris@10 813 T5Z = FMA(KP414213562, T5Y, T5X);
Chris@10 814 ii[WS(rs, 25)] = FNMS(KP980785280, T8G, T8F);
Chris@10 815 ii[WS(rs, 9)] = FMA(KP980785280, T8G, T8F);
Chris@10 816 T6o = FNMS(KP707106781, T5V, T5S);
Chris@10 817 T5W = FMA(KP707106781, T5V, T5S);
Chris@10 818 T62 = FNMS(KP414213562, T61, T60);
Chris@10 819 T6q = FMA(KP414213562, T60, T61);
Chris@10 820 T8W = T5Z + T62;
Chris@10 821 T63 = T5Z - T62;
Chris@10 822 T8V = FNMS(KP707106781, T8O, T8N);
Chris@10 823 T8P = FMA(KP707106781, T8O, T8N);
Chris@10 824 {
Chris@10 825 E T6x, T6e, T6w, T6h;
Chris@10 826 T8Q = T6q - T6p;
Chris@10 827 T6r = T6p + T6q;
Chris@10 828 T6x = FMA(KP707106781, T6d, T6c);
Chris@10 829 T6e = FNMS(KP707106781, T6d, T6c);
Chris@10 830 T6w = FMA(KP707106781, T6g, T6f);
Chris@10 831 T6h = FNMS(KP707106781, T6g, T6f);
Chris@10 832 T67 = FNMS(KP707106781, T66, T65);
Chris@10 833 T6u = FMA(KP707106781, T66, T65);
Chris@10 834 T6y = FNMS(KP198912367, T6x, T6w);
Chris@10 835 T6C = FMA(KP198912367, T6w, T6x);
Chris@10 836 T6m = FMA(KP668178637, T6e, T6h);
Chris@10 837 T6i = FNMS(KP668178637, T6h, T6e);
Chris@10 838 }
Chris@10 839 }
Chris@10 840 {
Chris@10 841 E T6k, T64, T8R, T8T, T6t, T6a;
Chris@10 842 T6k = FNMS(KP923879532, T63, T5W);
Chris@10 843 T64 = FMA(KP923879532, T63, T5W);
Chris@10 844 T8R = FMA(KP923879532, T8Q, T8P);
Chris@10 845 T8T = FNMS(KP923879532, T8Q, T8P);
Chris@10 846 T6t = FMA(KP707106781, T69, T68);
Chris@10 847 T6a = FNMS(KP707106781, T69, T68);
Chris@10 848 {
Chris@10 849 E T6A, T8X, T8Y, T6D;
Chris@10 850 {
Chris@10 851 E T6s, T6B, T6l, T6b, T6z, T6v;
Chris@10 852 T6A = FMA(KP923879532, T6r, T6o);
Chris@10 853 T6s = FNMS(KP923879532, T6r, T6o);
Chris@10 854 T6v = FMA(KP198912367, T6u, T6t);
Chris@10 855 T6B = FNMS(KP198912367, T6t, T6u);
Chris@10 856 T6l = FNMS(KP668178637, T67, T6a);
Chris@10 857 T6b = FMA(KP668178637, T6a, T67);
Chris@10 858 T6z = T6v - T6y;
Chris@10 859 T90 = T6v + T6y;
Chris@10 860 T8Z = FMA(KP923879532, T8W, T8V);
Chris@10 861 T8X = FNMS(KP923879532, T8W, T8V);
Chris@10 862 {
Chris@10 863 E T6n, T8S, T8U, T6j;
Chris@10 864 T6n = T6l - T6m;
Chris@10 865 T8S = T6l + T6m;
Chris@10 866 T8U = T6i - T6b;
Chris@10 867 T6j = T6b + T6i;
Chris@10 868 ri[WS(rs, 7)] = FMA(KP980785280, T6z, T6s);
Chris@10 869 ri[WS(rs, 23)] = FNMS(KP980785280, T6z, T6s);
Chris@10 870 ri[WS(rs, 11)] = FMA(KP831469612, T6n, T6k);
Chris@10 871 ri[WS(rs, 27)] = FNMS(KP831469612, T6n, T6k);
Chris@10 872 ii[WS(rs, 19)] = FNMS(KP831469612, T8S, T8R);
Chris@10 873 ii[WS(rs, 3)] = FMA(KP831469612, T8S, T8R);
Chris@10 874 ii[WS(rs, 27)] = FNMS(KP831469612, T8U, T8T);
Chris@10 875 ii[WS(rs, 11)] = FMA(KP831469612, T8U, T8T);
Chris@10 876 ri[WS(rs, 3)] = FMA(KP831469612, T6j, T64);
Chris@10 877 ri[WS(rs, 19)] = FNMS(KP831469612, T6j, T64);
Chris@10 878 T8Y = T6C - T6B;
Chris@10 879 T6D = T6B + T6C;
Chris@10 880 }
Chris@10 881 }
Chris@10 882 ii[WS(rs, 23)] = FNMS(KP980785280, T8Y, T8X);
Chris@10 883 ii[WS(rs, 7)] = FMA(KP980785280, T8Y, T8X);
Chris@10 884 ri[WS(rs, 31)] = FMA(KP980785280, T6D, T6A);
Chris@10 885 ri[WS(rs, 15)] = FNMS(KP980785280, T6D, T6A);
Chris@10 886 }
Chris@10 887 }
Chris@10 888 }
Chris@10 889 }
Chris@10 890 }
Chris@10 891 }
/* Final two stores of the iteration: they use T90 / T8Z, the only temporaries
 * that survive to loop-body scope. */
Chris@10 892 ii[WS(rs, 31)] = FMA(KP980785280, T90, T8Z);
Chris@10 893 ii[WS(rs, 15)] = FNMS(KP980785280, T90, T8Z);
Chris@10 894 }
Chris@10 895 }
Chris@10 896 }
Chris@10 897
/*
 * Twiddle instruction table referenced by the codelet descriptor below.
 * It holds two entries: {TW_FULL, 0, 32} followed by {TW_NEXT, 1, 0}.
 * NOTE(review): presumably TW_FULL with 32 requests the complete twiddle set
 * for a size-32 step (the kernel consumes 62 = 2 * 31 values of W per
 * iteration) and TW_NEXT ends the list -- confirm against the tw_instr
 * definition, which is not visible in this file.
 */
Chris@10 898 static const tw_instr twinstr[] = {
Chris@10 899 {TW_FULL, 0, 32},
Chris@10 900 {TW_NEXT, 1, 0}
Chris@10 901 };
Chris@10 902
/*
 * Codelet descriptor passed to the planner at registration time: the value 32,
 * the name string "t1_32", the twiddle instruction table, a pointer to GENUS,
 * and the operation counts {236, 62, 198, 0}. The first three counts match the
 * additions / multiplications / fused multiply-adds quoted in the generator
 * comment for this HAVE_FMA variant. The meaning of the three trailing zeros
 * is fixed by the ct_desc layout, which is not visible in this file.
 */
Chris@10 903 static const ct_desc desc = { 32, "t1_32", twinstr, &GENUS, {236, 62, 198, 0}, 0, 0, 0 };
Chris@10 904
/*
 * Registration entry point for this codelet: passes the t1_32 kernel and its
 * descriptor to planner p through X(kdft_dit_register).
 * NOTE(review): X() is a project macro applied to both identifiers; it
 * presumably adds the library's symbol prefix -- confirm in the headers.
 */
Chris@10 905 void X(codelet_t1_32) (planner *p) {
Chris@10 906 X(kdft_dit_register) (p, t1_32, &desc);
Chris@10 907 }
Chris@10 908 #else /* HAVE_FMA */
Chris@10 909
Chris@10 910 /* Generated by: ../../../genfft/gen_twiddle.native -compact -variables 4 -pipeline-latency 4 -n 32 -name t1_32 -include t.h */
Chris@10 911
Chris@10 912 /*
Chris@10 913 * This function contains 434 FP additions, 208 FP multiplications,
Chris@10 914 * (or, 340 additions, 114 multiplications, 94 fused multiply/add),
Chris@10 915 * 96 stack variables, 7 constants, and 128 memory accesses
Chris@10 916 */
Chris@10 917 #include "t.h"
Chris@10 918
/*
 * t1_32 (variant compiled when HAVE_FMA is not defined).
 *
 * For each m in [mb, me) the kernel works in place on 32 value pairs
 * addressed as ri[WS(rs, k)] / ii[WS(rs, k)], k = 0..31 (k = 0 is written
 * ri[0] / ii[0]). WS is defined elsewhere -- presumably it scales the
 * stride rs by k; TODO confirm.
 *
 *   1. Element 0 is read unchanged. Every other element k is combined with
 *      the pair W[2k-2], W[2k-1] as
 *          re' = FMA (W[2k-2], re, W[2k-1] * im)
 *          im' = FNMS(W[2k-1], re, W[2k-2] * im)
 *      (FMA/FNMS come from the project headers; presumably a*b+c and
 *      c-a*b, which would make this a multiply by the conjugate of
 *      W[2k-2] + i*W[2k-1] -- TODO confirm.)
 *   2. The 32 results are combined by sums, differences and the seven DK
 *      constants, and stored back to the same 32 positions.
 *
 * All loads of ri/ii in an iteration happen before the first store.
 *
 * Parameters:
 *   ri, ii  pointers to the two data arrays (read and written); advanced
 *           by ms after every iteration.
 *   W       twiddle table (read only); offset by mb * 62 on entry and
 *           advanced by 62 entries per iteration.
 *   rs      stride handed to WS() to address the 32 elements.
 *   mb, me  half-open range of the loop counter m.
 *   ms      per-iteration increment applied to ri and ii.
 */
Chris@10 919 static void t1_32(R *ri, R *ii, const R *W, stride rs, INT mb, INT me, INT ms)
Chris@10 920 {
/* Constants; numerically sin(pi/16), cos(pi/16), sin(3pi/16), cos(3pi/16),
 * sin(pi/8), cos(pi/8) and sqrt(1/2) respectively. */
Chris@10 921 DK(KP195090322, +0.195090322016128267848284868477022240927691618);
Chris@10 922 DK(KP980785280, +0.980785280403230449126182236134239036973933731);
Chris@10 923 DK(KP555570233, +0.555570233019602224742830813948532874374937191);
Chris@10 924 DK(KP831469612, +0.831469612302545237078788377617905756738560812);
Chris@10 925 DK(KP382683432, +0.382683432365089771728459984030398866761344562);
Chris@10 926 DK(KP923879532, +0.923879532511286756128183189396788286822416626);
Chris@10 927 DK(KP707106781, +0.707106781186547524400844362104849039284835938);
Chris@10 928 {
Chris@10 929 INT m;
/* One pass per m: ri/ii step by ms and W by 62 entries. The purpose of
 * MAKE_VOLATILE_STRIDE is not visible here (project macro). */
Chris@10 930 for (m = mb, W = W + (mb * 62); m < me; m = m + 1, ri = ri + ms, ii = ii + ms, W = W + 62, MAKE_VOLATILE_STRIDE(64, rs)) {
/* Values produced by the load stages below and consumed by the store
 * stages at the end of the loop body. */
Chris@10 931 E Tj, T5F, T7C, T7Q, T35, T4T, T78, T7m, T1Q, T61, T5Y, T6J, T3K, T59, T41;
Chris@10 932 E T56, T2B, T67, T6e, T6O, T4b, T5d, T4s, T5g, TG, T7l, T5I, T73, T3a, T4U;
Chris@10 933 E T3f, T4V, T14, T5N, T5M, T6E, T3m, T4Y, T3r, T4Z, T1r, T5P, T5S, T6F, T3x;
Chris@10 934 E T51, T3C, T52, T2d, T5Z, T64, T6K, T3V, T57, T44, T5a, T2Y, T6f, T6a, T6P;
Chris@10 935 E T4m, T5h, T4v, T5e;
/* Load stage: elements 0, 16, 8, 24 (element 0 is used without a twiddle). */
Chris@10 936 {
Chris@10 937 E T1, T76, T6, T75, Tc, T32, Th, T33;
Chris@10 938 T1 = ri[0];
Chris@10 939 T76 = ii[0];
Chris@10 940 {
Chris@10 941 E T3, T5, T2, T4;
Chris@10 942 T3 = ri[WS(rs, 16)];
Chris@10 943 T5 = ii[WS(rs, 16)];
Chris@10 944 T2 = W[30];
Chris@10 945 T4 = W[31];
Chris@10 946 T6 = FMA(T2, T3, T4 * T5);
Chris@10 947 T75 = FNMS(T4, T3, T2 * T5);
Chris@10 948 }
Chris@10 949 {
Chris@10 950 E T9, Tb, T8, Ta;
Chris@10 951 T9 = ri[WS(rs, 8)];
Chris@10 952 Tb = ii[WS(rs, 8)];
Chris@10 953 T8 = W[14];
Chris@10 954 Ta = W[15];
Chris@10 955 Tc = FMA(T8, T9, Ta * Tb);
Chris@10 956 T32 = FNMS(Ta, T9, T8 * Tb);
Chris@10 957 }
Chris@10 958 {
Chris@10 959 E Te, Tg, Td, Tf;
Chris@10 960 Te = ri[WS(rs, 24)];
Chris@10 961 Tg = ii[WS(rs, 24)];
Chris@10 962 Td = W[46];
Chris@10 963 Tf = W[47];
Chris@10 964 Th = FMA(Td, Te, Tf * Tg);
Chris@10 965 T33 = FNMS(Tf, Te, Td * Tg);
Chris@10 966 }
Chris@10 967 {
Chris@10 968 E T7, Ti, T7A, T7B;
Chris@10 969 T7 = T1 + T6;
Chris@10 970 Ti = Tc + Th;
Chris@10 971 Tj = T7 + Ti;
Chris@10 972 T5F = T7 - Ti;
Chris@10 973 T7A = T76 - T75;
Chris@10 974 T7B = Tc - Th;
Chris@10 975 T7C = T7A - T7B;
Chris@10 976 T7Q = T7B + T7A;
Chris@10 977 }
Chris@10 978 {
Chris@10 979 E T31, T34, T74, T77;
Chris@10 980 T31 = T1 - T6;
Chris@10 981 T34 = T32 - T33;
Chris@10 982 T35 = T31 - T34;
Chris@10 983 T4T = T31 + T34;
Chris@10 984 T74 = T32 + T33;
Chris@10 985 T77 = T75 + T76;
Chris@10 986 T78 = T74 + T77;
Chris@10 987 T7m = T77 - T74;
Chris@10 988 }
Chris@10 989 }
/* Load stage: elements 1, 25, 17, 9. */
Chris@10 990 {
Chris@10 991 E T1y, T3G, T1O, T3Z, T1D, T3H, T1J, T3Y;
Chris@10 992 {
Chris@10 993 E T1v, T1x, T1u, T1w;
Chris@10 994 T1v = ri[WS(rs, 1)];
Chris@10 995 T1x = ii[WS(rs, 1)];
Chris@10 996 T1u = W[0];
Chris@10 997 T1w = W[1];
Chris@10 998 T1y = FMA(T1u, T1v, T1w * T1x);
Chris@10 999 T3G = FNMS(T1w, T1v, T1u * T1x);
Chris@10 1000 }
Chris@10 1001 {
Chris@10 1002 E T1L, T1N, T1K, T1M;
Chris@10 1003 T1L = ri[WS(rs, 25)];
Chris@10 1004 T1N = ii[WS(rs, 25)];
Chris@10 1005 T1K = W[48];
Chris@10 1006 T1M = W[49];
Chris@10 1007 T1O = FMA(T1K, T1L, T1M * T1N);
Chris@10 1008 T3Z = FNMS(T1M, T1L, T1K * T1N);
Chris@10 1009 }
Chris@10 1010 {
Chris@10 1011 E T1A, T1C, T1z, T1B;
Chris@10 1012 T1A = ri[WS(rs, 17)];
Chris@10 1013 T1C = ii[WS(rs, 17)];
Chris@10 1014 T1z = W[32];
Chris@10 1015 T1B = W[33];
Chris@10 1016 T1D = FMA(T1z, T1A, T1B * T1C);
Chris@10 1017 T3H = FNMS(T1B, T1A, T1z * T1C);
Chris@10 1018 }
Chris@10 1019 {
Chris@10 1020 E T1G, T1I, T1F, T1H;
Chris@10 1021 T1G = ri[WS(rs, 9)];
Chris@10 1022 T1I = ii[WS(rs, 9)];
Chris@10 1023 T1F = W[16];
Chris@10 1024 T1H = W[17];
Chris@10 1025 T1J = FMA(T1F, T1G, T1H * T1I);
Chris@10 1026 T3Y = FNMS(T1H, T1G, T1F * T1I);
Chris@10 1027 }
Chris@10 1028 {
Chris@10 1029 E T1E, T1P, T5W, T5X;
Chris@10 1030 T1E = T1y + T1D;
Chris@10 1031 T1P = T1J + T1O;
Chris@10 1032 T1Q = T1E + T1P;
Chris@10 1033 T61 = T1E - T1P;
Chris@10 1034 T5W = T3G + T3H;
Chris@10 1035 T5X = T3Y + T3Z;
Chris@10 1036 T5Y = T5W - T5X;
Chris@10 1037 T6J = T5W + T5X;
Chris@10 1038 }
Chris@10 1039 {
Chris@10 1040 E T3I, T3J, T3X, T40;
Chris@10 1041 T3I = T3G - T3H;
Chris@10 1042 T3J = T1J - T1O;
Chris@10 1043 T3K = T3I + T3J;
Chris@10 1044 T59 = T3I - T3J;
Chris@10 1045 T3X = T1y - T1D;
Chris@10 1046 T40 = T3Y - T3Z;
Chris@10 1047 T41 = T3X - T40;
Chris@10 1048 T56 = T3X + T40;
Chris@10 1049 }
Chris@10 1050 }
/* Load stage: elements 31, 23, 15, 7. */
Chris@10 1051 {
Chris@10 1052 E T2j, T4o, T2z, T49, T2o, T4p, T2u, T48;
Chris@10 1053 {
Chris@10 1054 E T2g, T2i, T2f, T2h;
Chris@10 1055 T2g = ri[WS(rs, 31)];
Chris@10 1056 T2i = ii[WS(rs, 31)];
Chris@10 1057 T2f = W[60];
Chris@10 1058 T2h = W[61];
Chris@10 1059 T2j = FMA(T2f, T2g, T2h * T2i);
Chris@10 1060 T4o = FNMS(T2h, T2g, T2f * T2i);
Chris@10 1061 }
Chris@10 1062 {
Chris@10 1063 E T2w, T2y, T2v, T2x;
Chris@10 1064 T2w = ri[WS(rs, 23)];
Chris@10 1065 T2y = ii[WS(rs, 23)];
Chris@10 1066 T2v = W[44];
Chris@10 1067 T2x = W[45];
Chris@10 1068 T2z = FMA(T2v, T2w, T2x * T2y);
Chris@10 1069 T49 = FNMS(T2x, T2w, T2v * T2y);
Chris@10 1070 }
Chris@10 1071 {
Chris@10 1072 E T2l, T2n, T2k, T2m;
Chris@10 1073 T2l = ri[WS(rs, 15)];
Chris@10 1074 T2n = ii[WS(rs, 15)];
Chris@10 1075 T2k = W[28];
Chris@10 1076 T2m = W[29];
Chris@10 1077 T2o = FMA(T2k, T2l, T2m * T2n);
Chris@10 1078 T4p = FNMS(T2m, T2l, T2k * T2n);
Chris@10 1079 }
Chris@10 1080 {
Chris@10 1081 E T2r, T2t, T2q, T2s;
Chris@10 1082 T2r = ri[WS(rs, 7)];
Chris@10 1083 T2t = ii[WS(rs, 7)];
Chris@10 1084 T2q = W[12];
Chris@10 1085 T2s = W[13];
Chris@10 1086 T2u = FMA(T2q, T2r, T2s * T2t);
Chris@10 1087 T48 = FNMS(T2s, T2r, T2q * T2t);
Chris@10 1088 }
Chris@10 1089 {
Chris@10 1090 E T2p, T2A, T6c, T6d;
Chris@10 1091 T2p = T2j + T2o;
Chris@10 1092 T2A = T2u + T2z;
Chris@10 1093 T2B = T2p + T2A;
Chris@10 1094 T67 = T2p - T2A;
Chris@10 1095 T6c = T4o + T4p;
Chris@10 1096 T6d = T48 + T49;
Chris@10 1097 T6e = T6c - T6d;
Chris@10 1098 T6O = T6c + T6d;
Chris@10 1099 }
Chris@10 1100 {
Chris@10 1101 E T47, T4a, T4q, T4r;
Chris@10 1102 T47 = T2j - T2o;
Chris@10 1103 T4a = T48 - T49;
Chris@10 1104 T4b = T47 - T4a;
Chris@10 1105 T5d = T47 + T4a;
Chris@10 1106 T4q = T4o - T4p;
Chris@10 1107 T4r = T2u - T2z;
Chris@10 1108 T4s = T4q + T4r;
Chris@10 1109 T5g = T4q - T4r;
Chris@10 1110 }
Chris@10 1111 }
/* Load stage: elements 4, 12, 20, 28. */
Chris@10 1112 {
Chris@10 1113 E To, T36, TE, T3d, Tt, T37, Tz, T3c;
Chris@10 1114 {
Chris@10 1115 E Tl, Tn, Tk, Tm;
Chris@10 1116 Tl = ri[WS(rs, 4)];
Chris@10 1117 Tn = ii[WS(rs, 4)];
Chris@10 1118 Tk = W[6];
Chris@10 1119 Tm = W[7];
Chris@10 1120 To = FMA(Tk, Tl, Tm * Tn);
Chris@10 1121 T36 = FNMS(Tm, Tl, Tk * Tn);
Chris@10 1122 }
Chris@10 1123 {
Chris@10 1124 E TB, TD, TA, TC;
Chris@10 1125 TB = ri[WS(rs, 12)];
Chris@10 1126 TD = ii[WS(rs, 12)];
Chris@10 1127 TA = W[22];
Chris@10 1128 TC = W[23];
Chris@10 1129 TE = FMA(TA, TB, TC * TD);
Chris@10 1130 T3d = FNMS(TC, TB, TA * TD);
Chris@10 1131 }
Chris@10 1132 {
Chris@10 1133 E Tq, Ts, Tp, Tr;
Chris@10 1134 Tq = ri[WS(rs, 20)];
Chris@10 1135 Ts = ii[WS(rs, 20)];
Chris@10 1136 Tp = W[38];
Chris@10 1137 Tr = W[39];
Chris@10 1138 Tt = FMA(Tp, Tq, Tr * Ts);
Chris@10 1139 T37 = FNMS(Tr, Tq, Tp * Ts);
Chris@10 1140 }
Chris@10 1141 {
Chris@10 1142 E Tw, Ty, Tv, Tx;
Chris@10 1143 Tw = ri[WS(rs, 28)];
Chris@10 1144 Ty = ii[WS(rs, 28)];
Chris@10 1145 Tv = W[54];
Chris@10 1146 Tx = W[55];
Chris@10 1147 Tz = FMA(Tv, Tw, Tx * Ty);
Chris@10 1148 T3c = FNMS(Tx, Tw, Tv * Ty);
Chris@10 1149 }
Chris@10 1150 {
Chris@10 1151 E Tu, TF, T5G, T5H;
Chris@10 1152 Tu = To + Tt;
Chris@10 1153 TF = Tz + TE;
Chris@10 1154 TG = Tu + TF;
Chris@10 1155 T7l = TF - Tu;
Chris@10 1156 T5G = T36 + T37;
Chris@10 1157 T5H = T3c + T3d;
Chris@10 1158 T5I = T5G - T5H;
Chris@10 1159 T73 = T5G + T5H;
Chris@10 1160 }
Chris@10 1161 {
Chris@10 1162 E T38, T39, T3b, T3e;
Chris@10 1163 T38 = T36 - T37;
Chris@10 1164 T39 = To - Tt;
Chris@10 1165 T3a = T38 - T39;
Chris@10 1166 T4U = T39 + T38;
Chris@10 1167 T3b = Tz - TE;
Chris@10 1168 T3e = T3c - T3d;
Chris@10 1169 T3f = T3b + T3e;
Chris@10 1170 T4V = T3b - T3e;
Chris@10 1171 }
Chris@10 1172 }
/* Load stage: elements 2, 26, 18, 10. */
Chris@10 1173 {
Chris@10 1174 E TM, T3i, T12, T3p, TR, T3j, TX, T3o;
Chris@10 1175 {
Chris@10 1176 E TJ, TL, TI, TK;
Chris@10 1177 TJ = ri[WS(rs, 2)];
Chris@10 1178 TL = ii[WS(rs, 2)];
Chris@10 1179 TI = W[2];
Chris@10 1180 TK = W[3];
Chris@10 1181 TM = FMA(TI, TJ, TK * TL);
Chris@10 1182 T3i = FNMS(TK, TJ, TI * TL);
Chris@10 1183 }
Chris@10 1184 {
Chris@10 1185 E TZ, T11, TY, T10;
Chris@10 1186 TZ = ri[WS(rs, 26)];
Chris@10 1187 T11 = ii[WS(rs, 26)];
Chris@10 1188 TY = W[50];
Chris@10 1189 T10 = W[51];
Chris@10 1190 T12 = FMA(TY, TZ, T10 * T11);
Chris@10 1191 T3p = FNMS(T10, TZ, TY * T11);
Chris@10 1192 }
Chris@10 1193 {
Chris@10 1194 E TO, TQ, TN, TP;
Chris@10 1195 TO = ri[WS(rs, 18)];
Chris@10 1196 TQ = ii[WS(rs, 18)];
Chris@10 1197 TN = W[34];
Chris@10 1198 TP = W[35];
Chris@10 1199 TR = FMA(TN, TO, TP * TQ);
Chris@10 1200 T3j = FNMS(TP, TO, TN * TQ);
Chris@10 1201 }
Chris@10 1202 {
Chris@10 1203 E TU, TW, TT, TV;
Chris@10 1204 TU = ri[WS(rs, 10)];
Chris@10 1205 TW = ii[WS(rs, 10)];
Chris@10 1206 TT = W[18];
Chris@10 1207 TV = W[19];
Chris@10 1208 TX = FMA(TT, TU, TV * TW);
Chris@10 1209 T3o = FNMS(TV, TU, TT * TW);
Chris@10 1210 }
Chris@10 1211 {
Chris@10 1212 E TS, T13, T5K, T5L;
Chris@10 1213 TS = TM + TR;
Chris@10 1214 T13 = TX + T12;
Chris@10 1215 T14 = TS + T13;
Chris@10 1216 T5N = TS - T13;
Chris@10 1217 T5K = T3i + T3j;
Chris@10 1218 T5L = T3o + T3p;
Chris@10 1219 T5M = T5K - T5L;
Chris@10 1220 T6E = T5K + T5L;
Chris@10 1221 }
Chris@10 1222 {
Chris@10 1223 E T3k, T3l, T3n, T3q;
Chris@10 1224 T3k = T3i - T3j;
Chris@10 1225 T3l = TX - T12;
Chris@10 1226 T3m = T3k + T3l;
Chris@10 1227 T4Y = T3k - T3l;
Chris@10 1228 T3n = TM - TR;
Chris@10 1229 T3q = T3o - T3p;
Chris@10 1230 T3r = T3n - T3q;
Chris@10 1231 T4Z = T3n + T3q;
Chris@10 1232 }
Chris@10 1233 }
/* Load stage: elements 30, 22, 14, 6. */
Chris@10 1234 {
Chris@10 1235 E T19, T3t, T1p, T3A, T1e, T3u, T1k, T3z;
Chris@10 1236 {
Chris@10 1237 E T16, T18, T15, T17;
Chris@10 1238 T16 = ri[WS(rs, 30)];
Chris@10 1239 T18 = ii[WS(rs, 30)];
Chris@10 1240 T15 = W[58];
Chris@10 1241 T17 = W[59];
Chris@10 1242 T19 = FMA(T15, T16, T17 * T18);
Chris@10 1243 T3t = FNMS(T17, T16, T15 * T18);
Chris@10 1244 }
Chris@10 1245 {
Chris@10 1246 E T1m, T1o, T1l, T1n;
Chris@10 1247 T1m = ri[WS(rs, 22)];
Chris@10 1248 T1o = ii[WS(rs, 22)];
Chris@10 1249 T1l = W[42];
Chris@10 1250 T1n = W[43];
Chris@10 1251 T1p = FMA(T1l, T1m, T1n * T1o);
Chris@10 1252 T3A = FNMS(T1n, T1m, T1l * T1o);
Chris@10 1253 }
Chris@10 1254 {
Chris@10 1255 E T1b, T1d, T1a, T1c;
Chris@10 1256 T1b = ri[WS(rs, 14)];
Chris@10 1257 T1d = ii[WS(rs, 14)];
Chris@10 1258 T1a = W[26];
Chris@10 1259 T1c = W[27];
Chris@10 1260 T1e = FMA(T1a, T1b, T1c * T1d);
Chris@10 1261 T3u = FNMS(T1c, T1b, T1a * T1d);
Chris@10 1262 }
Chris@10 1263 {
Chris@10 1264 E T1h, T1j, T1g, T1i;
Chris@10 1265 T1h = ri[WS(rs, 6)];
Chris@10 1266 T1j = ii[WS(rs, 6)];
Chris@10 1267 T1g = W[10];
Chris@10 1268 T1i = W[11];
Chris@10 1269 T1k = FMA(T1g, T1h, T1i * T1j);
Chris@10 1270 T3z = FNMS(T1i, T1h, T1g * T1j);
Chris@10 1271 }
Chris@10 1272 {
Chris@10 1273 E T1f, T1q, T5Q, T5R;
Chris@10 1274 T1f = T19 + T1e;
Chris@10 1275 T1q = T1k + T1p;
Chris@10 1276 T1r = T1f + T1q;
Chris@10 1277 T5P = T1f - T1q;
Chris@10 1278 T5Q = T3t + T3u;
Chris@10 1279 T5R = T3z + T3A;
Chris@10 1280 T5S = T5Q - T5R;
Chris@10 1281 T6F = T5Q + T5R;
Chris@10 1282 }
Chris@10 1283 {
Chris@10 1284 E T3v, T3w, T3y, T3B;
Chris@10 1285 T3v = T3t - T3u;
Chris@10 1286 T3w = T1k - T1p;
Chris@10 1287 T3x = T3v + T3w;
Chris@10 1288 T51 = T3v - T3w;
Chris@10 1289 T3y = T19 - T1e;
Chris@10 1290 T3B = T3z - T3A;
Chris@10 1291 T3C = T3y - T3B;
Chris@10 1292 T52 = T3y + T3B;
Chris@10 1293 }
Chris@10 1294 }
/* Load stage: elements 5, 21, 29, 13; four of the results are scaled by
 * KP707106781. */
Chris@10 1295 {
Chris@10 1296 E T1V, T3R, T20, T3S, T3Q, T3T, T26, T3M, T2b, T3N, T3L, T3O;
Chris@10 1297 {
Chris@10 1298 E T1S, T1U, T1R, T1T;
Chris@10 1299 T1S = ri[WS(rs, 5)];
Chris@10 1300 T1U = ii[WS(rs, 5)];
Chris@10 1301 T1R = W[8];
Chris@10 1302 T1T = W[9];
Chris@10 1303 T1V = FMA(T1R, T1S, T1T * T1U);
Chris@10 1304 T3R = FNMS(T1T, T1S, T1R * T1U);
Chris@10 1305 }
Chris@10 1306 {
Chris@10 1307 E T1X, T1Z, T1W, T1Y;
Chris@10 1308 T1X = ri[WS(rs, 21)];
Chris@10 1309 T1Z = ii[WS(rs, 21)];
Chris@10 1310 T1W = W[40];
Chris@10 1311 T1Y = W[41];
Chris@10 1312 T20 = FMA(T1W, T1X, T1Y * T1Z);
Chris@10 1313 T3S = FNMS(T1Y, T1X, T1W * T1Z);
Chris@10 1314 }
Chris@10 1315 T3Q = T1V - T20;
Chris@10 1316 T3T = T3R - T3S;
Chris@10 1317 {
Chris@10 1318 E T23, T25, T22, T24;
Chris@10 1319 T23 = ri[WS(rs, 29)];
Chris@10 1320 T25 = ii[WS(rs, 29)];
Chris@10 1321 T22 = W[56];
Chris@10 1322 T24 = W[57];
Chris@10 1323 T26 = FMA(T22, T23, T24 * T25);
Chris@10 1324 T3M = FNMS(T24, T23, T22 * T25);
Chris@10 1325 }
Chris@10 1326 {
Chris@10 1327 E T28, T2a, T27, T29;
Chris@10 1328 T28 = ri[WS(rs, 13)];
Chris@10 1329 T2a = ii[WS(rs, 13)];
Chris@10 1330 T27 = W[24];
Chris@10 1331 T29 = W[25];
Chris@10 1332 T2b = FMA(T27, T28, T29 * T2a);
Chris@10 1333 T3N = FNMS(T29, T28, T27 * T2a);
Chris@10 1334 }
Chris@10 1335 T3L = T26 - T2b;
Chris@10 1336 T3O = T3M - T3N;
Chris@10 1337 {
Chris@10 1338 E T21, T2c, T62, T63;
Chris@10 1339 T21 = T1V + T20;
Chris@10 1340 T2c = T26 + T2b;
Chris@10 1341 T2d = T21 + T2c;
Chris@10 1342 T5Z = T2c - T21;
Chris@10 1343 T62 = T3R + T3S;
Chris@10 1344 T63 = T3M + T3N;
Chris@10 1345 T64 = T62 - T63;
Chris@10 1346 T6K = T62 + T63;
Chris@10 1347 }
Chris@10 1348 {
Chris@10 1349 E T3P, T3U, T42, T43;
Chris@10 1350 T3P = T3L - T3O;
Chris@10 1351 T3U = T3Q + T3T;
Chris@10 1352 T3V = KP707106781 * (T3P - T3U);
Chris@10 1353 T57 = KP707106781 * (T3U + T3P);
Chris@10 1354 T42 = T3T - T3Q;
Chris@10 1355 T43 = T3L + T3O;
Chris@10 1356 T44 = KP707106781 * (T42 - T43);
Chris@10 1357 T5a = KP707106781 * (T42 + T43);
Chris@10 1358 }
Chris@10 1359 }
/* Load stage: elements 3, 19, 27, 11; four of the results are scaled by
 * KP707106781. This is the last block that reads ri/ii. */
Chris@10 1360 {
Chris@10 1361 E T2G, T4c, T2L, T4d, T4e, T4f, T2R, T4i, T2W, T4j, T4h, T4k;
Chris@10 1362 {
Chris@10 1363 E T2D, T2F, T2C, T2E;
Chris@10 1364 T2D = ri[WS(rs, 3)];
Chris@10 1365 T2F = ii[WS(rs, 3)];
Chris@10 1366 T2C = W[4];
Chris@10 1367 T2E = W[5];
Chris@10 1368 T2G = FMA(T2C, T2D, T2E * T2F);
Chris@10 1369 T4c = FNMS(T2E, T2D, T2C * T2F);
Chris@10 1370 }
Chris@10 1371 {
Chris@10 1372 E T2I, T2K, T2H, T2J;
Chris@10 1373 T2I = ri[WS(rs, 19)];
Chris@10 1374 T2K = ii[WS(rs, 19)];
Chris@10 1375 T2H = W[36];
Chris@10 1376 T2J = W[37];
Chris@10 1377 T2L = FMA(T2H, T2I, T2J * T2K);
Chris@10 1378 T4d = FNMS(T2J, T2I, T2H * T2K);
Chris@10 1379 }
Chris@10 1380 T4e = T4c - T4d;
Chris@10 1381 T4f = T2G - T2L;
Chris@10 1382 {
Chris@10 1383 E T2O, T2Q, T2N, T2P;
Chris@10 1384 T2O = ri[WS(rs, 27)];
Chris@10 1385 T2Q = ii[WS(rs, 27)];
Chris@10 1386 T2N = W[52];
Chris@10 1387 T2P = W[53];
Chris@10 1388 T2R = FMA(T2N, T2O, T2P * T2Q);
Chris@10 1389 T4i = FNMS(T2P, T2O, T2N * T2Q);
Chris@10 1390 }
Chris@10 1391 {
Chris@10 1392 E T2T, T2V, T2S, T2U;
Chris@10 1393 T2T = ri[WS(rs, 11)];
Chris@10 1394 T2V = ii[WS(rs, 11)];
Chris@10 1395 T2S = W[20];
Chris@10 1396 T2U = W[21];
Chris@10 1397 T2W = FMA(T2S, T2T, T2U * T2V);
Chris@10 1398 T4j = FNMS(T2U, T2T, T2S * T2V);
Chris@10 1399 }
Chris@10 1400 T4h = T2R - T2W;
Chris@10 1401 T4k = T4i - T4j;
Chris@10 1402 {
Chris@10 1403 E T2M, T2X, T68, T69;
Chris@10 1404 T2M = T2G + T2L;
Chris@10 1405 T2X = T2R + T2W;
Chris@10 1406 T2Y = T2M + T2X;
Chris@10 1407 T6f = T2X - T2M;
Chris@10 1408 T68 = T4c + T4d;
Chris@10 1409 T69 = T4i + T4j;
Chris@10 1410 T6a = T68 - T69;
Chris@10 1411 T6P = T68 + T69;
Chris@10 1412 }
Chris@10 1413 {
Chris@10 1414 E T4g, T4l, T4t, T4u;
Chris@10 1415 T4g = T4e - T4f;
Chris@10 1416 T4l = T4h + T4k;
Chris@10 1417 T4m = KP707106781 * (T4g - T4l);
Chris@10 1418 T5h = KP707106781 * (T4g + T4l);
Chris@10 1419 T4t = T4h - T4k;
Chris@10 1420 T4u = T4f + T4e;
Chris@10 1421 T4v = KP707106781 * (T4t - T4u);
Chris@10 1422 T5e = KP707106781 * (T4u + T4t);
Chris@10 1423 }
Chris@10 1424 }
/* Store stage: elements 0, 8, 16, 24 (sums and differences only). */
Chris@10 1425 {
Chris@10 1426 E T1t, T6X, T7a, T7c, T30, T7b, T70, T71;
Chris@10 1427 {
Chris@10 1428 E TH, T1s, T72, T79;
Chris@10 1429 TH = Tj + TG;
Chris@10 1430 T1s = T14 + T1r;
Chris@10 1431 T1t = TH + T1s;
Chris@10 1432 T6X = TH - T1s;
Chris@10 1433 T72 = T6E + T6F;
Chris@10 1434 T79 = T73 + T78;
Chris@10 1435 T7a = T72 + T79;
Chris@10 1436 T7c = T79 - T72;
Chris@10 1437 }
Chris@10 1438 {
Chris@10 1439 E T2e, T2Z, T6Y, T6Z;
Chris@10 1440 T2e = T1Q + T2d;
Chris@10 1441 T2Z = T2B + T2Y;
Chris@10 1442 T30 = T2e + T2Z;
Chris@10 1443 T7b = T2Z - T2e;
Chris@10 1444 T6Y = T6J + T6K;
Chris@10 1445 T6Z = T6O + T6P;
Chris@10 1446 T70 = T6Y - T6Z;
Chris@10 1447 T71 = T6Y + T6Z;
Chris@10 1448 }
Chris@10 1449 ri[WS(rs, 16)] = T1t - T30;
Chris@10 1450 ii[WS(rs, 16)] = T7a - T71;
Chris@10 1451 ri[0] = T1t + T30;
Chris@10 1452 ii[0] = T71 + T7a;
Chris@10 1453 ri[WS(rs, 24)] = T6X - T70;
Chris@10 1454 ii[WS(rs, 24)] = T7c - T7b;
Chris@10 1455 ri[WS(rs, 8)] = T6X + T70;
Chris@10 1456 ii[WS(rs, 8)] = T7b + T7c;
Chris@10 1457 }
/* Store stage: elements 4, 12, 20, 28 (uses KP707106781). */
Chris@10 1458 {
Chris@10 1459 E T6H, T6T, T7g, T7i, T6M, T6U, T6R, T6V;
Chris@10 1460 {
Chris@10 1461 E T6D, T6G, T7e, T7f;
Chris@10 1462 T6D = Tj - TG;
Chris@10 1463 T6G = T6E - T6F;
Chris@10 1464 T6H = T6D + T6G;
Chris@10 1465 T6T = T6D - T6G;
Chris@10 1466 T7e = T1r - T14;
Chris@10 1467 T7f = T78 - T73;
Chris@10 1468 T7g = T7e + T7f;
Chris@10 1469 T7i = T7f - T7e;
Chris@10 1470 }
Chris@10 1471 {
Chris@10 1472 E T6I, T6L, T6N, T6Q;
Chris@10 1473 T6I = T1Q - T2d;
Chris@10 1474 T6L = T6J - T6K;
Chris@10 1475 T6M = T6I + T6L;
Chris@10 1476 T6U = T6L - T6I;
Chris@10 1477 T6N = T2B - T2Y;
Chris@10 1478 T6Q = T6O - T6P;
Chris@10 1479 T6R = T6N - T6Q;
Chris@10 1480 T6V = T6N + T6Q;
Chris@10 1481 }
Chris@10 1482 {
Chris@10 1483 E T6S, T7d, T6W, T7h;
Chris@10 1484 T6S = KP707106781 * (T6M + T6R);
Chris@10 1485 ri[WS(rs, 20)] = T6H - T6S;
Chris@10 1486 ri[WS(rs, 4)] = T6H + T6S;
Chris@10 1487 T7d = KP707106781 * (T6U + T6V);
Chris@10 1488 ii[WS(rs, 4)] = T7d + T7g;
Chris@10 1489 ii[WS(rs, 20)] = T7g - T7d;
Chris@10 1490 T6W = KP707106781 * (T6U - T6V);
Chris@10 1491 ri[WS(rs, 28)] = T6T - T6W;
Chris@10 1492 ri[WS(rs, 12)] = T6T + T6W;
Chris@10 1493 T7h = KP707106781 * (T6R - T6M);
Chris@10 1494 ii[WS(rs, 12)] = T7h + T7i;
Chris@10 1495 ii[WS(rs, 28)] = T7i - T7h;
Chris@10 1496 }
Chris@10 1497 }
/* Store stage: elements 2, 6, 10, 14, 18, 22, 26, 30 (uses KP707106781,
 * KP923879532 and KP382683432). */
Chris@10 1498 {
Chris@10 1499 E T5J, T7n, T7t, T6n, T5U, T7k, T6x, T6B, T6q, T7s, T66, T6k, T6u, T6A, T6h;
Chris@10 1500 E T6l;
Chris@10 1501 {
Chris@10 1502 E T5O, T5T, T60, T65;
Chris@10 1503 T5J = T5F - T5I;
Chris@10 1504 T7n = T7l + T7m;
Chris@10 1505 T7t = T7m - T7l;
Chris@10 1506 T6n = T5F + T5I;
Chris@10 1507 T5O = T5M - T5N;
Chris@10 1508 T5T = T5P + T5S;
Chris@10 1509 T5U = KP707106781 * (T5O - T5T);
Chris@10 1510 T7k = KP707106781 * (T5O + T5T);
Chris@10 1511 {
Chris@10 1512 E T6v, T6w, T6o, T6p;
Chris@10 1513 T6v = T67 + T6a;
Chris@10 1514 T6w = T6e + T6f;
Chris@10 1515 T6x = FNMS(KP382683432, T6w, KP923879532 * T6v);
Chris@10 1516 T6B = FMA(KP923879532, T6w, KP382683432 * T6v);
Chris@10 1517 T6o = T5N + T5M;
Chris@10 1518 T6p = T5P - T5S;
Chris@10 1519 T6q = KP707106781 * (T6o + T6p);
Chris@10 1520 T7s = KP707106781 * (T6p - T6o);
Chris@10 1521 }
Chris@10 1522 T60 = T5Y - T5Z;
Chris@10 1523 T65 = T61 - T64;
Chris@10 1524 T66 = FMA(KP923879532, T60, KP382683432 * T65);
Chris@10 1525 T6k = FNMS(KP923879532, T65, KP382683432 * T60);
Chris@10 1526 {
Chris@10 1527 E T6s, T6t, T6b, T6g;
Chris@10 1528 T6s = T5Y + T5Z;
Chris@10 1529 T6t = T61 + T64;
Chris@10 1530 T6u = FMA(KP382683432, T6s, KP923879532 * T6t);
Chris@10 1531 T6A = FNMS(KP382683432, T6t, KP923879532 * T6s);
Chris@10 1532 T6b = T67 - T6a;
Chris@10 1533 T6g = T6e - T6f;
Chris@10 1534 T6h = FNMS(KP923879532, T6g, KP382683432 * T6b);
Chris@10 1535 T6l = FMA(KP382683432, T6g, KP923879532 * T6b);
Chris@10 1536 }
Chris@10 1537 }
Chris@10 1538 {
Chris@10 1539 E T5V, T6i, T7r, T7u;
Chris@10 1540 T5V = T5J + T5U;
Chris@10 1541 T6i = T66 + T6h;
Chris@10 1542 ri[WS(rs, 22)] = T5V - T6i;
Chris@10 1543 ri[WS(rs, 6)] = T5V + T6i;
Chris@10 1544 T7r = T6k + T6l;
Chris@10 1545 T7u = T7s + T7t;
Chris@10 1546 ii[WS(rs, 6)] = T7r + T7u;
Chris@10 1547 ii[WS(rs, 22)] = T7u - T7r;
Chris@10 1548 }
Chris@10 1549 {
Chris@10 1550 E T6j, T6m, T7v, T7w;
Chris@10 1551 T6j = T5J - T5U;
Chris@10 1552 T6m = T6k - T6l;
Chris@10 1553 ri[WS(rs, 30)] = T6j - T6m;
Chris@10 1554 ri[WS(rs, 14)] = T6j + T6m;
Chris@10 1555 T7v = T6h - T66;
Chris@10 1556 T7w = T7t - T7s;
Chris@10 1557 ii[WS(rs, 14)] = T7v + T7w;
Chris@10 1558 ii[WS(rs, 30)] = T7w - T7v;
Chris@10 1559 }
Chris@10 1560 {
Chris@10 1561 E T6r, T6y, T7j, T7o;
Chris@10 1562 T6r = T6n + T6q;
Chris@10 1563 T6y = T6u + T6x;
Chris@10 1564 ri[WS(rs, 18)] = T6r - T6y;
Chris@10 1565 ri[WS(rs, 2)] = T6r + T6y;
Chris@10 1566 T7j = T6A + T6B;
Chris@10 1567 T7o = T7k + T7n;
Chris@10 1568 ii[WS(rs, 2)] = T7j + T7o;
Chris@10 1569 ii[WS(rs, 18)] = T7o - T7j;
Chris@10 1570 }
Chris@10 1571 {
Chris@10 1572 E T6z, T6C, T7p, T7q;
Chris@10 1573 T6z = T6n - T6q;
Chris@10 1574 T6C = T6A - T6B;
Chris@10 1575 ri[WS(rs, 26)] = T6z - T6C;
Chris@10 1576 ri[WS(rs, 10)] = T6z + T6C;
Chris@10 1577 T7p = T6x - T6u;
Chris@10 1578 T7q = T7n - T7k;
Chris@10 1579 ii[WS(rs, 10)] = T7p + T7q;
Chris@10 1580 ii[WS(rs, 26)] = T7q - T7p;
Chris@10 1581 }
Chris@10 1582 }
/* Store stage: elements 3, 7, 11, 15, 19, 23, 27, 31. */
Chris@10 1583 {
Chris@10 1584 E T3h, T4D, T7R, T7X, T3E, T7O, T4N, T4R, T46, T4A, T4G, T7W, T4K, T4Q, T4x;
Chris@10 1585 E T4B, T3g, T7P;
Chris@10 1586 T3g = KP707106781 * (T3a - T3f);
Chris@10 1587 T3h = T35 - T3g;
Chris@10 1588 T4D = T35 + T3g;
Chris@10 1589 T7P = KP707106781 * (T4V - T4U);
Chris@10 1590 T7R = T7P + T7Q;
Chris@10 1591 T7X = T7Q - T7P;
Chris@10 1592 {
Chris@10 1593 E T3s, T3D, T4L, T4M;
Chris@10 1594 T3s = FNMS(KP923879532, T3r, KP382683432 * T3m);
Chris@10 1595 T3D = FMA(KP382683432, T3x, KP923879532 * T3C);
Chris@10 1596 T3E = T3s - T3D;
Chris@10 1597 T7O = T3s + T3D;
Chris@10 1598 T4L = T4b + T4m;
Chris@10 1599 T4M = T4s + T4v;
Chris@10 1600 T4N = FNMS(KP555570233, T4M, KP831469612 * T4L);
Chris@10 1601 T4R = FMA(KP831469612, T4M, KP555570233 * T4L);
Chris@10 1602 }
Chris@10 1603 {
Chris@10 1604 E T3W, T45, T4E, T4F;
Chris@10 1605 T3W = T3K - T3V;
Chris@10 1606 T45 = T41 - T44;
Chris@10 1607 T46 = FMA(KP980785280, T3W, KP195090322 * T45);
Chris@10 1608 T4A = FNMS(KP980785280, T45, KP195090322 * T3W);
Chris@10 1609 T4E = FMA(KP923879532, T3m, KP382683432 * T3r);
Chris@10 1610 T4F = FNMS(KP923879532, T3x, KP382683432 * T3C);
Chris@10 1611 T4G = T4E + T4F;
Chris@10 1612 T7W = T4F - T4E;
Chris@10 1613 }
Chris@10 1614 {
Chris@10 1615 E T4I, T4J, T4n, T4w;
Chris@10 1616 T4I = T3K + T3V;
Chris@10 1617 T4J = T41 + T44;
Chris@10 1618 T4K = FMA(KP555570233, T4I, KP831469612 * T4J);
Chris@10 1619 T4Q = FNMS(KP555570233, T4J, KP831469612 * T4I);
Chris@10 1620 T4n = T4b - T4m;
Chris@10 1621 T4w = T4s - T4v;
Chris@10 1622 T4x = FNMS(KP980785280, T4w, KP195090322 * T4n);
Chris@10 1623 T4B = FMA(KP195090322, T4w, KP980785280 * T4n);
Chris@10 1624 }
Chris@10 1625 {
Chris@10 1626 E T3F, T4y, T7V, T7Y;
Chris@10 1627 T3F = T3h + T3E;
Chris@10 1628 T4y = T46 + T4x;
Chris@10 1629 ri[WS(rs, 23)] = T3F - T4y;
Chris@10 1630 ri[WS(rs, 7)] = T3F + T4y;
Chris@10 1631 T7V = T4A + T4B;
Chris@10 1632 T7Y = T7W + T7X;
Chris@10 1633 ii[WS(rs, 7)] = T7V + T7Y;
Chris@10 1634 ii[WS(rs, 23)] = T7Y - T7V;
Chris@10 1635 }
Chris@10 1636 {
Chris@10 1637 E T4z, T4C, T7Z, T80;
Chris@10 1638 T4z = T3h - T3E;
Chris@10 1639 T4C = T4A - T4B;
Chris@10 1640 ri[WS(rs, 31)] = T4z - T4C;
Chris@10 1641 ri[WS(rs, 15)] = T4z + T4C;
Chris@10 1642 T7Z = T4x - T46;
Chris@10 1643 T80 = T7X - T7W;
Chris@10 1644 ii[WS(rs, 15)] = T7Z + T80;
Chris@10 1645 ii[WS(rs, 31)] = T80 - T7Z;
Chris@10 1646 }
Chris@10 1647 {
Chris@10 1648 E T4H, T4O, T7N, T7S;
Chris@10 1649 T4H = T4D + T4G;
Chris@10 1650 T4O = T4K + T4N;
Chris@10 1651 ri[WS(rs, 19)] = T4H - T4O;
Chris@10 1652 ri[WS(rs, 3)] = T4H + T4O;
Chris@10 1653 T7N = T4Q + T4R;
Chris@10 1654 T7S = T7O + T7R;
Chris@10 1655 ii[WS(rs, 3)] = T7N + T7S;
Chris@10 1656 ii[WS(rs, 19)] = T7S - T7N;
Chris@10 1657 }
Chris@10 1658 {
Chris@10 1659 E T4P, T4S, T7T, T7U;
Chris@10 1660 T4P = T4D - T4G;
Chris@10 1661 T4S = T4Q - T4R;
Chris@10 1662 ri[WS(rs, 27)] = T4P - T4S;
Chris@10 1663 ri[WS(rs, 11)] = T4P + T4S;
Chris@10 1664 T7T = T4N - T4K;
Chris@10 1665 T7U = T7R - T7O;
Chris@10 1666 ii[WS(rs, 11)] = T7T + T7U;
Chris@10 1667 ii[WS(rs, 27)] = T7U - T7T;
Chris@10 1668 }
Chris@10 1669 }
/* Store stage: elements 1, 5, 9, 13, 17, 21, 25, 29. */
Chris@10 1670 {
Chris@10 1671 E T4X, T5p, T7D, T7J, T54, T7y, T5z, T5D, T5c, T5m, T5s, T7I, T5w, T5C, T5j;
Chris@10 1672 E T5n, T4W, T7z;
Chris@10 1673 T4W = KP707106781 * (T4U + T4V);
Chris@10 1674 T4X = T4T - T4W;
Chris@10 1675 T5p = T4T + T4W;
Chris@10 1676 T7z = KP707106781 * (T3a + T3f);
Chris@10 1677 T7D = T7z + T7C;
Chris@10 1678 T7J = T7C - T7z;
Chris@10 1679 {
Chris@10 1680 E T50, T53, T5x, T5y;
Chris@10 1681 T50 = FNMS(KP382683432, T4Z, KP923879532 * T4Y);
Chris@10 1682 T53 = FMA(KP923879532, T51, KP382683432 * T52);
Chris@10 1683 T54 = T50 - T53;
Chris@10 1684 T7y = T50 + T53;
Chris@10 1685 T5x = T5d + T5e;
Chris@10 1686 T5y = T5g + T5h;
Chris@10 1687 T5z = FNMS(KP195090322, T5y, KP980785280 * T5x);
Chris@10 1688 T5D = FMA(KP195090322, T5x, KP980785280 * T5y);
Chris@10 1689 }
Chris@10 1690 {
Chris@10 1691 E T58, T5b, T5q, T5r;
Chris@10 1692 T58 = T56 - T57;
Chris@10 1693 T5b = T59 - T5a;
Chris@10 1694 T5c = FMA(KP555570233, T58, KP831469612 * T5b);
Chris@10 1695 T5m = FNMS(KP831469612, T58, KP555570233 * T5b);
Chris@10 1696 T5q = FMA(KP382683432, T4Y, KP923879532 * T4Z);
Chris@10 1697 T5r = FNMS(KP382683432, T51, KP923879532 * T52);
Chris@10 1698 T5s = T5q + T5r;
Chris@10 1699 T7I = T5r - T5q;
Chris@10 1700 }
Chris@10 1701 {
Chris@10 1702 E T5u, T5v, T5f, T5i;
Chris@10 1703 T5u = T56 + T57;
Chris@10 1704 T5v = T59 + T5a;
Chris@10 1705 T5w = FMA(KP980785280, T5u, KP195090322 * T5v);
Chris@10 1706 T5C = FNMS(KP195090322, T5u, KP980785280 * T5v);
Chris@10 1707 T5f = T5d - T5e;
Chris@10 1708 T5i = T5g - T5h;
Chris@10 1709 T5j = FNMS(KP831469612, T5i, KP555570233 * T5f);
Chris@10 1710 T5n = FMA(KP831469612, T5f, KP555570233 * T5i);
Chris@10 1711 }
Chris@10 1712 {
Chris@10 1713 E T55, T5k, T7H, T7K;
Chris@10 1714 T55 = T4X + T54;
Chris@10 1715 T5k = T5c + T5j;
Chris@10 1716 ri[WS(rs, 21)] = T55 - T5k;
Chris@10 1717 ri[WS(rs, 5)] = T55 + T5k;
Chris@10 1718 T7H = T5m + T5n;
Chris@10 1719 T7K = T7I + T7J;
Chris@10 1720 ii[WS(rs, 5)] = T7H + T7K;
Chris@10 1721 ii[WS(rs, 21)] = T7K - T7H;
Chris@10 1722 }
Chris@10 1723 {
Chris@10 1724 E T5l, T5o, T7L, T7M;
Chris@10 1725 T5l = T4X - T54;
Chris@10 1726 T5o = T5m - T5n;
Chris@10 1727 ri[WS(rs, 29)] = T5l - T5o;
Chris@10 1728 ri[WS(rs, 13)] = T5l + T5o;
Chris@10 1729 T7L = T5j - T5c;
Chris@10 1730 T7M = T7J - T7I;
Chris@10 1731 ii[WS(rs, 13)] = T7L + T7M;
Chris@10 1732 ii[WS(rs, 29)] = T7M - T7L;
Chris@10 1733 }
Chris@10 1734 {
Chris@10 1735 E T5t, T5A, T7x, T7E;
Chris@10 1736 T5t = T5p + T5s;
Chris@10 1737 T5A = T5w + T5z;
Chris@10 1738 ri[WS(rs, 17)] = T5t - T5A;
Chris@10 1739 ri[WS(rs, 1)] = T5t + T5A;
Chris@10 1740 T7x = T5C + T5D;
Chris@10 1741 T7E = T7y + T7D;
Chris@10 1742 ii[WS(rs, 1)] = T7x + T7E;
Chris@10 1743 ii[WS(rs, 17)] = T7E - T7x;
Chris@10 1744 }
Chris@10 1745 {
Chris@10 1746 E T5B, T5E, T7F, T7G;
Chris@10 1747 T5B = T5p - T5s;
Chris@10 1748 T5E = T5C - T5D;
Chris@10 1749 ri[WS(rs, 25)] = T5B - T5E;
Chris@10 1750 ri[WS(rs, 9)] = T5B + T5E;
Chris@10 1751 T7F = T5z - T5w;
Chris@10 1752 T7G = T7D - T7y;
Chris@10 1753 ii[WS(rs, 9)] = T7F + T7G;
Chris@10 1754 ii[WS(rs, 25)] = T7G - T7F;
Chris@10 1755 }
Chris@10 1756 }
Chris@10 1757 }
Chris@10 1758 }
Chris@10 1759 }
Chris@10 1760
/* Twiddle instruction table for the non-FMA variant; it is referenced by
 * the ct_desc initializer that follows. The operand meanings of TW_FULL
 * and TW_NEXT are defined outside this file -- presumably "full twiddle
 * set for size 32" followed by an end-of-list marker; TODO confirm. */
Chris@10 1761 static const tw_instr twinstr[] = {
Chris@10 1762 {TW_FULL, 0, 32},
Chris@10 1763 {TW_NEXT, 1, 0}
Chris@10 1764 };
Chris@10 1765
/* Codelet descriptor passed to the registration call below. The
 * initializer carries 32 (presumably the radix), the name string, the
 * twiddle table, &GENUS, and the counts {340, 114, 94, 0}; the first three
 * match the additions / multiplications / fused multiply-adds quoted in
 * the header comment of the non-FMA variant. The ct_desc field names and
 * the meaning of the trailing zeros are declared elsewhere -- TODO confirm. */
Chris@10 1766 static const ct_desc desc = { 32, "t1_32", twinstr, &GENUS, {340, 114, 94, 0}, 0, 0, 0 };
Chris@10 1767
/* Registration entry point (non-FMA variant): hands the t1_32 kernel and
 * its descriptor to kdft_dit_register for planner p. X(...) is a project
 * macro applied to both identifiers -- presumably a name prefix; confirm
 * in the project headers. */
Chris@10 1768 void X(codelet_t1_32) (planner *p) {
Chris@10 1769 X(kdft_dit_register) (p, t1_32, &desc);
Chris@10 1770 }
Chris@10 1771 #endif /* HAVE_FMA */