annotate src/fftw-3.3.8/rdft/scalar/r2cf/hf_20.c @ 83:ae30d91d2ffe

Replace these with versions built using an older toolset (so as to avoid ABI incompatibilities when linking on Ubuntu 14.04 for packaging purposes)
author Chris Cannam
date Fri, 07 Feb 2020 11:51:13 +0000
parents d0c2a83c1364
children
rev   line source
Chris@82 1 /*
Chris@82 2 * Copyright (c) 2003, 2007-14 Matteo Frigo
Chris@82 3 * Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
Chris@82 4 *
Chris@82 5 * This program is free software; you can redistribute it and/or modify
Chris@82 6 * it under the terms of the GNU General Public License as published by
Chris@82 7 * the Free Software Foundation; either version 2 of the License, or
Chris@82 8 * (at your option) any later version.
Chris@82 9 *
Chris@82 10 * This program is distributed in the hope that it will be useful,
Chris@82 11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
Chris@82 12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
Chris@82 13 * GNU General Public License for more details.
Chris@82 14 *
Chris@82 15 * You should have received a copy of the GNU General Public License
Chris@82 16 * along with this program; if not, write to the Free Software
Chris@82 17 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
Chris@82 18 *
Chris@82 19 */
Chris@82 20
Chris@82 21 /* This file was automatically generated --- DO NOT EDIT */
Chris@82 22 /* Generated on Thu May 24 08:06:33 EDT 2018 */
Chris@82 23
Chris@82 24 #include "rdft/codelet-rdft.h"
Chris@82 25
Chris@82 26 #if defined(ARCH_PREFERS_FMA) || defined(ISA_EXTENSION_PREFERS_FMA)
Chris@82 27
Chris@82 28 /* Generated by: ../../../genfft/gen_hc2hc.native -fma -compact -variables 4 -pipeline-latency 4 -n 20 -dit -name hf_20 -include rdft/scalar/hf.h */
Chris@82 29
Chris@82 30 /*
Chris@82 31 * This function contains 246 FP additions, 148 FP multiplications,
Chris@82 32 * (or, 136 additions, 38 multiplications, 110 fused multiply/add),
Chris@82 33 * 61 stack variables, 4 constants, and 80 memory accesses
Chris@82 34 */
Chris@82 35 #include "rdft/scalar/hf.h"
Chris@82 36
/*
 * hf_20 (FMA-preferring variant): size-20 codelet generated by gen_hc2hc
 * with "-n 20 -dit" (see the "Generated by" line earlier in this file).
 *
 * For every m in [mb, me) one pass reads the 20 values cr[WS(rs, k)] and
 * the 20 values ci[WS(rs, k)], k = 0..19, combines each pair k = 1..19 with
 * the twiddle pair W[2k-2], W[2k-1], and stores 20 results back into cr[]
 * and 20 into ci[] at the same offsets. All loads of a pass come before
 * its stores.
 *
 * Per pass, cr advances by ms, ci moves back by ms, and W advances by 38
 * entries (19 twiddle pairs). W is first offset by (mb - 1) * 38.
 *
 * NOTE(review): the per-block comments assume FMA(a, b, c) = a*b + c,
 * FNMS(a, b, c) = c - a*b and FMS(a, b, c) = a*b - c; these macros are
 * defined outside this file -- confirm against the FFTW kernel headers.
 */
Chris@82 37 static void hf_20(R *cr, R *ci, const R *W, stride rs, INT mb, INT me, INT ms)
Chris@82 38 {
/* Numerically: sin(2*pi/5), sqrt(5)/4, 1/4 and (sqrt(5)-1)/2. */
Chris@82 39 DK(KP951056516, +0.951056516295153572116439333379382143405698634);
Chris@82 40 DK(KP559016994, +0.559016994374947424102293417182819058860154590);
Chris@82 41 DK(KP250000000, +0.250000000000000000000000000000000000000000000);
Chris@82 42 DK(KP618033988, +0.618033988749894848204586834365638117720309180);
Chris@82 43 {
Chris@82 44 INT m;
Chris@82 45 for (m = mb, W = W + ((mb - 1) * 38); m < me; m = m + 1, cr = cr + ms, ci = ci - ms, W = W + 38, MAKE_VOLATILE_STRIDE(40, rs)) {
/* Values that survive from the load phase into the store phase. */
Chris@82 46 E T8, T4N, T2i, T4q, Tl, T4O, T2n, T4r, TN, T2b, T43, T4b, T2v, T3v, T3a;
Chris@82 47 E T3F, T27, T2f, T3T, T4f, T2R, T3z, T3i, T3J, T1G, T2e, T3W, T4e, T2K, T3y;
Chris@82 48 E T3p, T3I, T1e, T2c, T40, T4c, T2C, T3w, T33, T3G;
/* Input 0 (used as is) and input 10 combined with W[18], W[19]. */
Chris@82 49 {
Chris@82 50 E T1, T4p, T3, T6, T4, T4n, T2, T7, T4o, T5;
Chris@82 51 T1 = cr[0];
Chris@82 52 T4p = ci[0];
Chris@82 53 T3 = cr[WS(rs, 10)];
Chris@82 54 T6 = ci[WS(rs, 10)];
Chris@82 55 T2 = W[18];
Chris@82 56 T4 = T2 * T3;
Chris@82 57 T4n = T2 * T6;
Chris@82 58 T5 = W[19];
Chris@82 59 T7 = FMA(T5, T6, T4);
Chris@82 60 T4o = FNMS(T5, T3, T4n);
Chris@82 61 T8 = T1 + T7;
Chris@82 62 T4N = T4p - T4o;
Chris@82 63 T2i = T1 - T7;
Chris@82 64 T4q = T4o + T4p;
Chris@82 65 }
/* Inputs 5 and 15 combined with W[8], W[9] and W[28], W[29]. */
Chris@82 66 {
Chris@82 67 E Ta, Td, Tb, T2j, Tg, Tj, Th, T2l, T9, Tf;
Chris@82 68 Ta = cr[WS(rs, 5)];
Chris@82 69 Td = ci[WS(rs, 5)];
Chris@82 70 T9 = W[8];
Chris@82 71 Tb = T9 * Ta;
Chris@82 72 T2j = T9 * Td;
Chris@82 73 Tg = cr[WS(rs, 15)];
Chris@82 74 Tj = ci[WS(rs, 15)];
Chris@82 75 Tf = W[28];
Chris@82 76 Th = Tf * Tg;
Chris@82 77 T2l = Tf * Tj;
Chris@82 78 {
Chris@82 79 E Te, T2k, Tk, T2m, Tc, Ti;
Chris@82 80 Tc = W[9];
Chris@82 81 Te = FMA(Tc, Td, Tb);
Chris@82 82 T2k = FNMS(Tc, Ta, T2j);
Chris@82 83 Ti = W[29];
Chris@82 84 Tk = FMA(Ti, Tj, Th);
Chris@82 85 T2m = FNMS(Ti, Tg, T2l);
Chris@82 86 Tl = Te + Tk;
Chris@82 87 T4O = Te - Tk;
Chris@82 88 T2n = T2k - T2m;
Chris@82 89 T4r = T2k + T2m;
Chris@82 90 }
Chris@82 91 }
/* Inputs 4, 19, 14 and 9 with their twiddle pairs, then sums/differences. */
Chris@82 92 {
Chris@82 93 E Ts, T36, TL, T2t, Ty, T38, TF, T2r;
Chris@82 94 {
Chris@82 95 E To, Tr, Tp, T35, Tn, Tq;
Chris@82 96 To = cr[WS(rs, 4)];
Chris@82 97 Tr = ci[WS(rs, 4)];
Chris@82 98 Tn = W[6];
Chris@82 99 Tp = Tn * To;
Chris@82 100 T35 = Tn * Tr;
Chris@82 101 Tq = W[7];
Chris@82 102 Ts = FMA(Tq, Tr, Tp);
Chris@82 103 T36 = FNMS(Tq, To, T35);
Chris@82 104 }
Chris@82 105 {
Chris@82 106 E TH, TK, TI, T2s, TG, TJ;
Chris@82 107 TH = cr[WS(rs, 19)];
Chris@82 108 TK = ci[WS(rs, 19)];
Chris@82 109 TG = W[36];
Chris@82 110 TI = TG * TH;
Chris@82 111 T2s = TG * TK;
Chris@82 112 TJ = W[37];
Chris@82 113 TL = FMA(TJ, TK, TI);
Chris@82 114 T2t = FNMS(TJ, TH, T2s);
Chris@82 115 }
Chris@82 116 {
Chris@82 117 E Tu, Tx, Tv, T37, Tt, Tw;
Chris@82 118 Tu = cr[WS(rs, 14)];
Chris@82 119 Tx = ci[WS(rs, 14)];
Chris@82 120 Tt = W[26];
Chris@82 121 Tv = Tt * Tu;
Chris@82 122 T37 = Tt * Tx;
Chris@82 123 Tw = W[27];
Chris@82 124 Ty = FMA(Tw, Tx, Tv);
Chris@82 125 T38 = FNMS(Tw, Tu, T37);
Chris@82 126 }
Chris@82 127 {
Chris@82 128 E TB, TE, TC, T2q, TA, TD;
Chris@82 129 TB = cr[WS(rs, 9)];
Chris@82 130 TE = ci[WS(rs, 9)];
Chris@82 131 TA = W[16];
Chris@82 132 TC = TA * TB;
Chris@82 133 T2q = TA * TE;
Chris@82 134 TD = W[17];
Chris@82 135 TF = FMA(TD, TE, TC);
Chris@82 136 T2r = FNMS(TD, TB, T2q);
Chris@82 137 }
Chris@82 138 {
Chris@82 139 E Tz, TM, T41, T42;
Chris@82 140 Tz = Ts + Ty;
Chris@82 141 TM = TF + TL;
Chris@82 142 TN = Tz - TM;
Chris@82 143 T2b = Tz + TM;
Chris@82 144 T41 = T2r + T2t;
Chris@82 145 T42 = T36 + T38;
Chris@82 146 T43 = T41 - T42;
Chris@82 147 T4b = T42 + T41;
Chris@82 148 }
Chris@82 149 {
Chris@82 150 E T2p, T2u, T34, T39;
Chris@82 151 T2p = Ts - Ty;
Chris@82 152 T2u = T2r - T2t;
Chris@82 153 T2v = T2p - T2u;
Chris@82 154 T3v = T2p + T2u;
Chris@82 155 T34 = TL - TF;
Chris@82 156 T39 = T36 - T38;
Chris@82 157 T3a = T34 - T39;
Chris@82 158 T3F = T39 + T34;
Chris@82 159 }
Chris@82 160 }
/* Inputs 12, 7, 2 and 17 with their twiddle pairs, then sums/differences. */
Chris@82 161 {
Chris@82 162 E T1M, T3e, T25, T2P, T1S, T3g, T1Z, T2N;
Chris@82 163 {
Chris@82 164 E T1I, T1L, T1J, T3d, T1H, T1K;
Chris@82 165 T1I = cr[WS(rs, 12)];
Chris@82 166 T1L = ci[WS(rs, 12)];
Chris@82 167 T1H = W[22];
Chris@82 168 T1J = T1H * T1I;
Chris@82 169 T3d = T1H * T1L;
Chris@82 170 T1K = W[23];
Chris@82 171 T1M = FMA(T1K, T1L, T1J);
Chris@82 172 T3e = FNMS(T1K, T1I, T3d);
Chris@82 173 }
Chris@82 174 {
Chris@82 175 E T21, T24, T22, T2O, T20, T23;
Chris@82 176 T21 = cr[WS(rs, 7)];
Chris@82 177 T24 = ci[WS(rs, 7)];
Chris@82 178 T20 = W[12];
Chris@82 179 T22 = T20 * T21;
Chris@82 180 T2O = T20 * T24;
Chris@82 181 T23 = W[13];
Chris@82 182 T25 = FMA(T23, T24, T22);
Chris@82 183 T2P = FNMS(T23, T21, T2O);
Chris@82 184 }
Chris@82 185 {
Chris@82 186 E T1O, T1R, T1P, T3f, T1N, T1Q;
Chris@82 187 T1O = cr[WS(rs, 2)];
Chris@82 188 T1R = ci[WS(rs, 2)];
Chris@82 189 T1N = W[2];
Chris@82 190 T1P = T1N * T1O;
Chris@82 191 T3f = T1N * T1R;
Chris@82 192 T1Q = W[3];
Chris@82 193 T1S = FMA(T1Q, T1R, T1P);
Chris@82 194 T3g = FNMS(T1Q, T1O, T3f);
Chris@82 195 }
Chris@82 196 {
Chris@82 197 E T1V, T1Y, T1W, T2M, T1U, T1X;
Chris@82 198 T1V = cr[WS(rs, 17)];
Chris@82 199 T1Y = ci[WS(rs, 17)];
Chris@82 200 T1U = W[32];
Chris@82 201 T1W = T1U * T1V;
Chris@82 202 T2M = T1U * T1Y;
Chris@82 203 T1X = W[33];
Chris@82 204 T1Z = FMA(T1X, T1Y, T1W);
Chris@82 205 T2N = FNMS(T1X, T1V, T2M);
Chris@82 206 }
Chris@82 207 {
Chris@82 208 E T1T, T26, T3R, T3S;
Chris@82 209 T1T = T1M + T1S;
Chris@82 210 T26 = T1Z + T25;
Chris@82 211 T27 = T1T - T26;
Chris@82 212 T2f = T1T + T26;
Chris@82 213 T3R = T2N + T2P;
Chris@82 214 T3S = T3e + T3g;
Chris@82 215 T3T = T3R - T3S;
Chris@82 216 T4f = T3S + T3R;
Chris@82 217 }
Chris@82 218 {
Chris@82 219 E T2L, T2Q, T3c, T3h;
Chris@82 220 T2L = T1M - T1S;
Chris@82 221 T2Q = T2N - T2P;
Chris@82 222 T2R = T2L - T2Q;
Chris@82 223 T3z = T2L + T2Q;
Chris@82 224 T3c = T25 - T1Z;
Chris@82 225 T3h = T3e - T3g;
Chris@82 226 T3i = T3c - T3h;
Chris@82 227 T3J = T3h + T3c;
Chris@82 228 }
Chris@82 229 }
/* Inputs 8, 3, 18 and 13 with their twiddle pairs, then sums/differences. */
Chris@82 230 {
Chris@82 231 E T1l, T3l, T1E, T2I, T1r, T3n, T1y, T2G;
Chris@82 232 {
Chris@82 233 E T1h, T1k, T1i, T3k, T1g, T1j;
Chris@82 234 T1h = cr[WS(rs, 8)];
Chris@82 235 T1k = ci[WS(rs, 8)];
Chris@82 236 T1g = W[14];
Chris@82 237 T1i = T1g * T1h;
Chris@82 238 T3k = T1g * T1k;
Chris@82 239 T1j = W[15];
Chris@82 240 T1l = FMA(T1j, T1k, T1i);
Chris@82 241 T3l = FNMS(T1j, T1h, T3k);
Chris@82 242 }
Chris@82 243 {
Chris@82 244 E T1A, T1D, T1B, T2H, T1z, T1C;
Chris@82 245 T1A = cr[WS(rs, 3)];
Chris@82 246 T1D = ci[WS(rs, 3)];
Chris@82 247 T1z = W[4];
Chris@82 248 T1B = T1z * T1A;
Chris@82 249 T2H = T1z * T1D;
Chris@82 250 T1C = W[5];
Chris@82 251 T1E = FMA(T1C, T1D, T1B);
Chris@82 252 T2I = FNMS(T1C, T1A, T2H);
Chris@82 253 }
Chris@82 254 {
Chris@82 255 E T1n, T1q, T1o, T3m, T1m, T1p;
Chris@82 256 T1n = cr[WS(rs, 18)];
Chris@82 257 T1q = ci[WS(rs, 18)];
Chris@82 258 T1m = W[34];
Chris@82 259 T1o = T1m * T1n;
Chris@82 260 T3m = T1m * T1q;
Chris@82 261 T1p = W[35];
Chris@82 262 T1r = FMA(T1p, T1q, T1o);
Chris@82 263 T3n = FNMS(T1p, T1n, T3m);
Chris@82 264 }
Chris@82 265 {
Chris@82 266 E T1u, T1x, T1v, T2F, T1t, T1w;
Chris@82 267 T1u = cr[WS(rs, 13)];
Chris@82 268 T1x = ci[WS(rs, 13)];
Chris@82 269 T1t = W[24];
Chris@82 270 T1v = T1t * T1u;
Chris@82 271 T2F = T1t * T1x;
Chris@82 272 T1w = W[25];
Chris@82 273 T1y = FMA(T1w, T1x, T1v);
Chris@82 274 T2G = FNMS(T1w, T1u, T2F);
Chris@82 275 }
Chris@82 276 {
Chris@82 277 E T1s, T1F, T3U, T3V;
Chris@82 278 T1s = T1l + T1r;
Chris@82 279 T1F = T1y + T1E;
Chris@82 280 T1G = T1s - T1F;
Chris@82 281 T2e = T1s + T1F;
Chris@82 282 T3U = T2G + T2I;
Chris@82 283 T3V = T3l + T3n;
Chris@82 284 T3W = T3U - T3V;
Chris@82 285 T4e = T3V + T3U;
Chris@82 286 }
Chris@82 287 {
Chris@82 288 E T2E, T2J, T3j, T3o;
Chris@82 289 T2E = T1l - T1r;
Chris@82 290 T2J = T2G - T2I;
Chris@82 291 T2K = T2E - T2J;
Chris@82 292 T3y = T2E + T2J;
Chris@82 293 T3j = T1E - T1y;
Chris@82 294 T3o = T3l - T3n;
Chris@82 295 T3p = T3j - T3o;
Chris@82 296 T3I = T3o + T3j;
Chris@82 297 }
Chris@82 298 }
/* Inputs 16, 11, 6 and 1 with their twiddle pairs, then sums/differences. */
Chris@82 299 {
Chris@82 300 E TT, T2Z, T1c, T2A, TZ, T31, T16, T2y;
Chris@82 301 {
Chris@82 302 E TP, TS, TQ, T2Y, TO, TR;
Chris@82 303 TP = cr[WS(rs, 16)];
Chris@82 304 TS = ci[WS(rs, 16)];
Chris@82 305 TO = W[30];
Chris@82 306 TQ = TO * TP;
Chris@82 307 T2Y = TO * TS;
Chris@82 308 TR = W[31];
Chris@82 309 TT = FMA(TR, TS, TQ);
Chris@82 310 T2Z = FNMS(TR, TP, T2Y);
Chris@82 311 }
Chris@82 312 {
Chris@82 313 E T18, T1b, T19, T2z, T17, T1a;
Chris@82 314 T18 = cr[WS(rs, 11)];
Chris@82 315 T1b = ci[WS(rs, 11)];
Chris@82 316 T17 = W[20];
Chris@82 317 T19 = T17 * T18;
Chris@82 318 T2z = T17 * T1b;
Chris@82 319 T1a = W[21];
Chris@82 320 T1c = FMA(T1a, T1b, T19);
Chris@82 321 T2A = FNMS(T1a, T18, T2z);
Chris@82 322 }
Chris@82 323 {
Chris@82 324 E TV, TY, TW, T30, TU, TX;
Chris@82 325 TV = cr[WS(rs, 6)];
Chris@82 326 TY = ci[WS(rs, 6)];
Chris@82 327 TU = W[10];
Chris@82 328 TW = TU * TV;
Chris@82 329 T30 = TU * TY;
Chris@82 330 TX = W[11];
Chris@82 331 TZ = FMA(TX, TY, TW);
Chris@82 332 T31 = FNMS(TX, TV, T30);
Chris@82 333 }
Chris@82 334 {
Chris@82 335 E T12, T15, T13, T2x, T11, T14;
Chris@82 336 T12 = cr[WS(rs, 1)];
Chris@82 337 T15 = ci[WS(rs, 1)];
Chris@82 338 T11 = W[0];
Chris@82 339 T13 = T11 * T12;
Chris@82 340 T2x = T11 * T15;
Chris@82 341 T14 = W[1];
Chris@82 342 T16 = FMA(T14, T15, T13);
Chris@82 343 T2y = FNMS(T14, T12, T2x);
Chris@82 344 }
Chris@82 345 {
Chris@82 346 E T10, T1d, T3Y, T3Z;
Chris@82 347 T10 = TT + TZ;
Chris@82 348 T1d = T16 + T1c;
Chris@82 349 T1e = T10 - T1d;
Chris@82 350 T2c = T10 + T1d;
Chris@82 351 T3Y = T2y + T2A;
Chris@82 352 T3Z = T2Z + T31;
Chris@82 353 T40 = T3Y - T3Z;
Chris@82 354 T4c = T3Z + T3Y;
Chris@82 355 }
Chris@82 356 {
Chris@82 357 E T2w, T2B, T2X, T32;
Chris@82 358 T2w = TT - TZ;
Chris@82 359 T2B = T2y - T2A;
Chris@82 360 T2C = T2w - T2B;
Chris@82 361 T3w = T2w + T2B;
Chris@82 362 T2X = T1c - T16;
Chris@82 363 T32 = T2Z - T31;
Chris@82 364 T33 = T2X - T32;
Chris@82 365 T3G = T32 + T2X;
Chris@82 366 }
Chris@82 367 }
/* Stores ci[9], ci[5], cr[6], cr[2], ci[1]. */
Chris@82 368 {
Chris@82 369 E T45, T47, Tm, T29, T3O, T3P, T46, T3Q;
Chris@82 370 {
Chris@82 371 E T3X, T44, T1f, T28;
Chris@82 372 T3X = T3T - T3W;
Chris@82 373 T44 = T40 - T43;
Chris@82 374 T45 = FNMS(KP618033988, T44, T3X);
Chris@82 375 T47 = FMA(KP618033988, T3X, T44);
Chris@82 376 Tm = T8 - Tl;
Chris@82 377 T1f = TN + T1e;
Chris@82 378 T28 = T1G + T27;
Chris@82 379 T29 = T1f + T28;
Chris@82 380 T3O = FNMS(KP250000000, T29, Tm);
Chris@82 381 T3P = T1f - T28;
Chris@82 382 }
Chris@82 383 ci[WS(rs, 9)] = Tm + T29;
Chris@82 384 T46 = FMA(KP559016994, T3P, T3O);
Chris@82 385 ci[WS(rs, 5)] = FNMS(KP951056516, T47, T46);
Chris@82 386 cr[WS(rs, 6)] = FMA(KP951056516, T47, T46);
Chris@82 387 T3Q = FNMS(KP559016994, T3P, T3O);
Chris@82 388 cr[WS(rs, 2)] = FNMS(KP951056516, T45, T3Q);
Chris@82 389 ci[WS(rs, 1)] = FMA(KP951056516, T45, T3Q);
Chris@82 390 }
/* Stores cr[5], ci[2], ci[6], cr[1], cr[9]. */
Chris@82 391 {
Chris@82 392 E T3L, T3N, T3u, T3B, T3C, T3D, T3M, T3E;
Chris@82 393 {
Chris@82 394 E T3H, T3K, T3x, T3A;
Chris@82 395 T3H = T3F - T3G;
Chris@82 396 T3K = T3I - T3J;
Chris@82 397 T3L = FMA(KP618033988, T3K, T3H);
Chris@82 398 T3N = FNMS(KP618033988, T3H, T3K);
Chris@82 399 T3u = T2i + T2n;
Chris@82 400 T3x = T3v + T3w;
Chris@82 401 T3A = T3y + T3z;
Chris@82 402 T3B = T3x + T3A;
Chris@82 403 T3C = FNMS(KP250000000, T3B, T3u);
Chris@82 404 T3D = T3x - T3A;
Chris@82 405 }
Chris@82 406 cr[WS(rs, 5)] = T3u + T3B;
Chris@82 407 T3M = FNMS(KP559016994, T3D, T3C);
Chris@82 408 ci[WS(rs, 2)] = FNMS(KP951056516, T3N, T3M);
Chris@82 409 ci[WS(rs, 6)] = FMA(KP951056516, T3N, T3M);
Chris@82 410 T3E = FMA(KP559016994, T3D, T3C);
Chris@82 411 cr[WS(rs, 1)] = FMA(KP951056516, T3L, T3E);
Chris@82 412 cr[WS(rs, 9)] = FNMS(KP951056516, T3L, T3E);
Chris@82 413 }
/* Stores cr[0], ci[7], cr[8], cr[4], ci[3]. */
Chris@82 414 {
Chris@82 415 E T4h, T4j, T2a, T2h, T48, T49, T4i, T4a;
Chris@82 416 {
Chris@82 417 E T4d, T4g, T2d, T2g;
Chris@82 418 T4d = T4b - T4c;
Chris@82 419 T4g = T4e - T4f;
Chris@82 420 T4h = FMA(KP618033988, T4g, T4d);
Chris@82 421 T4j = FNMS(KP618033988, T4d, T4g);
Chris@82 422 T2a = T8 + Tl;
Chris@82 423 T2d = T2b + T2c;
Chris@82 424 T2g = T2e + T2f;
Chris@82 425 T2h = T2d + T2g;
Chris@82 426 T48 = FNMS(KP250000000, T2h, T2a);
Chris@82 427 T49 = T2d - T2g;
Chris@82 428 }
Chris@82 429 cr[0] = T2a + T2h;
Chris@82 430 T4i = FNMS(KP559016994, T49, T48);
Chris@82 431 ci[WS(rs, 7)] = FNMS(KP951056516, T4j, T4i);
Chris@82 432 cr[WS(rs, 8)] = FMA(KP951056516, T4j, T4i);
Chris@82 433 T4a = FMA(KP559016994, T49, T48);
Chris@82 434 cr[WS(rs, 4)] = FNMS(KP951056516, T4h, T4a);
Chris@82 435 ci[WS(rs, 3)] = FMA(KP951056516, T4h, T4a);
Chris@82 436 }
/* Stores ci[4], cr[3], cr[7], ci[0], ci[8]. */
Chris@82 437 {
Chris@82 438 E T3r, T3t, T2o, T2T, T2U, T2V, T3s, T2W;
Chris@82 439 {
Chris@82 440 E T3b, T3q, T2D, T2S;
Chris@82 441 T3b = T33 - T3a;
Chris@82 442 T3q = T3i - T3p;
Chris@82 443 T3r = FMA(KP618033988, T3q, T3b);
Chris@82 444 T3t = FNMS(KP618033988, T3b, T3q);
Chris@82 445 T2o = T2i - T2n;
Chris@82 446 T2D = T2v + T2C;
Chris@82 447 T2S = T2K + T2R;
Chris@82 448 T2T = T2D + T2S;
Chris@82 449 T2U = FNMS(KP250000000, T2T, T2o);
Chris@82 450 T2V = T2D - T2S;
Chris@82 451 }
Chris@82 452 ci[WS(rs, 4)] = T2o + T2T;
Chris@82 453 T3s = FNMS(KP559016994, T2V, T2U);
Chris@82 454 cr[WS(rs, 3)] = FMA(KP951056516, T3t, T3s);
Chris@82 455 cr[WS(rs, 7)] = FNMS(KP951056516, T3t, T3s);
Chris@82 456 T2W = FMA(KP559016994, T2V, T2U);
Chris@82 457 ci[0] = FNMS(KP951056516, T3r, T2W);
Chris@82 458 ci[WS(rs, 8)] = FMA(KP951056516, T3r, T2W);
Chris@82 459 }
/* Stores cr[10], cr[18], ci[17], cr[14], ci[13]. */
Chris@82 460 {
Chris@82 461 E T4y, T4A, T4s, T4m, T4t, T4u, T4z, T4v;
Chris@82 462 {
Chris@82 463 E T4w, T4x, T4k, T4l;
Chris@82 464 T4w = T1e - TN;
Chris@82 465 T4x = T1G - T27;
Chris@82 466 T4y = FNMS(KP618033988, T4x, T4w);
Chris@82 467 T4A = FMA(KP618033988, T4w, T4x);
Chris@82 468 T4s = T4q - T4r;
Chris@82 469 T4k = T43 + T40;
Chris@82 470 T4l = T3W + T3T;
Chris@82 471 T4m = T4k + T4l;
Chris@82 472 T4t = FMA(KP250000000, T4m, T4s);
Chris@82 473 T4u = T4l - T4k;
Chris@82 474 }
Chris@82 475 cr[WS(rs, 10)] = T4m - T4s;
Chris@82 476 T4z = FNMS(KP559016994, T4u, T4t);
Chris@82 477 cr[WS(rs, 18)] = FMS(KP951056516, T4A, T4z);
Chris@82 478 ci[WS(rs, 17)] = FMA(KP951056516, T4A, T4z);
Chris@82 479 T4v = FMA(KP559016994, T4u, T4t);
Chris@82 480 cr[WS(rs, 14)] = FMS(KP951056516, T4y, T4v);
Chris@82 481 ci[WS(rs, 13)] = FMA(KP951056516, T4y, T4v);
Chris@82 482 }
/* Stores ci[14], ci[10], ci[18], cr[13], cr[17]. */
Chris@82 483 {
Chris@82 484 E T4Y, T50, T4P, T4S, T4T, T4U, T4Z, T4V;
Chris@82 485 {
Chris@82 486 E T4W, T4X, T4Q, T4R;
Chris@82 487 T4W = T3y - T3z;
Chris@82 488 T4X = T3v - T3w;
Chris@82 489 T4Y = FNMS(KP618033988, T4X, T4W);
Chris@82 490 T50 = FMA(KP618033988, T4W, T4X);
Chris@82 491 T4P = T4N - T4O;
Chris@82 492 T4Q = T3F + T3G;
Chris@82 493 T4R = T3I + T3J;
Chris@82 494 T4S = T4Q + T4R;
Chris@82 495 T4T = FNMS(KP250000000, T4S, T4P);
Chris@82 496 T4U = T4Q - T4R;
Chris@82 497 }
Chris@82 498 ci[WS(rs, 14)] = T4S + T4P;
Chris@82 499 T4Z = FMA(KP559016994, T4U, T4T);
Chris@82 500 ci[WS(rs, 10)] = FMA(KP951056516, T50, T4Z);
Chris@82 501 ci[WS(rs, 18)] = FNMS(KP951056516, T50, T4Z);
Chris@82 502 T4V = FNMS(KP559016994, T4U, T4T);
Chris@82 503 cr[WS(rs, 13)] = FMS(KP951056516, T4Y, T4V);
Chris@82 504 cr[WS(rs, 17)] = -(FMA(KP951056516, T4Y, T4V));
Chris@82 505 }
/* Stores ci[19], cr[16], ci[15], cr[12], ci[11]. */
Chris@82 506 {
Chris@82 507 E T4K, T4M, T4B, T4E, T4F, T4G, T4L, T4H;
Chris@82 508 {
Chris@82 509 E T4I, T4J, T4C, T4D;
Chris@82 510 T4I = T2f - T2e;
Chris@82 511 T4J = T2b - T2c;
Chris@82 512 T4K = FMA(KP618033988, T4J, T4I);
Chris@82 513 T4M = FNMS(KP618033988, T4I, T4J);
Chris@82 514 T4B = T4r + T4q;
Chris@82 515 T4C = T4b + T4c;
Chris@82 516 T4D = T4e + T4f;
Chris@82 517 T4E = T4C + T4D;
Chris@82 518 T4F = FNMS(KP250000000, T4E, T4B);
Chris@82 519 T4G = T4C - T4D;
Chris@82 520 }
Chris@82 521 ci[WS(rs, 19)] = T4E + T4B;
Chris@82 522 T4L = FMA(KP559016994, T4G, T4F);
Chris@82 523 cr[WS(rs, 16)] = FMS(KP951056516, T4M, T4L);
Chris@82 524 ci[WS(rs, 15)] = FMA(KP951056516, T4M, T4L);
Chris@82 525 T4H = FNMS(KP559016994, T4G, T4F);
Chris@82 526 cr[WS(rs, 12)] = FMS(KP951056516, T4K, T4H);
Chris@82 527 ci[WS(rs, 11)] = FMA(KP951056516, T4K, T4H);
Chris@82 528 }
/* Stores cr[15], ci[12], ci[16], cr[11], cr[19]. */
Chris@82 529 {
Chris@82 530 E T5a, T5c, T54, T53, T55, T56, T5b, T57;
Chris@82 531 {
Chris@82 532 E T58, T59, T51, T52;
Chris@82 533 T58 = T2v - T2C;
Chris@82 534 T59 = T2K - T2R;
Chris@82 535 T5a = FMA(KP618033988, T59, T58);
Chris@82 536 T5c = FNMS(KP618033988, T58, T59);
Chris@82 537 T54 = T4O + T4N;
Chris@82 538 T51 = T3a + T33;
Chris@82 539 T52 = T3p + T3i;
Chris@82 540 T53 = T51 + T52;
Chris@82 541 T55 = FMA(KP250000000, T53, T54);
Chris@82 542 T56 = T51 - T52;
Chris@82 543 }
Chris@82 544 cr[WS(rs, 15)] = T53 - T54;
Chris@82 545 T5b = FMA(KP559016994, T56, T55);
Chris@82 546 ci[WS(rs, 12)] = FMA(KP951056516, T5c, T5b);
Chris@82 547 ci[WS(rs, 16)] = FNMS(KP951056516, T5c, T5b);
Chris@82 548 T57 = FNMS(KP559016994, T56, T55);
Chris@82 549 cr[WS(rs, 11)] = FMS(KP951056516, T5a, T57);
Chris@82 550 cr[WS(rs, 19)] = -(FMA(KP951056516, T5a, T57));
Chris@82 551 }
Chris@82 552 }
Chris@82 553 }
Chris@82 554 }
Chris@82 555
/*
 * Twiddle instruction table referenced by desc below: one TW_FULL entry
 * carrying the values 1 and 20, followed by a TW_NEXT entry.
 * NOTE(review): TW_FULL presumably requests the full set of 19 twiddle
 * pairs for size 20 (hf_20 consumes 38 W entries per pass) and TW_NEXT
 * presumably ends the list -- confirm against the tw_instr definition.
 */
Chris@82 556 static const tw_instr twinstr[] = {
Chris@82 557 {TW_FULL, 1, 20},
Chris@82 558 {TW_NEXT, 1, 0}
Chris@82 559 };
Chris@82 560
/*
 * Codelet descriptor for the FMA variant: size 20, name "hf_20", the
 * twiddle table above, &GENUS, and the counts {136, 38, 110, 0}, which
 * match the additions / multiplications / fused multiply-adds quoted in
 * the generator comment for this variant.
 */
Chris@82 561 static const hc2hc_desc desc = { 20, "hf_20", twinstr, &GENUS, {136, 38, 110, 0} };
Chris@82 562
/* Registration hook: hands hf_20 and its descriptor to planner p via X(khc2hc_register). */
Chris@82 563 void X(codelet_hf_20) (planner *p) {
Chris@82 564 X(khc2hc_register) (p, hf_20, &desc);
Chris@82 565 }
Chris@82 566 #else
Chris@82 567
Chris@82 568 /* Generated by: ../../../genfft/gen_hc2hc.native -compact -variables 4 -pipeline-latency 4 -n 20 -dit -name hf_20 -include rdft/scalar/hf.h */
Chris@82 569
Chris@82 570 /*
Chris@82 571 * This function contains 246 FP additions, 124 FP multiplications,
Chris@82 572 * (or, 184 additions, 62 multiplications, 62 fused multiply/add),
Chris@82 573 * 85 stack variables, 4 constants, and 80 memory accesses
Chris@82 574 */
Chris@82 575 #include "rdft/scalar/hf.h"
Chris@82 576
/*
 * hf_20 (variant used when neither ARCH_PREFERS_FMA nor
 * ISA_EXTENSION_PREFERS_FMA is defined): size-20 codelet generated by
 * gen_hc2hc with "-n 20 -dit" (see the "Generated by" line above).
 *
 * For every m in [mb, me) one pass reads the 20 values cr[WS(rs, k)] and
 * the 20 values ci[WS(rs, k)], k = 0..19, combines each pair k = 1..19 with
 * the twiddle pair W[2k-2], W[2k-1], and stores 20 results back into cr[]
 * and 20 into ci[] at the same offsets. All loads of a pass come before
 * its stores.
 *
 * Per pass, cr advances by ms, ci moves back by ms, and W advances by 38
 * entries (19 twiddle pairs). W is first offset by (mb - 1) * 38.
 *
 * NOTE(review): the per-block comments assume FMA(a, b, c) = a*b + c and
 * FNMS(a, b, c) = c - a*b; these macros are defined outside this file --
 * confirm against the FFTW kernel headers.
 */
Chris@82 577 static void hf_20(R *cr, R *ci, const R *W, stride rs, INT mb, INT me, INT ms)
Chris@82 578 {
/* Numerically: sin(pi/5), sin(2*pi/5), 1/4 and sqrt(5)/4. */
Chris@82 579 DK(KP587785252, +0.587785252292473129168705954639072768597652438);
Chris@82 580 DK(KP951056516, +0.951056516295153572116439333379382143405698634);
Chris@82 581 DK(KP250000000, +0.250000000000000000000000000000000000000000000);
Chris@82 582 DK(KP559016994, +0.559016994374947424102293417182819058860154590);
Chris@82 583 {
Chris@82 584 INT m;
Chris@82 585 for (m = mb, W = W + ((mb - 1) * 38); m < me; m = m + 1, cr = cr + ms, ci = ci - ms, W = W + 38, MAKE_VOLATILE_STRIDE(40, rs)) {
/* Values that survive from the load phase into the store phase. */
Chris@82 586 E Tj, T1R, T4j, T4s, T2q, T37, T3Q, T42, T1r, T1O, T1P, T3i, T3l, T3J, T3D;
Chris@82 587 E T3E, T44, T1V, T1W, T1X, T2e, T2j, T2k, T2W, T2X, T4f, T33, T34, T35, T2J;
Chris@82 588 E T2O, T4q, TG, T13, T14, T3p, T3s, T3K, T3A, T3B, T43, T1S, T1T, T1U, T23;
Chris@82 589 E T28, T29, T2T, T2U, T4e, T30, T31, T32, T2y, T2D, T4p;
/* Input 0 (used as is) plus inputs 10, 5 and 15 with their twiddle pairs. */
Chris@82 590 {
Chris@82 591 E T1, T3N, T6, T3M, Tc, T2n, Th, T2o;
Chris@82 592 T1 = cr[0];
Chris@82 593 T3N = ci[0];
Chris@82 594 {
Chris@82 595 E T3, T5, T2, T4;
Chris@82 596 T3 = cr[WS(rs, 10)];
Chris@82 597 T5 = ci[WS(rs, 10)];
Chris@82 598 T2 = W[18];
Chris@82 599 T4 = W[19];
Chris@82 600 T6 = FMA(T2, T3, T4 * T5);
Chris@82 601 T3M = FNMS(T4, T3, T2 * T5);
Chris@82 602 }
Chris@82 603 {
Chris@82 604 E T9, Tb, T8, Ta;
Chris@82 605 T9 = cr[WS(rs, 5)];
Chris@82 606 Tb = ci[WS(rs, 5)];
Chris@82 607 T8 = W[8];
Chris@82 608 Ta = W[9];
Chris@82 609 Tc = FMA(T8, T9, Ta * Tb);
Chris@82 610 T2n = FNMS(Ta, T9, T8 * Tb);
Chris@82 611 }
Chris@82 612 {
Chris@82 613 E Te, Tg, Td, Tf;
Chris@82 614 Te = cr[WS(rs, 15)];
Chris@82 615 Tg = ci[WS(rs, 15)];
Chris@82 616 Td = W[28];
Chris@82 617 Tf = W[29];
Chris@82 618 Th = FMA(Td, Te, Tf * Tg);
Chris@82 619 T2o = FNMS(Tf, Te, Td * Tg);
Chris@82 620 }
Chris@82 621 {
Chris@82 622 E T7, Ti, T4h, T4i;
Chris@82 623 T7 = T1 + T6;
Chris@82 624 Ti = Tc + Th;
Chris@82 625 Tj = T7 - Ti;
Chris@82 626 T1R = T7 + Ti;
Chris@82 627 T4h = T3N - T3M;
Chris@82 628 T4i = Tc - Th;
Chris@82 629 T4j = T4h - T4i;
Chris@82 630 T4s = T4i + T4h;
Chris@82 631 }
Chris@82 632 {
Chris@82 633 E T2m, T2p, T3O, T3P;
Chris@82 634 T2m = T1 - T6;
Chris@82 635 T2p = T2n - T2o;
Chris@82 636 T2q = T2m - T2p;
Chris@82 637 T37 = T2m + T2p;
Chris@82 638 T3O = T3M + T3N;
Chris@82 639 T3P = T2n + T2o;
Chris@82 640 T3Q = T3O - T3P;
Chris@82 641 T42 = T3P + T3O;
Chris@82 642 }
Chris@82 643 }
/* Inputs 8, 18, 17, 7, 13, 3, 12 and 2 with their twiddle pairs, then sums/differences. */
Chris@82 644 {
Chris@82 645 E T1f, T3g, T2a, T2H, T1N, T3j, T2i, T2N, T1q, T3h, T2d, T2I, T1C, T3k, T2f;
Chris@82 646 E T2M;
Chris@82 647 {
Chris@82 648 E T19, T2F, T1e, T2G;
Chris@82 649 {
Chris@82 650 E T16, T18, T15, T17;
Chris@82 651 T16 = cr[WS(rs, 8)];
Chris@82 652 T18 = ci[WS(rs, 8)];
Chris@82 653 T15 = W[14];
Chris@82 654 T17 = W[15];
Chris@82 655 T19 = FMA(T15, T16, T17 * T18);
Chris@82 656 T2F = FNMS(T17, T16, T15 * T18);
Chris@82 657 }
Chris@82 658 {
Chris@82 659 E T1b, T1d, T1a, T1c;
Chris@82 660 T1b = cr[WS(rs, 18)];
Chris@82 661 T1d = ci[WS(rs, 18)];
Chris@82 662 T1a = W[34];
Chris@82 663 T1c = W[35];
Chris@82 664 T1e = FMA(T1a, T1b, T1c * T1d);
Chris@82 665 T2G = FNMS(T1c, T1b, T1a * T1d);
Chris@82 666 }
Chris@82 667 T1f = T19 + T1e;
Chris@82 668 T3g = T2F + T2G;
Chris@82 669 T2a = T19 - T1e;
Chris@82 670 T2H = T2F - T2G;
Chris@82 671 }
Chris@82 672 {
Chris@82 673 E T1H, T2g, T1M, T2h;
Chris@82 674 {
Chris@82 675 E T1E, T1G, T1D, T1F;
Chris@82 676 T1E = cr[WS(rs, 17)];
Chris@82 677 T1G = ci[WS(rs, 17)];
Chris@82 678 T1D = W[32];
Chris@82 679 T1F = W[33];
Chris@82 680 T1H = FMA(T1D, T1E, T1F * T1G);
Chris@82 681 T2g = FNMS(T1F, T1E, T1D * T1G);
Chris@82 682 }
Chris@82 683 {
Chris@82 684 E T1J, T1L, T1I, T1K;
Chris@82 685 T1J = cr[WS(rs, 7)];
Chris@82 686 T1L = ci[WS(rs, 7)];
Chris@82 687 T1I = W[12];
Chris@82 688 T1K = W[13];
Chris@82 689 T1M = FMA(T1I, T1J, T1K * T1L);
Chris@82 690 T2h = FNMS(T1K, T1J, T1I * T1L);
Chris@82 691 }
Chris@82 692 T1N = T1H + T1M;
Chris@82 693 T3j = T2g + T2h;
Chris@82 694 T2i = T2g - T2h;
Chris@82 695 T2N = T1H - T1M;
Chris@82 696 }
Chris@82 697 {
Chris@82 698 E T1k, T2b, T1p, T2c;
Chris@82 699 {
Chris@82 700 E T1h, T1j, T1g, T1i;
Chris@82 701 T1h = cr[WS(rs, 13)];
Chris@82 702 T1j = ci[WS(rs, 13)];
Chris@82 703 T1g = W[24];
Chris@82 704 T1i = W[25];
Chris@82 705 T1k = FMA(T1g, T1h, T1i * T1j);
Chris@82 706 T2b = FNMS(T1i, T1h, T1g * T1j);
Chris@82 707 }
Chris@82 708 {
Chris@82 709 E T1m, T1o, T1l, T1n;
Chris@82 710 T1m = cr[WS(rs, 3)];
Chris@82 711 T1o = ci[WS(rs, 3)];
Chris@82 712 T1l = W[4];
Chris@82 713 T1n = W[5];
Chris@82 714 T1p = FMA(T1l, T1m, T1n * T1o);
Chris@82 715 T2c = FNMS(T1n, T1m, T1l * T1o);
Chris@82 716 }
Chris@82 717 T1q = T1k + T1p;
Chris@82 718 T3h = T2b + T2c;
Chris@82 719 T2d = T2b - T2c;
Chris@82 720 T2I = T1k - T1p;
Chris@82 721 }
Chris@82 722 {
Chris@82 723 E T1w, T2K, T1B, T2L;
Chris@82 724 {
Chris@82 725 E T1t, T1v, T1s, T1u;
Chris@82 726 T1t = cr[WS(rs, 12)];
Chris@82 727 T1v = ci[WS(rs, 12)];
Chris@82 728 T1s = W[22];
Chris@82 729 T1u = W[23];
Chris@82 730 T1w = FMA(T1s, T1t, T1u * T1v);
Chris@82 731 T2K = FNMS(T1u, T1t, T1s * T1v);
Chris@82 732 }
Chris@82 733 {
Chris@82 734 E T1y, T1A, T1x, T1z;
Chris@82 735 T1y = cr[WS(rs, 2)];
Chris@82 736 T1A = ci[WS(rs, 2)];
Chris@82 737 T1x = W[2];
Chris@82 738 T1z = W[3];
Chris@82 739 T1B = FMA(T1x, T1y, T1z * T1A);
Chris@82 740 T2L = FNMS(T1z, T1y, T1x * T1A);
Chris@82 741 }
Chris@82 742 T1C = T1w + T1B;
Chris@82 743 T3k = T2K + T2L;
Chris@82 744 T2f = T1w - T1B;
Chris@82 745 T2M = T2K - T2L;
Chris@82 746 }
Chris@82 747 T1r = T1f - T1q;
Chris@82 748 T1O = T1C - T1N;
Chris@82 749 T1P = T1r + T1O;
Chris@82 750 T3i = T3g - T3h;
Chris@82 751 T3l = T3j - T3k;
Chris@82 752 T3J = T3l - T3i;
Chris@82 753 T3D = T3g + T3h;
Chris@82 754 T3E = T3k + T3j;
Chris@82 755 T44 = T3D + T3E;
Chris@82 756 T1V = T1f + T1q;
Chris@82 757 T1W = T1C + T1N;
Chris@82 758 T1X = T1V + T1W;
Chris@82 759 T2e = T2a - T2d;
Chris@82 760 T2j = T2f - T2i;
Chris@82 761 T2k = T2e + T2j;
Chris@82 762 T2W = T2H - T2I;
Chris@82 763 T2X = T2M - T2N;
Chris@82 764 T4f = T2W + T2X;
Chris@82 765 T33 = T2a + T2d;
Chris@82 766 T34 = T2f + T2i;
Chris@82 767 T35 = T33 + T34;
Chris@82 768 T2J = T2H + T2I;
Chris@82 769 T2O = T2M + T2N;
Chris@82 770 T4q = T2J + T2O;
Chris@82 771 }
/* Inputs 4, 14, 1, 11, 9, 19, 16 and 6 with their twiddle pairs, then sums/differences. */
Chris@82 772 {
Chris@82 773 E Tu, T3n, T1Z, T2w, T12, T3r, T27, T2z, TF, T3o, T22, T2x, TR, T3q, T24;
Chris@82 774 E T2C;
Chris@82 775 {
Chris@82 776 E To, T2u, Tt, T2v;
Chris@82 777 {
Chris@82 778 E Tl, Tn, Tk, Tm;
Chris@82 779 Tl = cr[WS(rs, 4)];
Chris@82 780 Tn = ci[WS(rs, 4)];
Chris@82 781 Tk = W[6];
Chris@82 782 Tm = W[7];
Chris@82 783 To = FMA(Tk, Tl, Tm * Tn);
Chris@82 784 T2u = FNMS(Tm, Tl, Tk * Tn);
Chris@82 785 }
Chris@82 786 {
Chris@82 787 E Tq, Ts, Tp, Tr;
Chris@82 788 Tq = cr[WS(rs, 14)];
Chris@82 789 Ts = ci[WS(rs, 14)];
Chris@82 790 Tp = W[26];
Chris@82 791 Tr = W[27];
Chris@82 792 Tt = FMA(Tp, Tq, Tr * Ts);
Chris@82 793 T2v = FNMS(Tr, Tq, Tp * Ts);
Chris@82 794 }
Chris@82 795 Tu = To + Tt;
Chris@82 796 T3n = T2u + T2v;
Chris@82 797 T1Z = To - Tt;
Chris@82 798 T2w = T2u - T2v;
Chris@82 799 }
Chris@82 800 {
Chris@82 801 E TW, T25, T11, T26;
Chris@82 802 {
Chris@82 803 E TT, TV, TS, TU;
Chris@82 804 TT = cr[WS(rs, 1)];
Chris@82 805 TV = ci[WS(rs, 1)];
Chris@82 806 TS = W[0];
Chris@82 807 TU = W[1];
Chris@82 808 TW = FMA(TS, TT, TU * TV);
Chris@82 809 T25 = FNMS(TU, TT, TS * TV);
Chris@82 810 }
Chris@82 811 {
Chris@82 812 E TY, T10, TX, TZ;
Chris@82 813 TY = cr[WS(rs, 11)];
Chris@82 814 T10 = ci[WS(rs, 11)];
Chris@82 815 TX = W[20];
Chris@82 816 TZ = W[21];
Chris@82 817 T11 = FMA(TX, TY, TZ * T10);
Chris@82 818 T26 = FNMS(TZ, TY, TX * T10);
Chris@82 819 }
Chris@82 820 T12 = TW + T11;
Chris@82 821 T3r = T25 + T26;
Chris@82 822 T27 = T25 - T26;
Chris@82 823 T2z = T11 - TW;
Chris@82 824 }
Chris@82 825 {
Chris@82 826 E Tz, T20, TE, T21;
Chris@82 827 {
Chris@82 828 E Tw, Ty, Tv, Tx;
Chris@82 829 Tw = cr[WS(rs, 9)];
Chris@82 830 Ty = ci[WS(rs, 9)];
Chris@82 831 Tv = W[16];
Chris@82 832 Tx = W[17];
Chris@82 833 Tz = FMA(Tv, Tw, Tx * Ty);
Chris@82 834 T20 = FNMS(Tx, Tw, Tv * Ty);
Chris@82 835 }
Chris@82 836 {
Chris@82 837 E TB, TD, TA, TC;
Chris@82 838 TB = cr[WS(rs, 19)];
Chris@82 839 TD = ci[WS(rs, 19)];
Chris@82 840 TA = W[36];
Chris@82 841 TC = W[37];
Chris@82 842 TE = FMA(TA, TB, TC * TD);
Chris@82 843 T21 = FNMS(TC, TB, TA * TD);
Chris@82 844 }
Chris@82 845 TF = Tz + TE;
Chris@82 846 T3o = T20 + T21;
Chris@82 847 T22 = T20 - T21;
Chris@82 848 T2x = Tz - TE;
Chris@82 849 }
Chris@82 850 {
Chris@82 851 E TL, T2A, TQ, T2B;
Chris@82 852 {
Chris@82 853 E TI, TK, TH, TJ;
Chris@82 854 TI = cr[WS(rs, 16)];
Chris@82 855 TK = ci[WS(rs, 16)];
Chris@82 856 TH = W[30];
Chris@82 857 TJ = W[31];
Chris@82 858 TL = FMA(TH, TI, TJ * TK);
Chris@82 859 T2A = FNMS(TJ, TI, TH * TK);
Chris@82 860 }
Chris@82 861 {
Chris@82 862 E TN, TP, TM, TO;
Chris@82 863 TN = cr[WS(rs, 6)];
Chris@82 864 TP = ci[WS(rs, 6)];
Chris@82 865 TM = W[10];
Chris@82 866 TO = W[11];
Chris@82 867 TQ = FMA(TM, TN, TO * TP);
Chris@82 868 T2B = FNMS(TO, TN, TM * TP);
Chris@82 869 }
Chris@82 870 TR = TL + TQ;
Chris@82 871 T3q = T2A + T2B;
Chris@82 872 T24 = TL - TQ;
Chris@82 873 T2C = T2A - T2B;
Chris@82 874 }
Chris@82 875 TG = Tu - TF;
Chris@82 876 T13 = TR - T12;
Chris@82 877 T14 = TG + T13;
Chris@82 878 T3p = T3n - T3o;
Chris@82 879 T3s = T3q - T3r;
Chris@82 880 T3K = T3p + T3s;
Chris@82 881 T3A = T3n + T3o;
Chris@82 882 T3B = T3q + T3r;
Chris@82 883 T43 = T3A + T3B;
Chris@82 884 T1S = Tu + TF;
Chris@82 885 T1T = TR + T12;
Chris@82 886 T1U = T1S + T1T;
Chris@82 887 T23 = T1Z - T22;
Chris@82 888 T28 = T24 - T27;
Chris@82 889 T29 = T23 + T28;
Chris@82 890 T2T = T2w - T2x;
Chris@82 891 T2U = T2C + T2z;
Chris@82 892 T4e = T2T + T2U;
Chris@82 893 T30 = T1Z + T22;
Chris@82 894 T31 = T24 + T27;
Chris@82 895 T32 = T30 + T31;
Chris@82 896 T2y = T2w + T2x;
Chris@82 897 T2D = T2z - T2C;
Chris@82 898 T4p = T2D - T2y;
Chris@82 899 }
/* Stores ci[9], ci[5], cr[6], cr[2], ci[1]. */
Chris@82 900 {
Chris@82 901 E T3e, T1Q, T3d, T3u, T3w, T3m, T3t, T3v, T3f;
Chris@82 902 T3e = KP559016994 * (T14 - T1P);
Chris@82 903 T1Q = T14 + T1P;
Chris@82 904 T3d = FNMS(KP250000000, T1Q, Tj);
Chris@82 905 T3m = T3i + T3l;
Chris@82 906 T3t = T3p - T3s;
Chris@82 907 T3u = FNMS(KP587785252, T3t, KP951056516 * T3m);
Chris@82 908 T3w = FMA(KP951056516, T3t, KP587785252 * T3m);
Chris@82 909 ci[WS(rs, 9)] = Tj + T1Q;
Chris@82 910 T3v = T3e + T3d;
Chris@82 911 ci[WS(rs, 5)] = T3v - T3w;
Chris@82 912 cr[WS(rs, 6)] = T3v + T3w;
Chris@82 913 T3f = T3d - T3e;
Chris@82 914 cr[WS(rs, 2)] = T3f - T3u;
Chris@82 915 ci[WS(rs, 1)] = T3f + T3u;
Chris@82 916 }
/* Stores cr[5], ci[2], ci[6], cr[1], cr[9]. */
Chris@82 917 {
Chris@82 918 E T36, T38, T39, T2Z, T3c, T2V, T2Y, T3b, T3a;
Chris@82 919 T36 = KP559016994 * (T32 - T35);
Chris@82 920 T38 = T32 + T35;
Chris@82 921 T39 = FNMS(KP250000000, T38, T37);
Chris@82 922 T2V = T2T - T2U;
Chris@82 923 T2Y = T2W - T2X;
Chris@82 924 T2Z = FMA(KP951056516, T2V, KP587785252 * T2Y);
Chris@82 925 T3c = FNMS(KP587785252, T2V, KP951056516 * T2Y);
Chris@82 926 cr[WS(rs, 5)] = T37 + T38;
Chris@82 927 T3b = T39 - T36;
Chris@82 928 ci[WS(rs, 2)] = T3b - T3c;
Chris@82 929 ci[WS(rs, 6)] = T3c + T3b;
Chris@82 930 T3a = T36 + T39;
Chris@82 931 cr[WS(rs, 1)] = T2Z + T3a;
Chris@82 932 cr[WS(rs, 9)] = T3a - T2Z;
Chris@82 933 }
/* Stores cr[0], ci[7], cr[8], cr[4], ci[3]. */
Chris@82 934 {
Chris@82 935 E T3x, T1Y, T3y, T3G, T3I, T3C, T3F, T3H, T3z;
Chris@82 936 T3x = KP559016994 * (T1U - T1X);
Chris@82 937 T1Y = T1U + T1X;
Chris@82 938 T3y = FNMS(KP250000000, T1Y, T1R);
Chris@82 939 T3C = T3A - T3B;
Chris@82 940 T3F = T3D - T3E;
Chris@82 941 T3G = FMA(KP951056516, T3C, KP587785252 * T3F);
Chris@82 942 T3I = FNMS(KP587785252, T3C, KP951056516 * T3F);
Chris@82 943 cr[0] = T1R + T1Y;
Chris@82 944 T3H = T3y - T3x;
Chris@82 945 ci[WS(rs, 7)] = T3H - T3I;
Chris@82 946 cr[WS(rs, 8)] = T3H + T3I;
Chris@82 947 T3z = T3x + T3y;
Chris@82 948 cr[WS(rs, 4)] = T3z - T3G;
Chris@82 949 ci[WS(rs, 3)] = T3z + T3G;
Chris@82 950 }
/* Stores ci[4], cr[3], cr[7], ci[0], ci[8]. */
Chris@82 951 {
Chris@82 952 E T2l, T2r, T2s, T2Q, T2R, T2E, T2P, T2S, T2t;
Chris@82 953 T2l = KP559016994 * (T29 - T2k);
Chris@82 954 T2r = T29 + T2k;
Chris@82 955 T2s = FNMS(KP250000000, T2r, T2q);
Chris@82 956 T2E = T2y + T2D;
Chris@82 957 T2P = T2J - T2O;
Chris@82 958 T2Q = FMA(KP951056516, T2E, KP587785252 * T2P);
Chris@82 959 T2R = FNMS(KP587785252, T2E, KP951056516 * T2P);
Chris@82 960 ci[WS(rs, 4)] = T2q + T2r;
Chris@82 961 T2S = T2s - T2l;
Chris@82 962 cr[WS(rs, 3)] = T2R + T2S;
Chris@82 963 cr[WS(rs, 7)] = T2S - T2R;
Chris@82 964 T2t = T2l + T2s;
Chris@82 965 ci[0] = T2t - T2Q;
Chris@82 966 ci[WS(rs, 8)] = T2Q + T2t;
Chris@82 967 }
/* Stores cr[10], cr[18], ci[17], cr[14], ci[13]. */
Chris@82 968 {
Chris@82 969 E T3U, T3L, T3V, T3T, T3X, T3R, T3S, T3Y, T3W;
Chris@82 970 T3U = KP559016994 * (T3K + T3J);
Chris@82 971 T3L = T3J - T3K;
Chris@82 972 T3V = FMA(KP250000000, T3L, T3Q);
Chris@82 973 T3R = T13 - TG;
Chris@82 974 T3S = T1r - T1O;
Chris@82 975 T3T = FNMS(KP587785252, T3S, KP951056516 * T3R);
Chris@82 976 T3X = FMA(KP587785252, T3R, KP951056516 * T3S);
Chris@82 977 cr[WS(rs, 10)] = T3L - T3Q;
Chris@82 978 T3Y = T3V - T3U;
Chris@82 979 cr[WS(rs, 18)] = T3X - T3Y;
Chris@82 980 ci[WS(rs, 17)] = T3X + T3Y;
Chris@82 981 T3W = T3U + T3V;
Chris@82 982 cr[WS(rs, 14)] = T3T - T3W;
Chris@82 983 ci[WS(rs, 13)] = T3T + T3W;
Chris@82 984 }
/* Stores ci[14], ci[10], ci[18], cr[13], cr[17]. */
Chris@82 985 {
Chris@82 986 E T4g, T4k, T4l, T4d, T4n, T4b, T4c, T4o, T4m;
Chris@82 987 T4g = KP559016994 * (T4e - T4f);
Chris@82 988 T4k = T4e + T4f;
Chris@82 989 T4l = FNMS(KP250000000, T4k, T4j);
Chris@82 990 T4b = T33 - T34;
Chris@82 991 T4c = T30 - T31;
Chris@82 992 T4d = FNMS(KP587785252, T4c, KP951056516 * T4b);
Chris@82 993 T4n = FMA(KP951056516, T4c, KP587785252 * T4b);
Chris@82 994 ci[WS(rs, 14)] = T4k + T4j;
Chris@82 995 T4o = T4g + T4l;
Chris@82 996 ci[WS(rs, 10)] = T4n + T4o;
Chris@82 997 ci[WS(rs, 18)] = T4o - T4n;
Chris@82 998 T4m = T4g - T4l;
Chris@82 999 cr[WS(rs, 13)] = T4d + T4m;
Chris@82 1000 cr[WS(rs, 17)] = T4m - T4d;
Chris@82 1001 }
/* Stores ci[19], cr[16], ci[15], cr[12], ci[11]. */
Chris@82 1002 {
Chris@82 1003 E T47, T45, T46, T41, T49, T3Z, T40, T4a, T48;
Chris@82 1004 T47 = KP559016994 * (T43 - T44);
Chris@82 1005 T45 = T43 + T44;
Chris@82 1006 T46 = FNMS(KP250000000, T45, T42);
Chris@82 1007 T3Z = T1S - T1T;
Chris@82 1008 T40 = T1V - T1W;
Chris@82 1009 T41 = FNMS(KP951056516, T40, KP587785252 * T3Z);
Chris@82 1010 T49 = FMA(KP951056516, T3Z, KP587785252 * T40);
Chris@82 1011 ci[WS(rs, 19)] = T45 + T42;
Chris@82 1012 T4a = T47 + T46;
Chris@82 1013 cr[WS(rs, 16)] = T49 - T4a;
Chris@82 1014 ci[WS(rs, 15)] = T49 + T4a;
Chris@82 1015 T48 = T46 - T47;
Chris@82 1016 cr[WS(rs, 12)] = T41 - T48;
Chris@82 1017 ci[WS(rs, 11)] = T41 + T48;
Chris@82 1018 }
/* Stores cr[15], ci[12], ci[16], cr[11], cr[19]. */
Chris@82 1019 {
Chris@82 1020 E T4w, T4r, T4x, T4v, T4z, T4t, T4u, T4A, T4y;
Chris@82 1021 T4w = KP559016994 * (T4p + T4q);
Chris@82 1022 T4r = T4p - T4q;
Chris@82 1023 T4x = FMA(KP250000000, T4r, T4s);
Chris@82 1024 T4t = T23 - T28;
Chris@82 1025 T4u = T2e - T2j;
Chris@82 1026 T4v = FMA(KP951056516, T4t, KP587785252 * T4u);
Chris@82 1027 T4z = FNMS(KP587785252, T4t, KP951056516 * T4u);
Chris@82 1028 cr[WS(rs, 15)] = T4r - T4s;
Chris@82 1029 T4A = T4w + T4x;
Chris@82 1030 ci[WS(rs, 12)] = T4z + T4A;
Chris@82 1031 ci[WS(rs, 16)] = T4A - T4z;
Chris@82 1032 T4y = T4w - T4x;
Chris@82 1033 cr[WS(rs, 11)] = T4v + T4y;
Chris@82 1034 cr[WS(rs, 19)] = T4y - T4v;
Chris@82 1035 }
Chris@82 1036 }
Chris@82 1037 }
Chris@82 1038 }
Chris@82 1039
/*
 * Twiddle instruction table referenced by desc below: one TW_FULL entry
 * carrying the values 1 and 20, followed by a TW_NEXT entry.
 * NOTE(review): TW_FULL presumably requests the full set of 19 twiddle
 * pairs for size 20 (hf_20 consumes 38 W entries per pass) and TW_NEXT
 * presumably ends the list -- confirm against the tw_instr definition.
 */
Chris@82 1040 static const tw_instr twinstr[] = {
Chris@82 1041 {TW_FULL, 1, 20},
Chris@82 1042 {TW_NEXT, 1, 0}
Chris@82 1043 };
Chris@82 1044
/*
 * Codelet descriptor for the non-FMA variant: size 20, name "hf_20", the
 * twiddle table above, &GENUS, and the counts {184, 62, 62, 0}, which
 * match the additions / multiplications / fused multiply-adds quoted in
 * the generator comment for this variant.
 */
Chris@82 1045 static const hc2hc_desc desc = { 20, "hf_20", twinstr, &GENUS, {184, 62, 62, 0} };
Chris@82 1046
/* Registration hook: hands hf_20 and its descriptor to planner p via X(khc2hc_register). */
Chris@82 1047 void X(codelet_hf_20) (planner *p) {
Chris@82 1048 X(khc2hc_register) (p, hf_20, &desc);
Chris@82 1049 }
Chris@82 1050 #endif