annotate src/fftw-3.3.5/rdft/scalar/r2cf/hf_20.c @ 83:ae30d91d2ffe

Replace these with versions built using an older toolset (so as to avoid ABI incompatibilities when linking on Ubuntu 14.04 for packaging purposes)
author Chris Cannam
date Fri, 07 Feb 2020 11:51:13 +0000
parents 2cd0e3b3e1fd
children
rev   line source
Chris@42 1 /*
Chris@42 2 * Copyright (c) 2003, 2007-14 Matteo Frigo
Chris@42 3 * Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
Chris@42 4 *
Chris@42 5 * This program is free software; you can redistribute it and/or modify
Chris@42 6 * it under the terms of the GNU General Public License as published by
Chris@42 7 * the Free Software Foundation; either version 2 of the License, or
Chris@42 8 * (at your option) any later version.
Chris@42 9 *
Chris@42 10 * This program is distributed in the hope that it will be useful,
Chris@42 11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
Chris@42 12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
Chris@42 13 * GNU General Public License for more details.
Chris@42 14 *
Chris@42 15 * You should have received a copy of the GNU General Public License
Chris@42 16 * along with this program; if not, write to the Free Software
Chris@42 17 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
Chris@42 18 *
Chris@42 19 */
Chris@42 20
Chris@42 21 /* This file was automatically generated --- DO NOT EDIT */
Chris@42 22 /* Generated on Sat Jul 30 16:46:40 EDT 2016 */
Chris@42 23
Chris@42 24 #include "codelet-rdft.h"
Chris@42 25
Chris@42 26 #ifdef HAVE_FMA
Chris@42 27
Chris@42 28 /* Generated by: ../../../genfft/gen_hc2hc.native -fma -reorder-insns -schedule-for-pipeline -compact -variables 4 -pipeline-latency 4 -n 20 -dit -name hf_20 -include hf.h */
Chris@42 29
Chris@42 30 /*
Chris@42 31 * This function contains 246 FP additions, 148 FP multiplications,
Chris@42 32 * (or, 136 additions, 38 multiplications, 110 fused multiply/add),
Chris@42 33 * 100 stack variables, 4 constants, and 80 memory accesses
Chris@42 34 */
Chris@42 35 #include "hf.h"
Chris@42 36
/*
 * hf_20 (HAVE_FMA variant): size-20 twiddle codelet emitted by genfft's
 * gen_hc2hc with "-n 20 -dit" (see the "Generated by" comment above).
 *
 * Parameters:
 *   cr, ci  - data arrays, read and then overwritten in place at offsets 0
 *             and WS(rs, 1) .. WS(rs, 19); after every iteration cr moves
 *             by +ms and ci by -ms.
 *   W       - twiddle table; W[0] .. W[37] are read per iteration, the
 *             pointer is pre-offset by (mb - 1) * 38 and then advanced by 38
 *             per iteration.
 *   rs      - stride passed to WS() to locate the 20 elements.
 *   mb, me  - the loop runs for m in [mb, me).
 *   ms      - per-iteration pointer step applied to cr and ci.
 *
 * Per iteration, element 0 is used as loaded, while each element k = 1..19
 * is first combined with the twiddle pair (W[2k-2], W[2k-1]).  Assuming the
 * usual FFTW macro meanings FMA(a,b,c) = a*b + c and FNMS(a,b,c) = c - a*b
 * (defined outside this file -- TODO confirm), that combination is
 *     W[2k-2]*cr[k] + W[2k-1]*ci[k]   and   W[2k-2]*ci[k] - W[2k-1]*cr[k].
 * The results are then mixed by additions/subtractions and the four DK
 * constants, and all 20 cr slots and all 20 ci slots are stored.
 *
 * The T* temporaries are generator-assigned names; their declaration
 * scopes and statement order come from the genfft scheduler.
 */
Chris@42 37 static void hf_20(R *cr, R *ci, const R *W, stride rs, INT mb, INT me, INT ms)
Chris@42 38 {
/* Constants (DK is defined elsewhere; presumably it declares a named
 * floating-point constant).  Numerically: KP951056516 = sin(2*pi/5),
 * KP559016994 = sqrt(5)/4, KP250000000 = 1/4,
 * KP618033988 = (sqrt(5) - 1)/2. */
Chris@42 39 DK(KP951056516, +0.951056516295153572116439333379382143405698634);
Chris@42 40 DK(KP559016994, +0.559016994374947424102293417182819058860154590);
Chris@42 41 DK(KP250000000, +0.250000000000000000000000000000000000000000000);
Chris@42 42 DK(KP618033988, +0.618033988749894848204586834365638117720309180);
Chris@42 43 {
Chris@42 44 INT m;
/* One pass per m in [mb, me): cr += ms, ci -= ms, W += 38 each time. */
Chris@42 45 for (m = mb, W = W + ((mb - 1) * 38); m < me; m = m + 1, cr = cr + ms, ci = ci - ms, W = W + 38, MAKE_VOLATILE_STRIDE(40, rs)) {
/* Declared at loop scope because they are consumed by the final stores
 * after the nested blocks below have closed. */
Chris@42 46 E T54, T5a, T5c, T56, T53, T55, T5b, T57;
Chris@42 47 {
Chris@42 48 E T4N, T4q, T8, T2i, T4r, T2n, T4O, Tl, T2v, T3v, T43, T4b, TN, T2b, T3F;
Chris@42 49 E T3a, T2R, T3z, T3T, T4f, T27, T2f, T3J, T3i, T2K, T3y, T3W, T4e, T1G, T2e;
Chris@42 50 E T3I, T3p, T2C, T3w, T40, T4c, T1e, T2c, T3G, T33;
/* Load phase, part 1: elements 0, 10, 5, 15 and then 4, 19, 14, 9. */
Chris@42 51 {
Chris@42 52 E T1, T4p, T3, T6, T2, T5;
Chris@42 53 T1 = cr[0];
Chris@42 54 T4p = ci[0];
Chris@42 55 T3 = cr[WS(rs, 10)];
Chris@42 56 T6 = ci[WS(rs, 10)];
Chris@42 57 T2 = W[18];
Chris@42 58 T5 = W[19];
Chris@42 59 {
Chris@42 60 E Ta, Td, Tg, T2j, Tb, Tj, Tf, Tc, Ti;
Chris@42 61 {
Chris@42 62 E T4n, T4, T9, T4o, T7;
Chris@42 63 Ta = cr[WS(rs, 5)];
Chris@42 64 Td = ci[WS(rs, 5)];
Chris@42 65 T4n = T2 * T6;
Chris@42 66 T4 = T2 * T3;
Chris@42 67 T9 = W[8];
Chris@42 68 Tg = cr[WS(rs, 15)];
Chris@42 69 T4o = FNMS(T5, T3, T4n);
Chris@42 70 T7 = FMA(T5, T6, T4);
Chris@42 71 T2j = T9 * Td;
Chris@42 72 Tb = T9 * Ta;
Chris@42 73 T4N = T4p - T4o;
Chris@42 74 T4q = T4o + T4p;
Chris@42 75 T8 = T1 + T7;
Chris@42 76 T2i = T1 - T7;
Chris@42 77 Tj = ci[WS(rs, 15)];
Chris@42 78 Tf = W[28];
Chris@42 79 }
Chris@42 80 Tc = W[9];
Chris@42 81 Ti = W[29];
Chris@42 82 {
Chris@42 83 E T36, Ts, T2t, TL, TB, TE, TD, T38, Ty, T2q, TC;
Chris@42 84 {
Chris@42 85 E TH, TK, TJ, T2s, TI;
Chris@42 86 {
Chris@42 87 E To, Tr, Tp, T35, Tq, TG;
Chris@42 88 {
Chris@42 89 E T2k, Te, T2m, Tk, T2l, Th, Tn;
Chris@42 90 To = cr[WS(rs, 4)];
Chris@42 91 T2l = Tf * Tj;
Chris@42 92 Th = Tf * Tg;
Chris@42 93 T2k = FNMS(Tc, Ta, T2j);
Chris@42 94 Te = FMA(Tc, Td, Tb);
Chris@42 95 T2m = FNMS(Ti, Tg, T2l);
Chris@42 96 Tk = FMA(Ti, Tj, Th);
Chris@42 97 Tr = ci[WS(rs, 4)];
Chris@42 98 Tn = W[6];
Chris@42 99 T4r = T2k + T2m;
Chris@42 100 T2n = T2k - T2m;
Chris@42 101 T4O = Te - Tk;
Chris@42 102 Tl = Te + Tk;
Chris@42 103 Tp = Tn * To;
Chris@42 104 T35 = Tn * Tr;
Chris@42 105 }
Chris@42 106 Tq = W[7];
Chris@42 107 TH = cr[WS(rs, 19)];
Chris@42 108 TK = ci[WS(rs, 19)];
Chris@42 109 TG = W[36];
Chris@42 110 T36 = FNMS(Tq, To, T35);
Chris@42 111 Ts = FMA(Tq, Tr, Tp);
Chris@42 112 TJ = W[37];
Chris@42 113 T2s = TG * TK;
Chris@42 114 TI = TG * TH;
Chris@42 115 }
Chris@42 116 {
Chris@42 117 E Tu, Tx, Tt, Tw, T37, Tv, TA;
Chris@42 118 Tu = cr[WS(rs, 14)];
Chris@42 119 Tx = ci[WS(rs, 14)];
Chris@42 120 T2t = FNMS(TJ, TH, T2s);
Chris@42 121 TL = FMA(TJ, TK, TI);
Chris@42 122 Tt = W[26];
Chris@42 123 Tw = W[27];
Chris@42 124 TB = cr[WS(rs, 9)];
Chris@42 125 TE = ci[WS(rs, 9)];
Chris@42 126 T37 = Tt * Tx;
Chris@42 127 Tv = Tt * Tu;
Chris@42 128 TA = W[16];
Chris@42 129 TD = W[17];
Chris@42 130 T38 = FNMS(Tw, Tu, T37);
Chris@42 131 Ty = FMA(Tw, Tx, Tv);
Chris@42 132 T2q = TA * TE;
Chris@42 133 TC = TA * TB;
Chris@42 134 }
Chris@42 135 }
Chris@42 136 {
Chris@42 137 E T39, T42, Tz, T2p, T2r, TF;
Chris@42 138 T39 = T36 - T38;
Chris@42 139 T42 = T36 + T38;
Chris@42 140 Tz = Ts + Ty;
Chris@42 141 T2p = Ts - Ty;
Chris@42 142 T2r = FNMS(TD, TB, T2q);
Chris@42 143 TF = FMA(TD, TE, TC);
Chris@42 144 {
Chris@42 145 E T41, T2u, TM, T34;
Chris@42 146 T41 = T2r + T2t;
Chris@42 147 T2u = T2r - T2t;
Chris@42 148 TM = TF + TL;
Chris@42 149 T34 = TL - TF;
Chris@42 150 T2v = T2p - T2u;
Chris@42 151 T3v = T2p + T2u;
Chris@42 152 T43 = T41 - T42;
Chris@42 153 T4b = T42 + T41;
Chris@42 154 TN = Tz - TM;
Chris@42 155 T2b = Tz + TM;
Chris@42 156 T3F = T39 + T34;
Chris@42 157 T3a = T34 - T39;
Chris@42 158 }
Chris@42 159 }
Chris@42 160 }
Chris@42 161 }
Chris@42 162 }
/* Load phase, part 2: elements 12, 7, 2, 17. */
Chris@42 163 {
Chris@42 164 E T3e, T1M, T2P, T25, T1V, T1Y, T1X, T3g, T1S, T2M, T1W;
Chris@42 165 {
Chris@42 166 E T21, T24, T23, T2O, T22;
Chris@42 167 {
Chris@42 168 E T1I, T1L, T1H, T1K, T3d, T1J, T20;
Chris@42 169 T1I = cr[WS(rs, 12)];
Chris@42 170 T1L = ci[WS(rs, 12)];
Chris@42 171 T1H = W[22];
Chris@42 172 T1K = W[23];
Chris@42 173 T21 = cr[WS(rs, 7)];
Chris@42 174 T24 = ci[WS(rs, 7)];
Chris@42 175 T3d = T1H * T1L;
Chris@42 176 T1J = T1H * T1I;
Chris@42 177 T20 = W[12];
Chris@42 178 T23 = W[13];
Chris@42 179 T3e = FNMS(T1K, T1I, T3d);
Chris@42 180 T1M = FMA(T1K, T1L, T1J);
Chris@42 181 T2O = T20 * T24;
Chris@42 182 T22 = T20 * T21;
Chris@42 183 }
Chris@42 184 {
Chris@42 185 E T1O, T1R, T1N, T1Q, T3f, T1P, T1U;
Chris@42 186 T1O = cr[WS(rs, 2)];
Chris@42 187 T1R = ci[WS(rs, 2)];
Chris@42 188 T2P = FNMS(T23, T21, T2O);
Chris@42 189 T25 = FMA(T23, T24, T22);
Chris@42 190 T1N = W[2];
Chris@42 191 T1Q = W[3];
Chris@42 192 T1V = cr[WS(rs, 17)];
Chris@42 193 T1Y = ci[WS(rs, 17)];
Chris@42 194 T3f = T1N * T1R;
Chris@42 195 T1P = T1N * T1O;
Chris@42 196 T1U = W[32];
Chris@42 197 T1X = W[33];
Chris@42 198 T3g = FNMS(T1Q, T1O, T3f);
Chris@42 199 T1S = FMA(T1Q, T1R, T1P);
Chris@42 200 T2M = T1U * T1Y;
Chris@42 201 T1W = T1U * T1V;
Chris@42 202 }
Chris@42 203 }
Chris@42 204 {
Chris@42 205 E T3h, T3S, T1T, T2L, T2N, T1Z;
Chris@42 206 T3h = T3e - T3g;
Chris@42 207 T3S = T3e + T3g;
Chris@42 208 T1T = T1M + T1S;
Chris@42 209 T2L = T1M - T1S;
Chris@42 210 T2N = FNMS(T1X, T1V, T2M);
Chris@42 211 T1Z = FMA(T1X, T1Y, T1W);
Chris@42 212 {
Chris@42 213 E T3R, T2Q, T26, T3c;
Chris@42 214 T3R = T2N + T2P;
Chris@42 215 T2Q = T2N - T2P;
Chris@42 216 T26 = T1Z + T25;
Chris@42 217 T3c = T25 - T1Z;
Chris@42 218 T2R = T2L - T2Q;
Chris@42 219 T3z = T2L + T2Q;
Chris@42 220 T3T = T3R - T3S;
Chris@42 221 T4f = T3S + T3R;
Chris@42 222 T27 = T1T - T26;
Chris@42 223 T2f = T1T + T26;
Chris@42 224 T3J = T3h + T3c;
Chris@42 225 T3i = T3c - T3h;
Chris@42 226 }
Chris@42 227 }
Chris@42 228 }
/* Load phase, part 3: elements 8, 3, 18, 13. */
Chris@42 229 {
Chris@42 230 E T3l, T1l, T2I, T1E, T1u, T1x, T1w, T3n, T1r, T2F, T1v;
Chris@42 231 {
Chris@42 232 E T1A, T1D, T1C, T2H, T1B;
Chris@42 233 {
Chris@42 234 E T1h, T1k, T1g, T1j, T3k, T1i, T1z;
Chris@42 235 T1h = cr[WS(rs, 8)];
Chris@42 236 T1k = ci[WS(rs, 8)];
Chris@42 237 T1g = W[14];
Chris@42 238 T1j = W[15];
Chris@42 239 T1A = cr[WS(rs, 3)];
Chris@42 240 T1D = ci[WS(rs, 3)];
Chris@42 241 T3k = T1g * T1k;
Chris@42 242 T1i = T1g * T1h;
Chris@42 243 T1z = W[4];
Chris@42 244 T1C = W[5];
Chris@42 245 T3l = FNMS(T1j, T1h, T3k);
Chris@42 246 T1l = FMA(T1j, T1k, T1i);
Chris@42 247 T2H = T1z * T1D;
Chris@42 248 T1B = T1z * T1A;
Chris@42 249 }
Chris@42 250 {
Chris@42 251 E T1n, T1q, T1m, T1p, T3m, T1o, T1t;
Chris@42 252 T1n = cr[WS(rs, 18)];
Chris@42 253 T1q = ci[WS(rs, 18)];
Chris@42 254 T2I = FNMS(T1C, T1A, T2H);
Chris@42 255 T1E = FMA(T1C, T1D, T1B);
Chris@42 256 T1m = W[34];
Chris@42 257 T1p = W[35];
Chris@42 258 T1u = cr[WS(rs, 13)];
Chris@42 259 T1x = ci[WS(rs, 13)];
Chris@42 260 T3m = T1m * T1q;
Chris@42 261 T1o = T1m * T1n;
Chris@42 262 T1t = W[24];
Chris@42 263 T1w = W[25];
Chris@42 264 T3n = FNMS(T1p, T1n, T3m);
Chris@42 265 T1r = FMA(T1p, T1q, T1o);
Chris@42 266 T2F = T1t * T1x;
Chris@42 267 T1v = T1t * T1u;
Chris@42 268 }
Chris@42 269 }
Chris@42 270 {
Chris@42 271 E T3o, T3V, T1s, T2E, T2G, T1y;
Chris@42 272 T3o = T3l - T3n;
Chris@42 273 T3V = T3l + T3n;
Chris@42 274 T1s = T1l + T1r;
Chris@42 275 T2E = T1l - T1r;
Chris@42 276 T2G = FNMS(T1w, T1u, T2F);
Chris@42 277 T1y = FMA(T1w, T1x, T1v);
Chris@42 278 {
Chris@42 279 E T3U, T2J, T1F, T3j;
Chris@42 280 T3U = T2G + T2I;
Chris@42 281 T2J = T2G - T2I;
Chris@42 282 T1F = T1y + T1E;
Chris@42 283 T3j = T1E - T1y;
Chris@42 284 T2K = T2E - T2J;
Chris@42 285 T3y = T2E + T2J;
Chris@42 286 T3W = T3U - T3V;
Chris@42 287 T4e = T3V + T3U;
Chris@42 288 T1G = T1s - T1F;
Chris@42 289 T2e = T1s + T1F;
Chris@42 290 T3I = T3o + T3j;
Chris@42 291 T3p = T3j - T3o;
Chris@42 292 }
Chris@42 293 }
Chris@42 294 }
/* Load phase, part 4: elements 16, 11, 6, 1. */
Chris@42 295 {
Chris@42 296 E T2Z, TT, T2A, T1c, T12, T15, T14, T31, TZ, T2x, T13;
Chris@42 297 {
Chris@42 298 E T18, T1b, T1a, T2z, T19;
Chris@42 299 {
Chris@42 300 E TP, TS, TO, TR, T2Y, TQ, T17;
Chris@42 301 TP = cr[WS(rs, 16)];
Chris@42 302 TS = ci[WS(rs, 16)];
Chris@42 303 TO = W[30];
Chris@42 304 TR = W[31];
Chris@42 305 T18 = cr[WS(rs, 11)];
Chris@42 306 T1b = ci[WS(rs, 11)];
Chris@42 307 T2Y = TO * TS;
Chris@42 308 TQ = TO * TP;
Chris@42 309 T17 = W[20];
Chris@42 310 T1a = W[21];
Chris@42 311 T2Z = FNMS(TR, TP, T2Y);
Chris@42 312 TT = FMA(TR, TS, TQ);
Chris@42 313 T2z = T17 * T1b;
Chris@42 314 T19 = T17 * T18;
Chris@42 315 }
Chris@42 316 {
Chris@42 317 E TV, TY, TU, TX, T30, TW, T11;
Chris@42 318 TV = cr[WS(rs, 6)];
Chris@42 319 TY = ci[WS(rs, 6)];
Chris@42 320 T2A = FNMS(T1a, T18, T2z);
Chris@42 321 T1c = FMA(T1a, T1b, T19);
Chris@42 322 TU = W[10];
Chris@42 323 TX = W[11];
Chris@42 324 T12 = cr[WS(rs, 1)];
Chris@42 325 T15 = ci[WS(rs, 1)];
Chris@42 326 T30 = TU * TY;
Chris@42 327 TW = TU * TV;
Chris@42 328 T11 = W[0];
Chris@42 329 T14 = W[1];
Chris@42 330 T31 = FNMS(TX, TV, T30);
Chris@42 331 TZ = FMA(TX, TY, TW);
Chris@42 332 T2x = T11 * T15;
Chris@42 333 T13 = T11 * T12;
Chris@42 334 }
Chris@42 335 }
Chris@42 336 {
Chris@42 337 E T32, T3Z, T10, T2w, T2y, T16;
Chris@42 338 T32 = T2Z - T31;
Chris@42 339 T3Z = T2Z + T31;
Chris@42 340 T10 = TT + TZ;
Chris@42 341 T2w = TT - TZ;
Chris@42 342 T2y = FNMS(T14, T12, T2x);
Chris@42 343 T16 = FMA(T14, T15, T13);
Chris@42 344 {
Chris@42 345 E T3Y, T2B, T1d, T2X;
Chris@42 346 T3Y = T2y + T2A;
Chris@42 347 T2B = T2y - T2A;
Chris@42 348 T1d = T16 + T1c;
Chris@42 349 T2X = T1c - T16;
Chris@42 350 T2C = T2w - T2B;
Chris@42 351 T3w = T2w + T2B;
Chris@42 352 T40 = T3Y - T3Z;
Chris@42 353 T4c = T3Z + T3Y;
Chris@42 354 T1e = T10 - T1d;
Chris@42 355 T2c = T10 + T1d;
Chris@42 356 T3G = T32 + T2X;
Chris@42 357 T33 = T2X - T32;
Chris@42 358 }
Chris@42 359 }
Chris@42 360 }
/* Combine-and-store phase: no further cr/ci/W loads happen from here on;
 * the partial sums built above are mixed with the DK constants and the
 * results are written back to cr/ci. */
Chris@42 361 {
Chris@42 362 E T4l, T4k, T4w, T4x, T4Q, T4R, T2o, T4X, T4W, T4C, T4D, T4J, T4h, T4j, T4I;
Chris@42 363 E T51, T52, T49, T3r, T3t, T58, T2D, T48, T2S, T59;
Chris@42 364 {
Chris@42 365 E T2a, T47, T45, T3u, T3x, T3N, T3L, T3A, T46, T3Q;
Chris@42 366 {
Chris@42 367 E Tm, T1f, T28, T3X, T44;
Chris@42 368 T4l = T3W + T3T;
Chris@42 369 T3X = T3T - T3W;
Chris@42 370 T44 = T40 - T43;
Chris@42 371 T4k = T43 + T40;
Chris@42 372 T2a = T8 + Tl;
Chris@42 373 Tm = T8 - Tl;
Chris@42 374 T1f = TN + T1e;
Chris@42 375 T4w = T1e - TN;
Chris@42 376 T4x = T1G - T27;
Chris@42 377 T28 = T1G + T27;
Chris@42 378 T47 = FMA(KP618033988, T3X, T44);
Chris@42 379 T45 = FNMS(KP618033988, T44, T3X);
Chris@42 380 {
Chris@42 381 E T3H, T29, T3P, T3K, T3O;
Chris@42 382 T3H = T3F - T3G;
Chris@42 383 T4Q = T3F + T3G;
Chris@42 384 T29 = T1f + T28;
Chris@42 385 T3P = T1f - T28;
Chris@42 386 T4R = T3I + T3J;
Chris@42 387 T3K = T3I - T3J;
Chris@42 388 T3u = T2i + T2n;
Chris@42 389 T2o = T2i - T2n;
Chris@42 390 T4X = T3v - T3w;
Chris@42 391 T3x = T3v + T3w;
Chris@42 392 ci[WS(rs, 9)] = Tm + T29;
Chris@42 393 T3O = FNMS(KP250000000, T29, Tm);
Chris@42 394 T3N = FNMS(KP618033988, T3H, T3K);
Chris@42 395 T3L = FMA(KP618033988, T3K, T3H);
Chris@42 396 T3A = T3y + T3z;
Chris@42 397 T4W = T3y - T3z;
Chris@42 398 T46 = FMA(KP559016994, T3P, T3O);
Chris@42 399 T3Q = FNMS(KP559016994, T3P, T3O);
Chris@42 400 }
Chris@42 401 }
Chris@42 402 {
Chris@42 403 E T2d, T2g, T3b, T3q, T2h;
Chris@42 404 {
Chris@42 405 E T4d, T3D, T3C, T4g, T3B, T3M, T3E;
Chris@42 406 T4C = T4b + T4c;
Chris@42 407 T4d = T4b - T4c;
Chris@42 408 T3D = T3x - T3A;
Chris@42 409 T3B = T3x + T3A;
Chris@42 410 ci[WS(rs, 1)] = FMA(KP951056516, T45, T3Q);
Chris@42 411 cr[WS(rs, 2)] = FNMS(KP951056516, T45, T3Q);
Chris@42 412 cr[WS(rs, 6)] = FMA(KP951056516, T47, T46);
Chris@42 413 ci[WS(rs, 5)] = FNMS(KP951056516, T47, T46);
Chris@42 414 cr[WS(rs, 5)] = T3u + T3B;
Chris@42 415 T3C = FNMS(KP250000000, T3B, T3u);
Chris@42 416 T4g = T4e - T4f;
Chris@42 417 T4D = T4e + T4f;
Chris@42 418 T2d = T2b + T2c;
Chris@42 419 T4J = T2b - T2c;
Chris@42 420 T3M = FNMS(KP559016994, T3D, T3C);
Chris@42 421 T3E = FMA(KP559016994, T3D, T3C);
Chris@42 422 T4h = FMA(KP618033988, T4g, T4d);
Chris@42 423 T4j = FNMS(KP618033988, T4d, T4g);
Chris@42 424 cr[WS(rs, 9)] = FNMS(KP951056516, T3L, T3E);
Chris@42 425 cr[WS(rs, 1)] = FMA(KP951056516, T3L, T3E);
Chris@42 426 ci[WS(rs, 6)] = FMA(KP951056516, T3N, T3M);
Chris@42 427 ci[WS(rs, 2)] = FNMS(KP951056516, T3N, T3M);
Chris@42 428 T4I = T2f - T2e;
Chris@42 429 T2g = T2e + T2f;
Chris@42 430 }
Chris@42 431 T3b = T33 - T3a;
Chris@42 432 T51 = T3a + T33;
Chris@42 433 T52 = T3p + T3i;
Chris@42 434 T3q = T3i - T3p;
Chris@42 435 T2h = T2d + T2g;
Chris@42 436 T49 = T2d - T2g;
Chris@42 437 T3r = FMA(KP618033988, T3q, T3b);
Chris@42 438 T3t = FNMS(KP618033988, T3b, T3q);
Chris@42 439 T58 = T2v - T2C;
Chris@42 440 T2D = T2v + T2C;
Chris@42 441 cr[0] = T2a + T2h;
Chris@42 442 T48 = FNMS(KP250000000, T2h, T2a);
Chris@42 443 T2S = T2K + T2R;
Chris@42 444 T59 = T2K - T2R;
Chris@42 445 }
Chris@42 446 }
Chris@42 447 {
Chris@42 448 E T4B, T4P, T4Y, T50, T4U, T4S;
Chris@42 449 {
Chris@42 450 E T4A, T4y, T4s, T4m, T4u, T4t, T4z, T4v;
Chris@42 451 {
Chris@42 452 E T2V, T2U, T4i, T4a, T2T, T2W, T3s;
Chris@42 453 T4i = FNMS(KP559016994, T49, T48);
Chris@42 454 T4a = FMA(KP559016994, T49, T48);
Chris@42 455 T2T = T2D + T2S;
Chris@42 456 T2V = T2D - T2S;
Chris@42 457 ci[WS(rs, 3)] = FMA(KP951056516, T4h, T4a);
Chris@42 458 cr[WS(rs, 4)] = FNMS(KP951056516, T4h, T4a);
Chris@42 459 cr[WS(rs, 8)] = FMA(KP951056516, T4j, T4i);
Chris@42 460 ci[WS(rs, 7)] = FNMS(KP951056516, T4j, T4i);
Chris@42 461 ci[WS(rs, 4)] = T2o + T2T;
Chris@42 462 T2U = FNMS(KP250000000, T2T, T2o);
Chris@42 463 T4A = FMA(KP618033988, T4w, T4x);
Chris@42 464 T4y = FNMS(KP618033988, T4x, T4w);
Chris@42 465 T4B = T4r + T4q;
Chris@42 466 T4s = T4q - T4r;
Chris@42 467 T2W = FMA(KP559016994, T2V, T2U);
Chris@42 468 T3s = FNMS(KP559016994, T2V, T2U);
Chris@42 469 ci[WS(rs, 8)] = FMA(KP951056516, T3r, T2W);
Chris@42 470 ci[0] = FNMS(KP951056516, T3r, T2W);
Chris@42 471 cr[WS(rs, 7)] = FNMS(KP951056516, T3t, T3s);
Chris@42 472 cr[WS(rs, 3)] = FMA(KP951056516, T3t, T3s);
Chris@42 473 T4m = T4k + T4l;
Chris@42 474 T4u = T4l - T4k;
Chris@42 475 }
Chris@42 476 cr[WS(rs, 10)] = T4m - T4s;
Chris@42 477 T4t = FMA(KP250000000, T4m, T4s);
Chris@42 478 T4P = T4N - T4O;
Chris@42 479 T54 = T4O + T4N;
Chris@42 480 T4Y = FNMS(KP618033988, T4X, T4W);
Chris@42 481 T50 = FMA(KP618033988, T4W, T4X);
Chris@42 482 T4z = FNMS(KP559016994, T4u, T4t);
Chris@42 483 T4v = FMA(KP559016994, T4u, T4t);
Chris@42 484 ci[WS(rs, 13)] = FMA(KP951056516, T4y, T4v);
Chris@42 485 cr[WS(rs, 14)] = FMS(KP951056516, T4y, T4v);
Chris@42 486 ci[WS(rs, 17)] = FMA(KP951056516, T4A, T4z);
Chris@42 487 cr[WS(rs, 18)] = FMS(KP951056516, T4A, T4z);
Chris@42 488 T4U = T4Q - T4R;
Chris@42 489 T4S = T4Q + T4R;
Chris@42 490 }
Chris@42 491 {
Chris@42 492 E T4M, T4K, T4E, T4G, T4T, T4V, T4Z, T4F, T4L, T4H;
Chris@42 493 ci[WS(rs, 14)] = T4S + T4P;
Chris@42 494 T4T = FNMS(KP250000000, T4S, T4P);
Chris@42 495 T4M = FNMS(KP618033988, T4I, T4J);
Chris@42 496 T4K = FMA(KP618033988, T4J, T4I);
Chris@42 497 T4V = FNMS(KP559016994, T4U, T4T);
Chris@42 498 T4Z = FMA(KP559016994, T4U, T4T);
Chris@42 499 cr[WS(rs, 17)] = -(FMA(KP951056516, T4Y, T4V));
Chris@42 500 cr[WS(rs, 13)] = FMS(KP951056516, T4Y, T4V);
Chris@42 501 ci[WS(rs, 18)] = FNMS(KP951056516, T50, T4Z);
Chris@42 502 ci[WS(rs, 10)] = FMA(KP951056516, T50, T4Z);
Chris@42 503 T4E = T4C + T4D;
Chris@42 504 T4G = T4C - T4D;
Chris@42 505 ci[WS(rs, 19)] = T4E + T4B;
Chris@42 506 T4F = FNMS(KP250000000, T4E, T4B);
Chris@42 507 T5a = FMA(KP618033988, T59, T58);
Chris@42 508 T5c = FNMS(KP618033988, T58, T59);
Chris@42 509 T4L = FMA(KP559016994, T4G, T4F);
Chris@42 510 T4H = FNMS(KP559016994, T4G, T4F);
Chris@42 511 ci[WS(rs, 11)] = FMA(KP951056516, T4K, T4H);
Chris@42 512 cr[WS(rs, 12)] = FMS(KP951056516, T4K, T4H);
Chris@42 513 ci[WS(rs, 15)] = FMA(KP951056516, T4M, T4L);
Chris@42 514 cr[WS(rs, 16)] = FMS(KP951056516, T4M, T4L);
Chris@42 515 T56 = T51 - T52;
Chris@42 516 T53 = T51 + T52;
Chris@42 517 }
Chris@42 518 }
Chris@42 519 }
Chris@42 520 }
/* Final five stores (cr 15/19/11, ci 16/12), built from the loop-scope
 * temporaries T53, T54, T56, T5a and T5c set inside the blocks above. */
Chris@42 521 cr[WS(rs, 15)] = T53 - T54;
Chris@42 522 T55 = FMA(KP250000000, T53, T54);
Chris@42 523 T5b = FMA(KP559016994, T56, T55);
Chris@42 524 T57 = FNMS(KP559016994, T56, T55);
Chris@42 525 cr[WS(rs, 19)] = -(FMA(KP951056516, T5a, T57));
Chris@42 526 cr[WS(rs, 11)] = FMS(KP951056516, T5a, T57);
Chris@42 527 ci[WS(rs, 16)] = FNMS(KP951056516, T5c, T5b);
Chris@42 528 ci[WS(rs, 12)] = FMA(KP951056516, T5c, T5b);
Chris@42 529 }
Chris@42 530 }
Chris@42 531 }
Chris@42 532
/*
 * Twiddle instruction table for the FMA build of hf_20; it is passed to the
 * hc2hc descriptor (desc) that follows.  Two entries: a TW_FULL record
 * carrying the value 20 and a TW_NEXT record.
 * NOTE(review): TW_FULL/TW_NEXT are defined outside this file; presumably
 * TW_FULL requests the complete twiddle set for size 20 (hf_20 consumes
 * 38 = 2 * 19 W values per iteration) and TW_NEXT ends the list -- confirm
 * against the FFTW twiddle headers.
 */
Chris@42 533 static const tw_instr twinstr[] = {
Chris@42 534 {TW_FULL, 1, 20},
Chris@42 535 {TW_NEXT, 1, 0}
Chris@42 536 };
Chris@42 537
/*
 * Codelet descriptor for the FMA build: size 20, name "hf_20", the twinstr
 * table, the GENUS object (not defined in this file; presumably supplied by
 * hf.h), and an operation-count record.  The first three counts
 * {136, 38, 110} match the header comment of this variant (136 additions,
 * 38 multiplications, 110 fused multiply/add); the meaning of the trailing 0
 * is not visible here.
 */
Chris@42 538 static const hc2hc_desc desc = { 20, "hf_20", twinstr, &GENUS, {136, 38, 110, 0} };
Chris@42 539
/*
 * Registration entry point (FMA build): hands the hf_20 function and its
 * descriptor to the planner p through khc2hc_register.  X() is a macro
 * defined elsewhere; presumably it applies FFTW's precision-specific name
 * prefix -- confirm.
 */
Chris@42 540 void X(codelet_hf_20) (planner *p) {
Chris@42 541 X(khc2hc_register) (p, hf_20, &desc);
Chris@42 542 }
Chris@42 543 #else /* HAVE_FMA */
Chris@42 544
Chris@42 545 /* Generated by: ../../../genfft/gen_hc2hc.native -compact -variables 4 -pipeline-latency 4 -n 20 -dit -name hf_20 -include hf.h */
Chris@42 546
Chris@42 547 /*
Chris@42 548 * This function contains 246 FP additions, 124 FP multiplications,
Chris@42 549 * (or, 184 additions, 62 multiplications, 62 fused multiply/add),
Chris@42 550 * 85 stack variables, 4 constants, and 80 memory accesses
Chris@42 551 */
Chris@42 552 #include "hf.h"
Chris@42 553
/*
 * hf_20 (variant compiled when HAVE_FMA is not defined): size-20 twiddle
 * codelet emitted by genfft's gen_hc2hc with "-n 20 -dit" (see the
 * "Generated by" comment above).  Same signature and same load/store
 * footprint as the HAVE_FMA variant in the other preprocessor branch.
 *
 * Parameters:
 *   cr, ci  - data arrays, read and then overwritten in place at offsets 0
 *             and WS(rs, 1) .. WS(rs, 19); after every iteration cr moves
 *             by +ms and ci by -ms.
 *   W       - twiddle table; W[0] .. W[37] are read per iteration, the
 *             pointer is pre-offset by (mb - 1) * 38 and then advanced by 38
 *             per iteration.
 *   rs      - stride passed to WS() to locate the 20 elements.
 *   mb, me  - the loop runs for m in [mb, me).
 *   ms      - per-iteration pointer step applied to cr and ci.
 *
 * Per iteration, element 0 is used as loaded, while each element k = 1..19
 * is first combined with the twiddle pair (W[2k-2], W[2k-1]).  Assuming the
 * usual FFTW macro meanings FMA(a,b,c) = a*b + c and FNMS(a,b,c) = c - a*b
 * (defined outside this file -- TODO confirm), that combination is
 *     W[2k-2]*cr[k] + W[2k-1]*ci[k]   and   W[2k-2]*ci[k] - W[2k-1]*cr[k].
 * Eight output blocks of five stores each then write all 20 cr slots and
 * all 20 ci slots.
 */
Chris@42 554 static void hf_20(R *cr, R *ci, const R *W, stride rs, INT mb, INT me, INT ms)
Chris@42 555 {
/* Constants (DK is defined elsewhere; presumably it declares a named
 * floating-point constant).  Numerically: KP587785252 = sin(pi/5),
 * KP951056516 = sin(2*pi/5), KP250000000 = 1/4,
 * KP559016994 = sqrt(5)/4. */
Chris@42 556 DK(KP587785252, +0.587785252292473129168705954639072768597652438);
Chris@42 557 DK(KP951056516, +0.951056516295153572116439333379382143405698634);
Chris@42 558 DK(KP250000000, +0.250000000000000000000000000000000000000000000);
Chris@42 559 DK(KP559016994, +0.559016994374947424102293417182819058860154590);
Chris@42 560 {
Chris@42 561 INT m;
/* One pass per m in [mb, me): cr += ms, ci -= ms, W += 38 each time. */
Chris@42 562 for (m = mb, W = W + ((mb - 1) * 38); m < me; m = m + 1, cr = cr + ms, ci = ci - ms, W = W + 38, MAKE_VOLATILE_STRIDE(40, rs)) {
/* Intermediate sums that survive from the three load blocks into the eight
 * output blocks. */
Chris@42 563 E Tj, T1R, T4j, T4s, T2q, T37, T3Q, T42, T1r, T1O, T1P, T3i, T3l, T3J, T3D;
Chris@42 564 E T3E, T44, T1V, T1W, T1X, T2e, T2j, T2k, T2W, T2X, T4f, T33, T34, T35, T2J;
Chris@42 565 E T2O, T4q, TG, T13, T14, T3p, T3s, T3K, T3A, T3B, T43, T1S, T1T, T1U, T23;
Chris@42 566 E T28, T29, T2T, T2U, T4e, T30, T31, T32, T2y, T2D, T4p;
/* Load block 1: elements 0, 10, 5, 15. */
Chris@42 567 {
Chris@42 568 E T1, T3N, T6, T3M, Tc, T2n, Th, T2o;
Chris@42 569 T1 = cr[0];
Chris@42 570 T3N = ci[0];
Chris@42 571 {
Chris@42 572 E T3, T5, T2, T4;
Chris@42 573 T3 = cr[WS(rs, 10)];
Chris@42 574 T5 = ci[WS(rs, 10)];
Chris@42 575 T2 = W[18];
Chris@42 576 T4 = W[19];
Chris@42 577 T6 = FMA(T2, T3, T4 * T5);
Chris@42 578 T3M = FNMS(T4, T3, T2 * T5);
Chris@42 579 }
Chris@42 580 {
Chris@42 581 E T9, Tb, T8, Ta;
Chris@42 582 T9 = cr[WS(rs, 5)];
Chris@42 583 Tb = ci[WS(rs, 5)];
Chris@42 584 T8 = W[8];
Chris@42 585 Ta = W[9];
Chris@42 586 Tc = FMA(T8, T9, Ta * Tb);
Chris@42 587 T2n = FNMS(Ta, T9, T8 * Tb);
Chris@42 588 }
Chris@42 589 {
Chris@42 590 E Te, Tg, Td, Tf;
Chris@42 591 Te = cr[WS(rs, 15)];
Chris@42 592 Tg = ci[WS(rs, 15)];
Chris@42 593 Td = W[28];
Chris@42 594 Tf = W[29];
Chris@42 595 Th = FMA(Td, Te, Tf * Tg);
Chris@42 596 T2o = FNMS(Tf, Te, Td * Tg);
Chris@42 597 }
Chris@42 598 {
Chris@42 599 E T7, Ti, T4h, T4i;
Chris@42 600 T7 = T1 + T6;
Chris@42 601 Ti = Tc + Th;
Chris@42 602 Tj = T7 - Ti;
Chris@42 603 T1R = T7 + Ti;
Chris@42 604 T4h = T3N - T3M;
Chris@42 605 T4i = Tc - Th;
Chris@42 606 T4j = T4h - T4i;
Chris@42 607 T4s = T4i + T4h;
Chris@42 608 }
Chris@42 609 {
Chris@42 610 E T2m, T2p, T3O, T3P;
Chris@42 611 T2m = T1 - T6;
Chris@42 612 T2p = T2n - T2o;
Chris@42 613 T2q = T2m - T2p;
Chris@42 614 T37 = T2m + T2p;
Chris@42 615 T3O = T3M + T3N;
Chris@42 616 T3P = T2n + T2o;
Chris@42 617 T3Q = T3O - T3P;
Chris@42 618 T42 = T3P + T3O;
Chris@42 619 }
Chris@42 620 }
/* Load block 2: elements 8, 18, 17, 7, 13, 3, 12, 2. */
Chris@42 621 {
Chris@42 622 E T1f, T3g, T2a, T2H, T1N, T3j, T2i, T2N, T1q, T3h, T2d, T2I, T1C, T3k, T2f;
Chris@42 623 E T2M;
Chris@42 624 {
Chris@42 625 E T19, T2F, T1e, T2G;
Chris@42 626 {
Chris@42 627 E T16, T18, T15, T17;
Chris@42 628 T16 = cr[WS(rs, 8)];
Chris@42 629 T18 = ci[WS(rs, 8)];
Chris@42 630 T15 = W[14];
Chris@42 631 T17 = W[15];
Chris@42 632 T19 = FMA(T15, T16, T17 * T18);
Chris@42 633 T2F = FNMS(T17, T16, T15 * T18);
Chris@42 634 }
Chris@42 635 {
Chris@42 636 E T1b, T1d, T1a, T1c;
Chris@42 637 T1b = cr[WS(rs, 18)];
Chris@42 638 T1d = ci[WS(rs, 18)];
Chris@42 639 T1a = W[34];
Chris@42 640 T1c = W[35];
Chris@42 641 T1e = FMA(T1a, T1b, T1c * T1d);
Chris@42 642 T2G = FNMS(T1c, T1b, T1a * T1d);
Chris@42 643 }
Chris@42 644 T1f = T19 + T1e;
Chris@42 645 T3g = T2F + T2G;
Chris@42 646 T2a = T19 - T1e;
Chris@42 647 T2H = T2F - T2G;
Chris@42 648 }
Chris@42 649 {
Chris@42 650 E T1H, T2g, T1M, T2h;
Chris@42 651 {
Chris@42 652 E T1E, T1G, T1D, T1F;
Chris@42 653 T1E = cr[WS(rs, 17)];
Chris@42 654 T1G = ci[WS(rs, 17)];
Chris@42 655 T1D = W[32];
Chris@42 656 T1F = W[33];
Chris@42 657 T1H = FMA(T1D, T1E, T1F * T1G);
Chris@42 658 T2g = FNMS(T1F, T1E, T1D * T1G);
Chris@42 659 }
Chris@42 660 {
Chris@42 661 E T1J, T1L, T1I, T1K;
Chris@42 662 T1J = cr[WS(rs, 7)];
Chris@42 663 T1L = ci[WS(rs, 7)];
Chris@42 664 T1I = W[12];
Chris@42 665 T1K = W[13];
Chris@42 666 T1M = FMA(T1I, T1J, T1K * T1L);
Chris@42 667 T2h = FNMS(T1K, T1J, T1I * T1L);
Chris@42 668 }
Chris@42 669 T1N = T1H + T1M;
Chris@42 670 T3j = T2g + T2h;
Chris@42 671 T2i = T2g - T2h;
Chris@42 672 T2N = T1H - T1M;
Chris@42 673 }
Chris@42 674 {
Chris@42 675 E T1k, T2b, T1p, T2c;
Chris@42 676 {
Chris@42 677 E T1h, T1j, T1g, T1i;
Chris@42 678 T1h = cr[WS(rs, 13)];
Chris@42 679 T1j = ci[WS(rs, 13)];
Chris@42 680 T1g = W[24];
Chris@42 681 T1i = W[25];
Chris@42 682 T1k = FMA(T1g, T1h, T1i * T1j);
Chris@42 683 T2b = FNMS(T1i, T1h, T1g * T1j);
Chris@42 684 }
Chris@42 685 {
Chris@42 686 E T1m, T1o, T1l, T1n;
Chris@42 687 T1m = cr[WS(rs, 3)];
Chris@42 688 T1o = ci[WS(rs, 3)];
Chris@42 689 T1l = W[4];
Chris@42 690 T1n = W[5];
Chris@42 691 T1p = FMA(T1l, T1m, T1n * T1o);
Chris@42 692 T2c = FNMS(T1n, T1m, T1l * T1o);
Chris@42 693 }
Chris@42 694 T1q = T1k + T1p;
Chris@42 695 T3h = T2b + T2c;
Chris@42 696 T2d = T2b - T2c;
Chris@42 697 T2I = T1k - T1p;
Chris@42 698 }
Chris@42 699 {
Chris@42 700 E T1w, T2K, T1B, T2L;
Chris@42 701 {
Chris@42 702 E T1t, T1v, T1s, T1u;
Chris@42 703 T1t = cr[WS(rs, 12)];
Chris@42 704 T1v = ci[WS(rs, 12)];
Chris@42 705 T1s = W[22];
Chris@42 706 T1u = W[23];
Chris@42 707 T1w = FMA(T1s, T1t, T1u * T1v);
Chris@42 708 T2K = FNMS(T1u, T1t, T1s * T1v);
Chris@42 709 }
Chris@42 710 {
Chris@42 711 E T1y, T1A, T1x, T1z;
Chris@42 712 T1y = cr[WS(rs, 2)];
Chris@42 713 T1A = ci[WS(rs, 2)];
Chris@42 714 T1x = W[2];
Chris@42 715 T1z = W[3];
Chris@42 716 T1B = FMA(T1x, T1y, T1z * T1A);
Chris@42 717 T2L = FNMS(T1z, T1y, T1x * T1A);
Chris@42 718 }
Chris@42 719 T1C = T1w + T1B;
Chris@42 720 T3k = T2K + T2L;
Chris@42 721 T2f = T1w - T1B;
Chris@42 722 T2M = T2K - T2L;
Chris@42 723 }
Chris@42 724 T1r = T1f - T1q;
Chris@42 725 T1O = T1C - T1N;
Chris@42 726 T1P = T1r + T1O;
Chris@42 727 T3i = T3g - T3h;
Chris@42 728 T3l = T3j - T3k;
Chris@42 729 T3J = T3l - T3i;
Chris@42 730 T3D = T3g + T3h;
Chris@42 731 T3E = T3k + T3j;
Chris@42 732 T44 = T3D + T3E;
Chris@42 733 T1V = T1f + T1q;
Chris@42 734 T1W = T1C + T1N;
Chris@42 735 T1X = T1V + T1W;
Chris@42 736 T2e = T2a - T2d;
Chris@42 737 T2j = T2f - T2i;
Chris@42 738 T2k = T2e + T2j;
Chris@42 739 T2W = T2H - T2I;
Chris@42 740 T2X = T2M - T2N;
Chris@42 741 T4f = T2W + T2X;
Chris@42 742 T33 = T2a + T2d;
Chris@42 743 T34 = T2f + T2i;
Chris@42 744 T35 = T33 + T34;
Chris@42 745 T2J = T2H + T2I;
Chris@42 746 T2O = T2M + T2N;
Chris@42 747 T4q = T2J + T2O;
Chris@42 748 }
/* Load block 3: elements 4, 14, 1, 11, 9, 19, 16, 6. */
Chris@42 749 {
Chris@42 750 E Tu, T3n, T1Z, T2w, T12, T3r, T27, T2z, TF, T3o, T22, T2x, TR, T3q, T24;
Chris@42 751 E T2C;
Chris@42 752 {
Chris@42 753 E To, T2u, Tt, T2v;
Chris@42 754 {
Chris@42 755 E Tl, Tn, Tk, Tm;
Chris@42 756 Tl = cr[WS(rs, 4)];
Chris@42 757 Tn = ci[WS(rs, 4)];
Chris@42 758 Tk = W[6];
Chris@42 759 Tm = W[7];
Chris@42 760 To = FMA(Tk, Tl, Tm * Tn);
Chris@42 761 T2u = FNMS(Tm, Tl, Tk * Tn);
Chris@42 762 }
Chris@42 763 {
Chris@42 764 E Tq, Ts, Tp, Tr;
Chris@42 765 Tq = cr[WS(rs, 14)];
Chris@42 766 Ts = ci[WS(rs, 14)];
Chris@42 767 Tp = W[26];
Chris@42 768 Tr = W[27];
Chris@42 769 Tt = FMA(Tp, Tq, Tr * Ts);
Chris@42 770 T2v = FNMS(Tr, Tq, Tp * Ts);
Chris@42 771 }
Chris@42 772 Tu = To + Tt;
Chris@42 773 T3n = T2u + T2v;
Chris@42 774 T1Z = To - Tt;
Chris@42 775 T2w = T2u - T2v;
Chris@42 776 }
Chris@42 777 {
Chris@42 778 E TW, T25, T11, T26;
Chris@42 779 {
Chris@42 780 E TT, TV, TS, TU;
Chris@42 781 TT = cr[WS(rs, 1)];
Chris@42 782 TV = ci[WS(rs, 1)];
Chris@42 783 TS = W[0];
Chris@42 784 TU = W[1];
Chris@42 785 TW = FMA(TS, TT, TU * TV);
Chris@42 786 T25 = FNMS(TU, TT, TS * TV);
Chris@42 787 }
Chris@42 788 {
Chris@42 789 E TY, T10, TX, TZ;
Chris@42 790 TY = cr[WS(rs, 11)];
Chris@42 791 T10 = ci[WS(rs, 11)];
Chris@42 792 TX = W[20];
Chris@42 793 TZ = W[21];
Chris@42 794 T11 = FMA(TX, TY, TZ * T10);
Chris@42 795 T26 = FNMS(TZ, TY, TX * T10);
Chris@42 796 }
Chris@42 797 T12 = TW + T11;
Chris@42 798 T3r = T25 + T26;
Chris@42 799 T27 = T25 - T26;
Chris@42 800 T2z = T11 - TW;
Chris@42 801 }
Chris@42 802 {
Chris@42 803 E Tz, T20, TE, T21;
Chris@42 804 {
Chris@42 805 E Tw, Ty, Tv, Tx;
Chris@42 806 Tw = cr[WS(rs, 9)];
Chris@42 807 Ty = ci[WS(rs, 9)];
Chris@42 808 Tv = W[16];
Chris@42 809 Tx = W[17];
Chris@42 810 Tz = FMA(Tv, Tw, Tx * Ty);
Chris@42 811 T20 = FNMS(Tx, Tw, Tv * Ty);
Chris@42 812 }
Chris@42 813 {
Chris@42 814 E TB, TD, TA, TC;
Chris@42 815 TB = cr[WS(rs, 19)];
Chris@42 816 TD = ci[WS(rs, 19)];
Chris@42 817 TA = W[36];
Chris@42 818 TC = W[37];
Chris@42 819 TE = FMA(TA, TB, TC * TD);
Chris@42 820 T21 = FNMS(TC, TB, TA * TD);
Chris@42 821 }
Chris@42 822 TF = Tz + TE;
Chris@42 823 T3o = T20 + T21;
Chris@42 824 T22 = T20 - T21;
Chris@42 825 T2x = Tz - TE;
Chris@42 826 }
Chris@42 827 {
Chris@42 828 E TL, T2A, TQ, T2B;
Chris@42 829 {
Chris@42 830 E TI, TK, TH, TJ;
Chris@42 831 TI = cr[WS(rs, 16)];
Chris@42 832 TK = ci[WS(rs, 16)];
Chris@42 833 TH = W[30];
Chris@42 834 TJ = W[31];
Chris@42 835 TL = FMA(TH, TI, TJ * TK);
Chris@42 836 T2A = FNMS(TJ, TI, TH * TK);
Chris@42 837 }
Chris@42 838 {
Chris@42 839 E TN, TP, TM, TO;
Chris@42 840 TN = cr[WS(rs, 6)];
Chris@42 841 TP = ci[WS(rs, 6)];
Chris@42 842 TM = W[10];
Chris@42 843 TO = W[11];
Chris@42 844 TQ = FMA(TM, TN, TO * TP);
Chris@42 845 T2B = FNMS(TO, TN, TM * TP);
Chris@42 846 }
Chris@42 847 TR = TL + TQ;
Chris@42 848 T3q = T2A + T2B;
Chris@42 849 T24 = TL - TQ;
Chris@42 850 T2C = T2A - T2B;
Chris@42 851 }
Chris@42 852 TG = Tu - TF;
Chris@42 853 T13 = TR - T12;
Chris@42 854 T14 = TG + T13;
Chris@42 855 T3p = T3n - T3o;
Chris@42 856 T3s = T3q - T3r;
Chris@42 857 T3K = T3p + T3s;
Chris@42 858 T3A = T3n + T3o;
Chris@42 859 T3B = T3q + T3r;
Chris@42 860 T43 = T3A + T3B;
Chris@42 861 T1S = Tu + TF;
Chris@42 862 T1T = TR + T12;
Chris@42 863 T1U = T1S + T1T;
Chris@42 864 T23 = T1Z - T22;
Chris@42 865 T28 = T24 - T27;
Chris@42 866 T29 = T23 + T28;
Chris@42 867 T2T = T2w - T2x;
Chris@42 868 T2U = T2C + T2z;
Chris@42 869 T4e = T2T + T2U;
Chris@42 870 T30 = T1Z + T22;
Chris@42 871 T31 = T24 + T27;
Chris@42 872 T32 = T30 + T31;
Chris@42 873 T2y = T2w + T2x;
Chris@42 874 T2D = T2z - T2C;
Chris@42 875 T4p = T2D - T2y;
Chris@42 876 }
/* Output block 1: stores ci[9], ci[5], cr[6], cr[2], ci[1]. */
Chris@42 877 {
Chris@42 878 E T3e, T1Q, T3d, T3u, T3w, T3m, T3t, T3v, T3f;
Chris@42 879 T3e = KP559016994 * (T14 - T1P);
Chris@42 880 T1Q = T14 + T1P;
Chris@42 881 T3d = FNMS(KP250000000, T1Q, Tj);
Chris@42 882 T3m = T3i + T3l;
Chris@42 883 T3t = T3p - T3s;
Chris@42 884 T3u = FNMS(KP587785252, T3t, KP951056516 * T3m);
Chris@42 885 T3w = FMA(KP951056516, T3t, KP587785252 * T3m);
Chris@42 886 ci[WS(rs, 9)] = Tj + T1Q;
Chris@42 887 T3v = T3e + T3d;
Chris@42 888 ci[WS(rs, 5)] = T3v - T3w;
Chris@42 889 cr[WS(rs, 6)] = T3v + T3w;
Chris@42 890 T3f = T3d - T3e;
Chris@42 891 cr[WS(rs, 2)] = T3f - T3u;
Chris@42 892 ci[WS(rs, 1)] = T3f + T3u;
Chris@42 893 }
/* Output block 2: stores cr[5], ci[2], ci[6], cr[1], cr[9]. */
Chris@42 894 {
Chris@42 895 E T36, T38, T39, T2Z, T3c, T2V, T2Y, T3b, T3a;
Chris@42 896 T36 = KP559016994 * (T32 - T35);
Chris@42 897 T38 = T32 + T35;
Chris@42 898 T39 = FNMS(KP250000000, T38, T37);
Chris@42 899 T2V = T2T - T2U;
Chris@42 900 T2Y = T2W - T2X;
Chris@42 901 T2Z = FMA(KP951056516, T2V, KP587785252 * T2Y);
Chris@42 902 T3c = FNMS(KP587785252, T2V, KP951056516 * T2Y);
Chris@42 903 cr[WS(rs, 5)] = T37 + T38;
Chris@42 904 T3b = T39 - T36;
Chris@42 905 ci[WS(rs, 2)] = T3b - T3c;
Chris@42 906 ci[WS(rs, 6)] = T3c + T3b;
Chris@42 907 T3a = T36 + T39;
Chris@42 908 cr[WS(rs, 1)] = T2Z + T3a;
Chris@42 909 cr[WS(rs, 9)] = T3a - T2Z;
Chris@42 910 }
/* Output block 3: stores cr[0], ci[7], cr[8], cr[4], ci[3]. */
Chris@42 911 {
Chris@42 912 E T3x, T1Y, T3y, T3G, T3I, T3C, T3F, T3H, T3z;
Chris@42 913 T3x = KP559016994 * (T1U - T1X);
Chris@42 914 T1Y = T1U + T1X;
Chris@42 915 T3y = FNMS(KP250000000, T1Y, T1R);
Chris@42 916 T3C = T3A - T3B;
Chris@42 917 T3F = T3D - T3E;
Chris@42 918 T3G = FMA(KP951056516, T3C, KP587785252 * T3F);
Chris@42 919 T3I = FNMS(KP587785252, T3C, KP951056516 * T3F);
Chris@42 920 cr[0] = T1R + T1Y;
Chris@42 921 T3H = T3y - T3x;
Chris@42 922 ci[WS(rs, 7)] = T3H - T3I;
Chris@42 923 cr[WS(rs, 8)] = T3H + T3I;
Chris@42 924 T3z = T3x + T3y;
Chris@42 925 cr[WS(rs, 4)] = T3z - T3G;
Chris@42 926 ci[WS(rs, 3)] = T3z + T3G;
Chris@42 927 }
/* Output block 4: stores ci[4], cr[3], cr[7], ci[0], ci[8]. */
Chris@42 928 {
Chris@42 929 E T2l, T2r, T2s, T2Q, T2R, T2E, T2P, T2S, T2t;
Chris@42 930 T2l = KP559016994 * (T29 - T2k);
Chris@42 931 T2r = T29 + T2k;
Chris@42 932 T2s = FNMS(KP250000000, T2r, T2q);
Chris@42 933 T2E = T2y + T2D;
Chris@42 934 T2P = T2J - T2O;
Chris@42 935 T2Q = FMA(KP951056516, T2E, KP587785252 * T2P);
Chris@42 936 T2R = FNMS(KP587785252, T2E, KP951056516 * T2P);
Chris@42 937 ci[WS(rs, 4)] = T2q + T2r;
Chris@42 938 T2S = T2s - T2l;
Chris@42 939 cr[WS(rs, 3)] = T2R + T2S;
Chris@42 940 cr[WS(rs, 7)] = T2S - T2R;
Chris@42 941 T2t = T2l + T2s;
Chris@42 942 ci[0] = T2t - T2Q;
Chris@42 943 ci[WS(rs, 8)] = T2Q + T2t;
Chris@42 944 }
/* Output block 5: stores cr[10], cr[18], ci[17], cr[14], ci[13]. */
Chris@42 945 {
Chris@42 946 E T3U, T3L, T3V, T3T, T3X, T3R, T3S, T3Y, T3W;
Chris@42 947 T3U = KP559016994 * (T3K + T3J);
Chris@42 948 T3L = T3J - T3K;
Chris@42 949 T3V = FMA(KP250000000, T3L, T3Q);
Chris@42 950 T3R = T13 - TG;
Chris@42 951 T3S = T1r - T1O;
Chris@42 952 T3T = FNMS(KP587785252, T3S, KP951056516 * T3R);
Chris@42 953 T3X = FMA(KP587785252, T3R, KP951056516 * T3S);
Chris@42 954 cr[WS(rs, 10)] = T3L - T3Q;
Chris@42 955 T3Y = T3V - T3U;
Chris@42 956 cr[WS(rs, 18)] = T3X - T3Y;
Chris@42 957 ci[WS(rs, 17)] = T3X + T3Y;
Chris@42 958 T3W = T3U + T3V;
Chris@42 959 cr[WS(rs, 14)] = T3T - T3W;
Chris@42 960 ci[WS(rs, 13)] = T3T + T3W;
Chris@42 961 }
/* Output block 6: stores ci[14], ci[10], ci[18], cr[13], cr[17]. */
Chris@42 962 {
Chris@42 963 E T4g, T4k, T4l, T4d, T4n, T4b, T4c, T4o, T4m;
Chris@42 964 T4g = KP559016994 * (T4e - T4f);
Chris@42 965 T4k = T4e + T4f;
Chris@42 966 T4l = FNMS(KP250000000, T4k, T4j);
Chris@42 967 T4b = T33 - T34;
Chris@42 968 T4c = T30 - T31;
Chris@42 969 T4d = FNMS(KP587785252, T4c, KP951056516 * T4b);
Chris@42 970 T4n = FMA(KP951056516, T4c, KP587785252 * T4b);
Chris@42 971 ci[WS(rs, 14)] = T4k + T4j;
Chris@42 972 T4o = T4g + T4l;
Chris@42 973 ci[WS(rs, 10)] = T4n + T4o;
Chris@42 974 ci[WS(rs, 18)] = T4o - T4n;
Chris@42 975 T4m = T4g - T4l;
Chris@42 976 cr[WS(rs, 13)] = T4d + T4m;
Chris@42 977 cr[WS(rs, 17)] = T4m - T4d;
Chris@42 978 }
/* Output block 7: stores ci[19], cr[16], ci[15], cr[12], ci[11]. */
Chris@42 979 {
Chris@42 980 E T47, T45, T46, T41, T49, T3Z, T40, T4a, T48;
Chris@42 981 T47 = KP559016994 * (T43 - T44);
Chris@42 982 T45 = T43 + T44;
Chris@42 983 T46 = FNMS(KP250000000, T45, T42);
Chris@42 984 T3Z = T1S - T1T;
Chris@42 985 T40 = T1V - T1W;
Chris@42 986 T41 = FNMS(KP951056516, T40, KP587785252 * T3Z);
Chris@42 987 T49 = FMA(KP951056516, T3Z, KP587785252 * T40);
Chris@42 988 ci[WS(rs, 19)] = T45 + T42;
Chris@42 989 T4a = T47 + T46;
Chris@42 990 cr[WS(rs, 16)] = T49 - T4a;
Chris@42 991 ci[WS(rs, 15)] = T49 + T4a;
Chris@42 992 T48 = T46 - T47;
Chris@42 993 cr[WS(rs, 12)] = T41 - T48;
Chris@42 994 ci[WS(rs, 11)] = T41 + T48;
Chris@42 995 }
/* Output block 8: stores cr[15], ci[12], ci[16], cr[11], cr[19]. */
Chris@42 996 {
Chris@42 997 E T4w, T4r, T4x, T4v, T4z, T4t, T4u, T4A, T4y;
Chris@42 998 T4w = KP559016994 * (T4p + T4q);
Chris@42 999 T4r = T4p - T4q;
Chris@42 1000 T4x = FMA(KP250000000, T4r, T4s);
Chris@42 1001 T4t = T23 - T28;
Chris@42 1002 T4u = T2e - T2j;
Chris@42 1003 T4v = FMA(KP951056516, T4t, KP587785252 * T4u);
Chris@42 1004 T4z = FNMS(KP587785252, T4t, KP951056516 * T4u);
Chris@42 1005 cr[WS(rs, 15)] = T4r - T4s;
Chris@42 1006 T4A = T4w + T4x;
Chris@42 1007 ci[WS(rs, 12)] = T4z + T4A;
Chris@42 1008 ci[WS(rs, 16)] = T4A - T4z;
Chris@42 1009 T4y = T4w - T4x;
Chris@42 1010 cr[WS(rs, 11)] = T4v + T4y;
Chris@42 1011 cr[WS(rs, 19)] = T4y - T4v;
Chris@42 1012 }
Chris@42 1013 }
Chris@42 1014 }
Chris@42 1015 }
Chris@42 1016
/*
 * Twiddle instruction table for the non-FMA build of hf_20; it is passed to
 * the hc2hc descriptor (desc) that follows.  Two entries: a TW_FULL record
 * carrying the value 20 and a TW_NEXT record.
 * NOTE(review): TW_FULL/TW_NEXT are defined outside this file; presumably
 * TW_FULL requests the complete twiddle set for size 20 (hf_20 consumes
 * 38 = 2 * 19 W values per iteration) and TW_NEXT ends the list -- confirm
 * against the FFTW twiddle headers.
 */
Chris@42 1017 static const tw_instr twinstr[] = {
Chris@42 1018 {TW_FULL, 1, 20},
Chris@42 1019 {TW_NEXT, 1, 0}
Chris@42 1020 };
Chris@42 1021
/*
 * Codelet descriptor for the non-FMA build: size 20, name "hf_20", the
 * twinstr table, the GENUS object (not defined in this file; presumably
 * supplied by hf.h), and an operation-count record.  The first three counts
 * {184, 62, 62} match the header comment of this variant (184 additions,
 * 62 multiplications, 62 fused multiply/add); the meaning of the trailing 0
 * is not visible here.
 */
Chris@42 1022 static const hc2hc_desc desc = { 20, "hf_20", twinstr, &GENUS, {184, 62, 62, 0} };
Chris@42 1023
/*
 * Registration entry point (non-FMA build): hands the hf_20 function and
 * its descriptor to the planner p through khc2hc_register.  X() is a macro
 * defined elsewhere; presumably it applies FFTW's precision-specific name
 * prefix -- confirm.
 */
Chris@42 1024 void X(codelet_hf_20) (planner *p) {
Chris@42 1025 X(khc2hc_register) (p, hf_20, &desc);
Chris@42 1026 }
Chris@42 1027 #endif /* HAVE_FMA */