annotate src/fftw-3.3.5/rdft/scalar/r2cf/hf2_20.c @ 83:ae30d91d2ffe

Replace these with versions built using an older toolset (so as to avoid ABI incompatibilities when linking on Ubuntu 14.04 for packaging purposes)
author Chris Cannam
date Fri, 07 Feb 2020 11:51:13 +0000
parents 2cd0e3b3e1fd
children
rev   line source
Chris@42 1 /*
Chris@42 2 * Copyright (c) 2003, 2007-14 Matteo Frigo
Chris@42 3 * Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
Chris@42 4 *
Chris@42 5 * This program is free software; you can redistribute it and/or modify
Chris@42 6 * it under the terms of the GNU General Public License as published by
Chris@42 7 * the Free Software Foundation; either version 2 of the License, or
Chris@42 8 * (at your option) any later version.
Chris@42 9 *
Chris@42 10 * This program is distributed in the hope that it will be useful,
Chris@42 11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
Chris@42 12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
Chris@42 13 * GNU General Public License for more details.
Chris@42 14 *
Chris@42 15 * You should have received a copy of the GNU General Public License
Chris@42 16 * along with this program; if not, write to the Free Software
Chris@42 17 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
Chris@42 18 *
Chris@42 19 */
Chris@42 20
Chris@42 21 /* This file was automatically generated --- DO NOT EDIT */
Chris@42 22 /* Generated on Sat Jul 30 16:47:00 EDT 2016 */
Chris@42 23
Chris@42 24 #include "codelet-rdft.h"
Chris@42 25
Chris@42 26 #ifdef HAVE_FMA
Chris@42 27
Chris@42 28 /* Generated by: ../../../genfft/gen_hc2hc.native -fma -reorder-insns -schedule-for-pipeline -compact -variables 4 -pipeline-latency 4 -twiddle-log3 -precompute-twiddles -n 20 -dit -name hf2_20 -include hf.h */
Chris@42 29
Chris@42 30 /*
Chris@42 31 * This function contains 276 FP additions, 198 FP multiplications,
Chris@42 32 * (or, 136 additions, 58 multiplications, 140 fused multiply/add),
Chris@42 33 * 146 stack variables, 4 constants, and 80 memory accesses
Chris@42 34 */
Chris@42 35 #include "hf.h"
Chris@42 36
/*
 * hf2_20, HAVE_FMA variant.  Generated codelet body; per the generator
 * command line above it was emitted by gen_hc2hc with
 * -n 20 -dit -twiddle-log3 -precompute-twiddles.
 *
 * cr, ci  arrays that are read and then overwritten in place.  Each
 *         iteration touches elements WS(rs, 0) .. WS(rs, 19) of both.
 *         After every iteration cr moves forward by ms and ci moves
 *         backward by ms.
 * W       read-only table.  It is first repositioned to
 *         W + (mb - 1) * 8; each iteration reads W[0] .. W[7] and then
 *         advances W by 8.
 * rs      stride argument forwarded to WS() for element addressing.
 * mb, me  half-open iteration range: the loop runs for m in [mb, me).
 * ms      per-iteration step applied to cr (+ms) and ci (-ms).
 *
 * NOTE(review): E, R, INT, stride, DK, WS, FMA, FNMS, FMS and
 * MAKE_VOLATILE_STRIDE are defined in headers that are not visible
 * here.  Presumably FMA(a, b, c) = a*b + c, FMS(a, b, c) = a*b - c and
 * FNMS(a, b, c) = c - a*b -- confirm against the project headers.
 *
 * The statement order was chosen by the generator (-reorder-insns
 * -schedule-for-pipeline); the file is marked DO NOT EDIT, so changes
 * belong in the generator invocation, not here.
 */
Chris@42 37 static void hf2_20(R *cr, R *ci, const R *W, stride rs, INT mb, INT me, INT ms)
Chris@42 38 {
Chris@42 39 DK(KP951056516, +0.951056516295153572116439333379382143405698634);
Chris@42 40 DK(KP559016994, +0.559016994374947424102293417182819058860154590);
Chris@42 41 DK(KP250000000, +0.250000000000000000000000000000000000000000000);
Chris@42 42 DK(KP618033988, +0.618033988749894848204586834365638117720309180);
Chris@42 43 {
Chris@42 44 INT m;
Chris@42 45 for (m = mb, W = W + ((mb - 1) * 8); m < me; m = m + 1, cr = cr + ms, ci = ci - ms, W = W + 8, MAKE_VOLATILE_STRIDE(40, rs)) {
Chris@42 46 E T5o, T5u, T5w, T5q, T5n, T5p, T5v, T5r;
Chris@42 47 {
Chris@42 48 E T2, Th, Tf, T6, T5, Tl, T1p, T1n, Ti, T3, Tt, Tv, T24, T1f, T1D;
Chris@42 49 E Tb, T1P, Tm, T21, T1b, T7, T1A, Tw, T1H, T13, TA, T1L, T17, T1S, Tq;
Chris@42 50 E T1o, T2g, T1t, T2c, TO, TK;
Chris@42 51 {
Chris@42 52 E T1e, Ta, Tk, Tg;
/* Load the eight stored values W[0] .. W[7]; every other multiplier used
 * below is derived from products of these. */
Chris@42 53 T2 = W[0];
Chris@42 54 Th = W[3];
Chris@42 55 Tf = W[2];
Chris@42 56 T6 = W[5];
Chris@42 57 T5 = W[1];
Chris@42 58 Tk = T2 * Th;
Chris@42 59 Tg = T2 * Tf;
Chris@42 60 T1e = Tf * T6;
Chris@42 61 Ta = T2 * T6;
Chris@42 62 Tl = FMA(T5, Tf, Tk);
Chris@42 63 T1p = FNMS(T5, Tf, Tk);
Chris@42 64 T1n = FMA(T5, Th, Tg);
Chris@42 65 Ti = FNMS(T5, Th, Tg);
Chris@42 66 T3 = W[4];
Chris@42 67 Tt = W[6];
Chris@42 68 Tv = W[7];
Chris@42 69 {
Chris@42 70 E Tp, Tj, TN, TJ;
Chris@42 71 Tp = Ti * T6;
Chris@42 72 T24 = FMA(Th, T3, T1e);
Chris@42 73 T1f = FNMS(Th, T3, T1e);
Chris@42 74 T1D = FNMS(T5, T3, Ta);
Chris@42 75 Tb = FMA(T5, T3, Ta);
Chris@42 76 Tj = Ti * T3;
Chris@42 77 {
Chris@42 78 E T1a, T4, Tu, T1G;
Chris@42 79 T1a = Tf * T3;
Chris@42 80 T4 = T2 * T3;
Chris@42 81 Tu = Ti * Tt;
Chris@42 82 T1G = T2 * Tt;
Chris@42 83 {
Chris@42 84 E T12, Tz, T1K, T16;
Chris@42 85 T12 = Tf * Tt;
Chris@42 86 Tz = Ti * Tv;
Chris@42 87 T1K = T2 * Tv;
Chris@42 88 T16 = Tf * Tv;
Chris@42 89 T1P = FNMS(Tl, T6, Tj);
Chris@42 90 Tm = FMA(Tl, T6, Tj);
Chris@42 91 T21 = FNMS(Th, T6, T1a);
Chris@42 92 T1b = FMA(Th, T6, T1a);
Chris@42 93 T7 = FNMS(T5, T6, T4);
Chris@42 94 T1A = FMA(T5, T6, T4);
Chris@42 95 Tw = FMA(Tl, Tv, Tu);
Chris@42 96 T1H = FMA(T5, Tv, T1G);
Chris@42 97 T13 = FMA(Th, Tv, T12);
Chris@42 98 TA = FNMS(Tl, Tt, Tz);
Chris@42 99 T1L = FNMS(T5, Tt, T1K);
Chris@42 100 T17 = FNMS(Th, Tt, T16);
Chris@42 101 T1S = FMA(Tl, T3, Tp);
Chris@42 102 Tq = FNMS(Tl, T3, Tp);
Chris@42 103 }
Chris@42 104 }
Chris@42 105 T1o = T1n * T3;
Chris@42 106 T2g = T1n * Tv;
Chris@42 107 TN = Tm * Tv;
Chris@42 108 TJ = Tm * Tt;
Chris@42 109 T1t = T1n * T6;
Chris@42 110 T2c = T1n * Tt;
Chris@42 111 TO = FNMS(Tq, Tt, TN);
Chris@42 112 TK = FMA(Tq, Tv, TJ);
Chris@42 113 }
Chris@42 114 }
Chris@42 115 {
Chris@42 116 E Te, T2C, T4K, T57, T58, TD, T2H, T4L, T3u, T3Z, T11, T2v, T2P, T3P, T4n;
Chris@42 117 E T4v, T3C, T43, T2r, T2z, T3b, T3T, T4d, T4z, T3J, T42, T20, T2y, T34, T3S;
Chris@42 118 E T4g, T4y, T1c, T19, T1d, T3j, T1w, T2U, T1g, T1j, T1l;
Chris@42 119 {
Chris@42 120 E T2d, T2h, T2k, T1q, T1u, T2n, TL, TI, TM, T3q, TZ, T2N, TP, TS, TU;
Chris@42 121 {
Chris@42 122 E T1, T4J, T8, T9, Tc;
/* Input phase: cr[0] and ci[0] are used as loaded; every other cr/ci
 * element is combined with multipliers derived from W before the sums
 * and differences are formed. */
Chris@42 123 T1 = cr[0];
Chris@42 124 T4J = ci[0];
Chris@42 125 T8 = cr[WS(rs, 10)];
Chris@42 126 T2d = FMA(T1p, Tv, T2c);
Chris@42 127 T2h = FNMS(T1p, Tt, T2g);
Chris@42 128 T2k = FMA(T1p, T6, T1o);
Chris@42 129 T1q = FNMS(T1p, T6, T1o);
Chris@42 130 T1u = FMA(T1p, T3, T1t);
Chris@42 131 T2n = FNMS(T1p, T3, T1t);
Chris@42 132 T9 = T7 * T8;
Chris@42 133 Tc = ci[WS(rs, 10)];
Chris@42 134 {
Chris@42 135 E Tx, Ts, T2F, TC, T2E;
Chris@42 136 {
Chris@42 137 E Tn, Tr, To, T2D, T4I, Ty, TB, Td, T4H;
Chris@42 138 Tn = cr[WS(rs, 5)];
Chris@42 139 Tr = ci[WS(rs, 5)];
Chris@42 140 Tx = cr[WS(rs, 15)];
Chris@42 141 Td = FMA(Tb, Tc, T9);
Chris@42 142 T4H = T7 * Tc;
Chris@42 143 To = Tm * Tn;
Chris@42 144 T2D = Tm * Tr;
Chris@42 145 Te = T1 + Td;
Chris@42 146 T2C = T1 - Td;
Chris@42 147 T4I = FNMS(Tb, T8, T4H);
Chris@42 148 Ty = Tw * Tx;
Chris@42 149 TB = ci[WS(rs, 15)];
Chris@42 150 Ts = FMA(Tq, Tr, To);
Chris@42 151 T4K = T4I + T4J;
Chris@42 152 T57 = T4J - T4I;
Chris@42 153 T2F = Tw * TB;
Chris@42 154 TC = FMA(TA, TB, Ty);
Chris@42 155 T2E = FNMS(Tq, Tn, T2D);
Chris@42 156 }
Chris@42 157 {
Chris@42 158 E TF, TG, TH, TW, TY, T2G, T3p, TX, T2M;
Chris@42 159 TF = cr[WS(rs, 4)];
Chris@42 160 T2G = FNMS(TA, Tx, T2F);
Chris@42 161 T58 = Ts - TC;
Chris@42 162 TD = Ts + TC;
Chris@42 163 TG = Ti * TF;
Chris@42 164 T2H = T2E - T2G;
Chris@42 165 T4L = T2E + T2G;
Chris@42 166 TH = ci[WS(rs, 4)];
Chris@42 167 TW = cr[WS(rs, 19)];
Chris@42 168 TY = ci[WS(rs, 19)];
Chris@42 169 TL = cr[WS(rs, 14)];
Chris@42 170 TI = FMA(Tl, TH, TG);
Chris@42 171 T3p = Ti * TH;
Chris@42 172 TX = Tt * TW;
Chris@42 173 T2M = Tt * TY;
Chris@42 174 TM = TK * TL;
Chris@42 175 T3q = FNMS(Tl, TF, T3p);
Chris@42 176 TZ = FMA(Tv, TY, TX);
Chris@42 177 T2N = FNMS(Tv, TW, T2M);
Chris@42 178 TP = ci[WS(rs, 14)];
Chris@42 179 TS = cr[WS(rs, 9)];
Chris@42 180 TU = ci[WS(rs, 9)];
Chris@42 181 }
Chris@42 182 }
Chris@42 183 }
Chris@42 184 {
Chris@42 185 E T27, T26, T28, T3y, T2p, T39, T29, T2e, T2i;
Chris@42 186 {
Chris@42 187 E T22, T23, T25, T2l, T2o, T3x, T2m, T38;
Chris@42 188 {
Chris@42 189 E TR, T2J, T3s, TV, T2L, T4m, T3t;
Chris@42 190 T22 = cr[WS(rs, 12)];
Chris@42 191 {
Chris@42 192 E TQ, T3r, TT, T2K;
Chris@42 193 TQ = FMA(TO, TP, TM);
Chris@42 194 T3r = TK * TP;
Chris@42 195 TT = T3 * TS;
Chris@42 196 T2K = T3 * TU;
Chris@42 197 TR = TI + TQ;
Chris@42 198 T2J = TI - TQ;
Chris@42 199 T3s = FNMS(TO, TL, T3r);
Chris@42 200 TV = FMA(T6, TU, TT);
Chris@42 201 T2L = FNMS(T6, TS, T2K);
Chris@42 202 T23 = T21 * T22;
Chris@42 203 }
Chris@42 204 T4m = T3q + T3s;
Chris@42 205 T3t = T3q - T3s;
Chris@42 206 {
Chris@42 207 E T10, T3o, T4l, T2O;
Chris@42 208 T10 = TV + TZ;
Chris@42 209 T3o = TZ - TV;
Chris@42 210 T4l = T2L + T2N;
Chris@42 211 T2O = T2L - T2N;
Chris@42 212 T3u = T3o - T3t;
Chris@42 213 T3Z = T3t + T3o;
Chris@42 214 T11 = TR - T10;
Chris@42 215 T2v = TR + T10;
Chris@42 216 T2P = T2J - T2O;
Chris@42 217 T3P = T2J + T2O;
Chris@42 218 T4n = T4l - T4m;
Chris@42 219 T4v = T4m + T4l;
Chris@42 220 T25 = ci[WS(rs, 12)];
Chris@42 221 }
Chris@42 222 }
Chris@42 223 T2l = cr[WS(rs, 7)];
Chris@42 224 T2o = ci[WS(rs, 7)];
Chris@42 225 T27 = cr[WS(rs, 2)];
Chris@42 226 T26 = FMA(T24, T25, T23);
Chris@42 227 T3x = T21 * T25;
Chris@42 228 T2m = T2k * T2l;
Chris@42 229 T38 = T2k * T2o;
Chris@42 230 T28 = T1n * T27;
Chris@42 231 T3y = FNMS(T24, T22, T3x);
Chris@42 232 T2p = FMA(T2n, T2o, T2m);
Chris@42 233 T39 = FNMS(T2n, T2l, T38);
Chris@42 234 T29 = ci[WS(rs, 2)];
Chris@42 235 T2e = cr[WS(rs, 17)];
Chris@42 236 T2i = ci[WS(rs, 17)];
Chris@42 237 }
Chris@42 238 {
Chris@42 239 E T1I, T1F, T1J, T3F, T1Y, T32, T1M, T1Q, T1T;
Chris@42 240 {
Chris@42 241 E T1B, T1C, T1E, T1V, T1X, T3E, T1W, T31;
Chris@42 242 {
Chris@42 243 E T2b, T35, T3A, T2j, T37, T4c, T3B;
Chris@42 244 T1B = cr[WS(rs, 8)];
Chris@42 245 {
Chris@42 246 E T2a, T3z, T2f, T36;
Chris@42 247 T2a = FMA(T1p, T29, T28);
Chris@42 248 T3z = T1n * T29;
Chris@42 249 T2f = T2d * T2e;
Chris@42 250 T36 = T2d * T2i;
Chris@42 251 T2b = T26 + T2a;
Chris@42 252 T35 = T26 - T2a;
Chris@42 253 T3A = FNMS(T1p, T27, T3z);
Chris@42 254 T2j = FMA(T2h, T2i, T2f);
Chris@42 255 T37 = FNMS(T2h, T2e, T36);
Chris@42 256 T1C = T1A * T1B;
Chris@42 257 }
Chris@42 258 T4c = T3y + T3A;
Chris@42 259 T3B = T3y - T3A;
Chris@42 260 {
Chris@42 261 E T2q, T3w, T4b, T3a;
Chris@42 262 T2q = T2j + T2p;
Chris@42 263 T3w = T2p - T2j;
Chris@42 264 T4b = T37 + T39;
Chris@42 265 T3a = T37 - T39;
Chris@42 266 T3C = T3w - T3B;
Chris@42 267 T43 = T3B + T3w;
Chris@42 268 T2r = T2b - T2q;
Chris@42 269 T2z = T2b + T2q;
Chris@42 270 T3b = T35 - T3a;
Chris@42 271 T3T = T35 + T3a;
Chris@42 272 T4d = T4b - T4c;
Chris@42 273 T4z = T4c + T4b;
Chris@42 274 T1E = ci[WS(rs, 8)];
Chris@42 275 }
Chris@42 276 }
Chris@42 277 T1V = cr[WS(rs, 3)];
Chris@42 278 T1X = ci[WS(rs, 3)];
Chris@42 279 T1I = cr[WS(rs, 18)];
Chris@42 280 T1F = FMA(T1D, T1E, T1C);
Chris@42 281 T3E = T1A * T1E;
Chris@42 282 T1W = Tf * T1V;
Chris@42 283 T31 = Tf * T1X;
Chris@42 284 T1J = T1H * T1I;
Chris@42 285 T3F = FNMS(T1D, T1B, T3E);
Chris@42 286 T1Y = FMA(Th, T1X, T1W);
Chris@42 287 T32 = FNMS(Th, T1V, T31);
Chris@42 288 T1M = ci[WS(rs, 18)];
Chris@42 289 T1Q = cr[WS(rs, 13)];
Chris@42 290 T1T = ci[WS(rs, 13)];
Chris@42 291 }
Chris@42 292 {
Chris@42 293 E T14, T15, T18, T1r, T1v, T3i, T1s, T2T;
Chris@42 294 {
Chris@42 295 E T1O, T2Y, T3H, T1U, T30, T4f, T3I;
Chris@42 296 T14 = cr[WS(rs, 16)];
Chris@42 297 {
Chris@42 298 E T1N, T3G, T1R, T2Z;
Chris@42 299 T1N = FMA(T1L, T1M, T1J);
Chris@42 300 T3G = T1H * T1M;
Chris@42 301 T1R = T1P * T1Q;
Chris@42 302 T2Z = T1P * T1T;
Chris@42 303 T1O = T1F + T1N;
Chris@42 304 T2Y = T1F - T1N;
Chris@42 305 T3H = FNMS(T1L, T1I, T3G);
Chris@42 306 T1U = FMA(T1S, T1T, T1R);
Chris@42 307 T30 = FNMS(T1S, T1Q, T2Z);
Chris@42 308 T15 = T13 * T14;
Chris@42 309 }
Chris@42 310 T4f = T3F + T3H;
Chris@42 311 T3I = T3F - T3H;
Chris@42 312 {
Chris@42 313 E T1Z, T3D, T4e, T33;
Chris@42 314 T1Z = T1U + T1Y;
Chris@42 315 T3D = T1Y - T1U;
Chris@42 316 T4e = T30 + T32;
Chris@42 317 T33 = T30 - T32;
Chris@42 318 T3J = T3D - T3I;
Chris@42 319 T42 = T3I + T3D;
Chris@42 320 T20 = T1O - T1Z;
Chris@42 321 T2y = T1O + T1Z;
Chris@42 322 T34 = T2Y - T33;
Chris@42 323 T3S = T2Y + T33;
Chris@42 324 T4g = T4e - T4f;
Chris@42 325 T4y = T4f + T4e;
Chris@42 326 T18 = ci[WS(rs, 16)];
Chris@42 327 }
Chris@42 328 }
Chris@42 329 T1r = cr[WS(rs, 11)];
Chris@42 330 T1v = ci[WS(rs, 11)];
Chris@42 331 T1c = cr[WS(rs, 6)];
Chris@42 332 T19 = FMA(T17, T18, T15);
Chris@42 333 T3i = T13 * T18;
Chris@42 334 T1s = T1q * T1r;
Chris@42 335 T2T = T1q * T1v;
Chris@42 336 T1d = T1b * T1c;
Chris@42 337 T3j = FNMS(T17, T14, T3i);
Chris@42 338 T1w = FMA(T1u, T1v, T1s);
Chris@42 339 T2U = FNMS(T1u, T1r, T2T);
Chris@42 340 T1g = ci[WS(rs, 6)];
Chris@42 341 T1j = cr[WS(rs, 1)];
Chris@42 342 T1l = ci[WS(rs, 1)];
Chris@42 343 }
Chris@42 344 }
Chris@42 345 }
Chris@42 346 }
Chris@42 347 {
Chris@42 348 E T4F, T4Q, T4R, T5a, T4E, T5b, T2I, T5h, T5g, T4W, T4X, T53, T52, T5l, T5m;
Chris@42 349 E T5s, T2X, T3N, T3L, T3c, T5t;
Chris@42 350 {
Chris@42 351 E T2u, T3n, T2w, T2W, T4w, T4r, T4p, T45, T47, T3O, T3R, T4a, T4q, T3U;
Chris@42 352 {
Chris@42 353 E T4h, TE, T40, T3Q, T4k, T1z, T2s, T49, T48;
Chris@42 354 {
Chris@42 355 E T1i, T2Q, T3l, T1m, T2S, T4j, T3m;
Chris@42 356 T4h = T4d - T4g;
Chris@42 357 T4F = T4g + T4d;
Chris@42 358 {
Chris@42 359 E T1h, T3k, T1k, T2R;
Chris@42 360 T1h = FMA(T1f, T1g, T1d);
Chris@42 361 T3k = T1b * T1g;
Chris@42 362 T1k = T2 * T1j;
Chris@42 363 T2R = T2 * T1l;
Chris@42 364 T1i = T19 + T1h;
Chris@42 365 T2Q = T19 - T1h;
Chris@42 366 T3l = FNMS(T1f, T1c, T3k);
Chris@42 367 T1m = FMA(T5, T1l, T1k);
Chris@42 368 T2S = FNMS(T5, T1j, T2R);
Chris@42 369 }
Chris@42 370 TE = Te - TD;
Chris@42 371 T2u = Te + TD;
Chris@42 372 T4j = T3j + T3l;
Chris@42 373 T3m = T3j - T3l;
Chris@42 374 {
Chris@42 375 E T1x, T3h, T4i, T2V, T1y;
Chris@42 376 T1x = T1m + T1w;
Chris@42 377 T3h = T1w - T1m;
Chris@42 378 T4i = T2S + T2U;
Chris@42 379 T2V = T2S - T2U;
Chris@42 380 T3n = T3h - T3m;
Chris@42 381 T40 = T3m + T3h;
Chris@42 382 T1y = T1i - T1x;
Chris@42 383 T2w = T1i + T1x;
Chris@42 384 T2W = T2Q - T2V;
Chris@42 385 T3Q = T2Q + T2V;
Chris@42 386 T4k = T4i - T4j;
Chris@42 387 T4w = T4j + T4i;
Chris@42 388 T4Q = T1y - T11;
Chris@42 389 T1z = T11 + T1y;
Chris@42 390 T2s = T20 + T2r;
Chris@42 391 T4R = T20 - T2r;
Chris@42 392 }
Chris@42 393 }
Chris@42 394 {
Chris@42 395 E T41, T4o, T44, T2t;
Chris@42 396 T5a = T3Z + T40;
Chris@42 397 T41 = T3Z - T40;
Chris@42 398 T4o = T4k - T4n;
Chris@42 399 T4E = T4n + T4k;
Chris@42 400 T5b = T42 + T43;
Chris@42 401 T44 = T42 - T43;
Chris@42 402 T49 = T1z - T2s;
Chris@42 403 T2t = T1z + T2s;
Chris@42 404 T4r = FMA(KP618033988, T4h, T4o);
Chris@42 405 T4p = FNMS(KP618033988, T4o, T4h);
Chris@42 406 T45 = FMA(KP618033988, T44, T41);
Chris@42 407 T47 = FNMS(KP618033988, T41, T44);
/* First store.  Every cr/ci read of this iteration happens above this
 * point, so overwriting the arrays in place from here on cannot clobber
 * an input that is still needed. */
Chris@42 408 ci[WS(rs, 9)] = TE + T2t;
Chris@42 409 T48 = FNMS(KP250000000, T2t, TE);
Chris@42 410 }
Chris@42 411 T3O = T2C + T2H;
Chris@42 412 T2I = T2C - T2H;
Chris@42 413 T5h = T3P - T3Q;
Chris@42 414 T3R = T3P + T3Q;
Chris@42 415 T4a = FNMS(KP559016994, T49, T48);
Chris@42 416 T4q = FMA(KP559016994, T49, T48);
Chris@42 417 T3U = T3S + T3T;
Chris@42 418 T5g = T3S - T3T;
Chris@42 419 }
Chris@42 420 {
Chris@42 421 E T2x, T4B, T4D, T2A, T3Y, T46;
Chris@42 422 {
Chris@42 423 E T4x, T3X, T3V, T4A, T3W;
Chris@42 424 T4W = T4v + T4w;
Chris@42 425 T4x = T4v - T4w;
Chris@42 426 ci[WS(rs, 1)] = FMA(KP951056516, T4p, T4a);
Chris@42 427 cr[WS(rs, 2)] = FNMS(KP951056516, T4p, T4a);
Chris@42 428 cr[WS(rs, 6)] = FMA(KP951056516, T4r, T4q);
Chris@42 429 ci[WS(rs, 5)] = FNMS(KP951056516, T4r, T4q);
Chris@42 430 T3X = T3R - T3U;
Chris@42 431 T3V = T3R + T3U;
Chris@42 432 T4A = T4y - T4z;
Chris@42 433 T4X = T4y + T4z;
Chris@42 434 T2x = T2v + T2w;
Chris@42 435 T53 = T2v - T2w;
Chris@42 436 cr[WS(rs, 5)] = T3O + T3V;
Chris@42 437 T3W = FNMS(KP250000000, T3V, T3O);
Chris@42 438 T4B = FMA(KP618033988, T4A, T4x);
Chris@42 439 T4D = FNMS(KP618033988, T4x, T4A);
Chris@42 440 T52 = T2z - T2y;
Chris@42 441 T2A = T2y + T2z;
Chris@42 442 T3Y = FMA(KP559016994, T3X, T3W);
Chris@42 443 T46 = FNMS(KP559016994, T3X, T3W);
Chris@42 444 }
Chris@42 445 {
Chris@42 446 E T3v, T4t, T4s, T3K, T2B, T4u, T4C;
Chris@42 447 T3v = T3n - T3u;
Chris@42 448 T5l = T3u + T3n;
Chris@42 449 T2B = T2x + T2A;
Chris@42 450 T4t = T2x - T2A;
Chris@42 451 cr[WS(rs, 9)] = FNMS(KP951056516, T45, T3Y);
Chris@42 452 cr[WS(rs, 1)] = FMA(KP951056516, T45, T3Y);
Chris@42 453 ci[WS(rs, 6)] = FMA(KP951056516, T47, T46);
Chris@42 454 ci[WS(rs, 2)] = FNMS(KP951056516, T47, T46);
Chris@42 455 cr[0] = T2u + T2B;
Chris@42 456 T4s = FNMS(KP250000000, T2B, T2u);
Chris@42 457 T5m = T3J + T3C;
Chris@42 458 T3K = T3C - T3J;
Chris@42 459 T5s = T2P - T2W;
Chris@42 460 T2X = T2P + T2W;
Chris@42 461 T4u = FMA(KP559016994, T4t, T4s);
Chris@42 462 T4C = FNMS(KP559016994, T4t, T4s);
Chris@42 463 T3N = FNMS(KP618033988, T3v, T3K);
Chris@42 464 T3L = FMA(KP618033988, T3K, T3v);
Chris@42 465 ci[WS(rs, 3)] = FMA(KP951056516, T4B, T4u);
Chris@42 466 cr[WS(rs, 4)] = FNMS(KP951056516, T4B, T4u);
Chris@42 467 cr[WS(rs, 8)] = FMA(KP951056516, T4D, T4C);
Chris@42 468 ci[WS(rs, 7)] = FNMS(KP951056516, T4D, T4C);
Chris@42 469 T3c = T34 + T3b;
Chris@42 470 T5t = T34 - T3b;
Chris@42 471 }
Chris@42 472 }
Chris@42 473 }
Chris@42 474 {
Chris@42 475 E T4V, T5i, T5k, T59, T5e, T5c;
Chris@42 476 {
Chris@42 477 E T4M, T3f, T4U, T4S, T3e, T3d;
Chris@42 478 T4V = T4L + T4K;
Chris@42 479 T4M = T4K - T4L;
Chris@42 480 T3f = T2X - T3c;
Chris@42 481 T3d = T2X + T3c;
Chris@42 482 T4U = FMA(KP618033988, T4Q, T4R);
Chris@42 483 T4S = FNMS(KP618033988, T4R, T4Q);
Chris@42 484 ci[WS(rs, 4)] = T2I + T3d;
Chris@42 485 T3e = FNMS(KP250000000, T3d, T2I);
Chris@42 486 {
Chris@42 487 E T4O, T4N, T3g, T3M, T4G, T4T, T4P;
Chris@42 488 T3g = FMA(KP559016994, T3f, T3e);
Chris@42 489 T3M = FNMS(KP559016994, T3f, T3e);
Chris@42 490 T4O = T4F - T4E;
Chris@42 491 T4G = T4E + T4F;
Chris@42 492 ci[WS(rs, 8)] = FMA(KP951056516, T3L, T3g);
Chris@42 493 ci[0] = FNMS(KP951056516, T3L, T3g);
Chris@42 494 cr[WS(rs, 7)] = FNMS(KP951056516, T3N, T3M);
Chris@42 495 cr[WS(rs, 3)] = FMA(KP951056516, T3N, T3M);
Chris@42 496 cr[WS(rs, 10)] = T4G - T4M;
Chris@42 497 T4N = FMA(KP250000000, T4G, T4M);
Chris@42 498 T5i = FNMS(KP618033988, T5h, T5g);
Chris@42 499 T5k = FMA(KP618033988, T5g, T5h);
Chris@42 500 T59 = T57 - T58;
Chris@42 501 T5o = T58 + T57;
Chris@42 502 T4T = FNMS(KP559016994, T4O, T4N);
Chris@42 503 T4P = FMA(KP559016994, T4O, T4N);
Chris@42 504 ci[WS(rs, 13)] = FMA(KP951056516, T4S, T4P);
Chris@42 505 cr[WS(rs, 14)] = FMS(KP951056516, T4S, T4P);
Chris@42 506 ci[WS(rs, 17)] = FMA(KP951056516, T4U, T4T);
Chris@42 507 cr[WS(rs, 18)] = FMS(KP951056516, T4U, T4T);
Chris@42 508 T5e = T5a - T5b;
Chris@42 509 T5c = T5a + T5b;
Chris@42 510 }
Chris@42 511 }
Chris@42 512 {
Chris@42 513 E T56, T54, T4Y, T50, T5d, T5f, T5j, T4Z, T55, T51;
Chris@42 514 ci[WS(rs, 14)] = T5c + T59;
Chris@42 515 T5d = FNMS(KP250000000, T5c, T59);
Chris@42 516 T56 = FNMS(KP618033988, T52, T53);
Chris@42 517 T54 = FMA(KP618033988, T53, T52);
Chris@42 518 T5f = FNMS(KP559016994, T5e, T5d);
Chris@42 519 T5j = FMA(KP559016994, T5e, T5d);
Chris@42 520 cr[WS(rs, 17)] = -(FMA(KP951056516, T5i, T5f));
Chris@42 521 cr[WS(rs, 13)] = FMS(KP951056516, T5i, T5f);
Chris@42 522 ci[WS(rs, 18)] = FNMS(KP951056516, T5k, T5j);
Chris@42 523 ci[WS(rs, 10)] = FMA(KP951056516, T5k, T5j);
Chris@42 524 T4Y = T4W + T4X;
Chris@42 525 T50 = T4W - T4X;
Chris@42 526 ci[WS(rs, 19)] = T4Y + T4V;
Chris@42 527 T4Z = FNMS(KP250000000, T4Y, T4V);
Chris@42 528 T5u = FMA(KP618033988, T5t, T5s);
Chris@42 529 T5w = FNMS(KP618033988, T5s, T5t);
Chris@42 530 T55 = FMA(KP559016994, T50, T4Z);
Chris@42 531 T51 = FNMS(KP559016994, T50, T4Z);
Chris@42 532 ci[WS(rs, 11)] = FMA(KP951056516, T54, T51);
Chris@42 533 cr[WS(rs, 12)] = FMS(KP951056516, T54, T51);
Chris@42 534 ci[WS(rs, 15)] = FMA(KP951056516, T56, T55);
Chris@42 535 cr[WS(rs, 16)] = FMS(KP951056516, T56, T55);
Chris@42 536 T5q = T5l - T5m;
Chris@42 537 T5n = T5l + T5m;
Chris@42 538 }
Chris@42 539 }
Chris@42 540 }
Chris@42 541 }
Chris@42 542 }
/* Final five stores.  They use T5n, T5o, T5q, T5u and T5w, which are
 * declared at loop scope so that they outlive the nested blocks above. */
Chris@42 543 cr[WS(rs, 15)] = T5n - T5o;
Chris@42 544 T5p = FMA(KP250000000, T5n, T5o);
Chris@42 545 T5v = FMA(KP559016994, T5q, T5p);
Chris@42 546 T5r = FNMS(KP559016994, T5q, T5p);
Chris@42 547 cr[WS(rs, 19)] = -(FMA(KP951056516, T5u, T5r));
Chris@42 548 cr[WS(rs, 11)] = FMS(KP951056516, T5u, T5r);
Chris@42 549 ci[WS(rs, 16)] = FNMS(KP951056516, T5w, T5v);
Chris@42 550 ci[WS(rs, 12)] = FMA(KP951056516, T5w, T5v);
Chris@42 551 }
Chris@42 552 }
Chris@42 553 }
Chris@42 554
/*
 * Twiddle instruction table referenced by desc below: four TW_CEXP
 * entries whose last fields are 1, 3, 9 and 19, closed by a TW_NEXT
 * entry.
 * NOTE(review): the tw_instr field layout is declared in a header not
 * visible here.  Four entries would account for the eight W values
 * hf2_20 reads per iteration if each entry yields two values -- confirm.
 */
Chris@42 555 static const tw_instr twinstr[] = {
Chris@42 556 {TW_CEXP, 1, 1},
Chris@42 557 {TW_CEXP, 1, 3},
Chris@42 558 {TW_CEXP, 1, 9},
Chris@42 559 {TW_CEXP, 1, 19},
Chris@42 560 {TW_NEXT, 1, 0}
Chris@42 561 };
Chris@42 562
/*
 * Descriptor passed to the registration call below.  It bundles the
 * value 20, the codelet name string, the twiddle table, &GENUS and four
 * counts.  The first three counts (136, 58, 140) equal the additions,
 * multiplications and fused multiply/add figures quoted in the generated
 * header comment of this HAVE_FMA variant.
 * NOTE(review): the leading 20 presumably is the transform size (the
 * generator was run with -n 20); the meaning of the fourth count is not
 * visible here -- confirm against the hc2hc_desc declaration.
 */
Chris@42 563 static const hc2hc_desc desc = { 20, "hf2_20", twinstr, &GENUS, {136, 58, 140, 0} };
Chris@42 564
/*
 * Registration entry point: passes the hf2_20 function and its
 * descriptor to X(khc2hc_register) together with planner p.
 * NOTE(review): X() is a project macro defined elsewhere, presumably a
 * symbol-name prefixer -- confirm.
 */
Chris@42 565 void X(codelet_hf2_20) (planner *p) {
Chris@42 566 X(khc2hc_register) (p, hf2_20, &desc);
Chris@42 567 }
Chris@42 568 #else /* HAVE_FMA */
Chris@42 569
Chris@42 570 /* Generated by: ../../../genfft/gen_hc2hc.native -compact -variables 4 -pipeline-latency 4 -twiddle-log3 -precompute-twiddles -n 20 -dit -name hf2_20 -include hf.h */
Chris@42 571
Chris@42 572 /*
Chris@42 573 * This function contains 276 FP additions, 164 FP multiplications,
Chris@42 574 * (or, 204 additions, 92 multiplications, 72 fused multiply/add),
Chris@42 575 * 123 stack variables, 4 constants, and 80 memory accesses
Chris@42 576 */
Chris@42 577 #include "hf.h"
Chris@42 578
Chris@42 579 static void hf2_20(R *cr, R *ci, const R *W, stride rs, INT mb, INT me, INT ms)
Chris@42 580 {
Chris@42 581 DK(KP587785252, +0.587785252292473129168705954639072768597652438);
Chris@42 582 DK(KP951056516, +0.951056516295153572116439333379382143405698634);
Chris@42 583 DK(KP250000000, +0.250000000000000000000000000000000000000000000);
Chris@42 584 DK(KP559016994, +0.559016994374947424102293417182819058860154590);
Chris@42 585 {
Chris@42 586 INT m;
Chris@42 587 for (m = mb, W = W + ((mb - 1) * 8); m < me; m = m + 1, cr = cr + ms, ci = ci - ms, W = W + 8, MAKE_VOLATILE_STRIDE(40, rs)) {
Chris@42 588 E T2, T5, Tg, Ti, Tk, To, T1h, T1f, T6, T3, T8, T14, T1Q, Tc, T1O;
Chris@42 589 E T1v, T18, T1t, T1n, T24, T1j, T22, Tq, Tu, T1E, T1G, Tx, Ty, Tz, TJ;
Chris@42 590 E T1Z, TB, T1X, T1A, TZ, TL, T1y, TX;
Chris@42 591 {
Chris@42 592 E T7, T16, Ta, T13, T4, T17, Tb, T12;
Chris@42 593 {
Chris@42 594 E Th, Tn, Tj, Tm;
Chris@42 595 T2 = W[0];
Chris@42 596 T5 = W[1];
Chris@42 597 Tg = W[2];
Chris@42 598 Ti = W[3];
Chris@42 599 Th = T2 * Tg;
Chris@42 600 Tn = T5 * Tg;
Chris@42 601 Tj = T5 * Ti;
Chris@42 602 Tm = T2 * Ti;
Chris@42 603 Tk = Th - Tj;
Chris@42 604 To = Tm + Tn;
Chris@42 605 T1h = Tm - Tn;
Chris@42 606 T1f = Th + Tj;
Chris@42 607 T6 = W[5];
Chris@42 608 T7 = T5 * T6;
Chris@42 609 T16 = Tg * T6;
Chris@42 610 Ta = T2 * T6;
Chris@42 611 T13 = Ti * T6;
Chris@42 612 T3 = W[4];
Chris@42 613 T4 = T2 * T3;
Chris@42 614 T17 = Ti * T3;
Chris@42 615 Tb = T5 * T3;
Chris@42 616 T12 = Tg * T3;
Chris@42 617 }
Chris@42 618 T8 = T4 - T7;
Chris@42 619 T14 = T12 + T13;
Chris@42 620 T1Q = T16 + T17;
Chris@42 621 Tc = Ta + Tb;
Chris@42 622 T1O = T12 - T13;
Chris@42 623 T1v = Ta - Tb;
Chris@42 624 T18 = T16 - T17;
Chris@42 625 T1t = T4 + T7;
Chris@42 626 {
Chris@42 627 E T1l, T1m, T1g, T1i;
Chris@42 628 T1l = T1f * T6;
Chris@42 629 T1m = T1h * T3;
Chris@42 630 T1n = T1l + T1m;
Chris@42 631 T24 = T1l - T1m;
Chris@42 632 T1g = T1f * T3;
Chris@42 633 T1i = T1h * T6;
Chris@42 634 T1j = T1g - T1i;
Chris@42 635 T22 = T1g + T1i;
Chris@42 636 {
Chris@42 637 E Tl, Tp, Ts, Tt;
Chris@42 638 Tl = Tk * T3;
Chris@42 639 Tp = To * T6;
Chris@42 640 Tq = Tl + Tp;
Chris@42 641 Ts = Tk * T6;
Chris@42 642 Tt = To * T3;
Chris@42 643 Tu = Ts - Tt;
Chris@42 644 T1E = Tl - Tp;
Chris@42 645 T1G = Ts + Tt;
Chris@42 646 Tx = W[6];
Chris@42 647 Ty = W[7];
Chris@42 648 Tz = FMA(Tk, Tx, To * Ty);
Chris@42 649 TJ = FMA(Tq, Tx, Tu * Ty);
Chris@42 650 T1Z = FNMS(T1h, Tx, T1f * Ty);
Chris@42 651 TB = FNMS(To, Tx, Tk * Ty);
Chris@42 652 T1X = FMA(T1f, Tx, T1h * Ty);
Chris@42 653 T1A = FNMS(T5, Tx, T2 * Ty);
Chris@42 654 TZ = FNMS(Ti, Tx, Tg * Ty);
Chris@42 655 TL = FNMS(Tu, Tx, Tq * Ty);
Chris@42 656 T1y = FMA(T2, Tx, T5 * Ty);
Chris@42 657 TX = FMA(Tg, Tx, Ti * Ty);
Chris@42 658 }
Chris@42 659 }
Chris@42 660 }
Chris@42 661 {
Chris@42 662 E TF, T2b, T4D, T4M, T2K, T3r, T4a, T4m, T1N, T28, T29, T3C, T3F, T43, T3X;
Chris@42 663 E T3Y, T4o, T2f, T2g, T2h, T2y, T2D, T2E, T3g, T3h, T4z, T3n, T3o, T3p, T33;
Chris@42 664 E T38, T4K, TW, T1r, T1s, T3J, T3M, T44, T3U, T3V, T4n, T2c, T2d, T2e, T2n;
Chris@42 665 E T2s, T2t, T3d, T3e, T4y, T3k, T3l, T3m, T2S, T2X, T4J;
Chris@42 666 {
Chris@42 667 E T1, T47, Te, T46, Tw, T2H, TD, T2I, T9, Td;
Chris@42 668 T1 = cr[0];
Chris@42 669 T47 = ci[0];
Chris@42 670 T9 = cr[WS(rs, 10)];
Chris@42 671 Td = ci[WS(rs, 10)];
Chris@42 672 Te = FMA(T8, T9, Tc * Td);
Chris@42 673 T46 = FNMS(Tc, T9, T8 * Td);
Chris@42 674 {
Chris@42 675 E Tr, Tv, TA, TC;
Chris@42 676 Tr = cr[WS(rs, 5)];
Chris@42 677 Tv = ci[WS(rs, 5)];
Chris@42 678 Tw = FMA(Tq, Tr, Tu * Tv);
Chris@42 679 T2H = FNMS(Tu, Tr, Tq * Tv);
Chris@42 680 TA = cr[WS(rs, 15)];
Chris@42 681 TC = ci[WS(rs, 15)];
Chris@42 682 TD = FMA(Tz, TA, TB * TC);
Chris@42 683 T2I = FNMS(TB, TA, Tz * TC);
Chris@42 684 }
Chris@42 685 {
Chris@42 686 E Tf, TE, T4B, T4C;
Chris@42 687 Tf = T1 + Te;
Chris@42 688 TE = Tw + TD;
Chris@42 689 TF = Tf - TE;
Chris@42 690 T2b = Tf + TE;
Chris@42 691 T4B = T47 - T46;
Chris@42 692 T4C = Tw - TD;
Chris@42 693 T4D = T4B - T4C;
Chris@42 694 T4M = T4C + T4B;
Chris@42 695 }
Chris@42 696 {
Chris@42 697 E T2G, T2J, T48, T49;
Chris@42 698 T2G = T1 - Te;
Chris@42 699 T2J = T2H - T2I;
Chris@42 700 T2K = T2G - T2J;
Chris@42 701 T3r = T2G + T2J;
Chris@42 702 T48 = T46 + T47;
Chris@42 703 T49 = T2H + T2I;
Chris@42 704 T4a = T48 - T49;
Chris@42 705 T4m = T49 + T48;
Chris@42 706 }
Chris@42 707 }
Chris@42 708 {
Chris@42 709 E T1D, T3A, T2u, T31, T27, T3D, T2C, T37, T1M, T3B, T2x, T32, T1W, T3E, T2z;
Chris@42 710 E T36;
Chris@42 711 {
Chris@42 712 E T1x, T2Z, T1C, T30;
Chris@42 713 {
Chris@42 714 E T1u, T1w, T1z, T1B;
Chris@42 715 T1u = cr[WS(rs, 8)];
Chris@42 716 T1w = ci[WS(rs, 8)];
Chris@42 717 T1x = FMA(T1t, T1u, T1v * T1w);
Chris@42 718 T2Z = FNMS(T1v, T1u, T1t * T1w);
Chris@42 719 T1z = cr[WS(rs, 18)];
Chris@42 720 T1B = ci[WS(rs, 18)];
Chris@42 721 T1C = FMA(T1y, T1z, T1A * T1B);
Chris@42 722 T30 = FNMS(T1A, T1z, T1y * T1B);
Chris@42 723 }
Chris@42 724 T1D = T1x + T1C;
Chris@42 725 T3A = T2Z + T30;
Chris@42 726 T2u = T1x - T1C;
Chris@42 727 T31 = T2Z - T30;
Chris@42 728 }
Chris@42 729 {
Chris@42 730 E T21, T2A, T26, T2B;
Chris@42 731 {
Chris@42 732 E T1Y, T20, T23, T25;
Chris@42 733 T1Y = cr[WS(rs, 17)];
Chris@42 734 T20 = ci[WS(rs, 17)];
Chris@42 735 T21 = FMA(T1X, T1Y, T1Z * T20);
Chris@42 736 T2A = FNMS(T1Z, T1Y, T1X * T20);
Chris@42 737 T23 = cr[WS(rs, 7)];
Chris@42 738 T25 = ci[WS(rs, 7)];
Chris@42 739 T26 = FMA(T22, T23, T24 * T25);
Chris@42 740 T2B = FNMS(T24, T23, T22 * T25);
Chris@42 741 }
Chris@42 742 T27 = T21 + T26;
Chris@42 743 T3D = T2A + T2B;
Chris@42 744 T2C = T2A - T2B;
Chris@42 745 T37 = T21 - T26;
Chris@42 746 }
Chris@42 747 {
Chris@42 748 E T1I, T2v, T1L, T2w;
Chris@42 749 {
Chris@42 750 E T1F, T1H, T1J, T1K;
Chris@42 751 T1F = cr[WS(rs, 13)];
Chris@42 752 T1H = ci[WS(rs, 13)];
Chris@42 753 T1I = FMA(T1E, T1F, T1G * T1H);
Chris@42 754 T2v = FNMS(T1G, T1F, T1E * T1H);
Chris@42 755 T1J = cr[WS(rs, 3)];
Chris@42 756 T1K = ci[WS(rs, 3)];
Chris@42 757 T1L = FMA(Tg, T1J, Ti * T1K);
Chris@42 758 T2w = FNMS(Ti, T1J, Tg * T1K);
Chris@42 759 }
Chris@42 760 T1M = T1I + T1L;
Chris@42 761 T3B = T2v + T2w;
Chris@42 762 T2x = T2v - T2w;
Chris@42 763 T32 = T1I - T1L;
Chris@42 764 }
Chris@42 765 {
Chris@42 766 E T1S, T34, T1V, T35;
Chris@42 767 {
Chris@42 768 E T1P, T1R, T1T, T1U;
Chris@42 769 T1P = cr[WS(rs, 12)];
Chris@42 770 T1R = ci[WS(rs, 12)];
Chris@42 771 T1S = FMA(T1O, T1P, T1Q * T1R);
Chris@42 772 T34 = FNMS(T1Q, T1P, T1O * T1R);
Chris@42 773 T1T = cr[WS(rs, 2)];
Chris@42 774 T1U = ci[WS(rs, 2)];
Chris@42 775 T1V = FMA(T1f, T1T, T1h * T1U);
Chris@42 776 T35 = FNMS(T1h, T1T, T1f * T1U);
Chris@42 777 }
Chris@42 778 T1W = T1S + T1V;
Chris@42 779 T3E = T34 + T35;
Chris@42 780 T2z = T1S - T1V;
Chris@42 781 T36 = T34 - T35;
Chris@42 782 }
Chris@42 783 T1N = T1D - T1M;
Chris@42 784 T28 = T1W - T27;
Chris@42 785 T29 = T1N + T28;
Chris@42 786 T3C = T3A - T3B;
Chris@42 787 T3F = T3D - T3E;
Chris@42 788 T43 = T3F - T3C;
Chris@42 789 T3X = T3A + T3B;
Chris@42 790 T3Y = T3E + T3D;
Chris@42 791 T4o = T3X + T3Y;
Chris@42 792 T2f = T1D + T1M;
Chris@42 793 T2g = T1W + T27;
Chris@42 794 T2h = T2f + T2g;
Chris@42 795 T2y = T2u - T2x;
Chris@42 796 T2D = T2z - T2C;
Chris@42 797 T2E = T2y + T2D;
Chris@42 798 T3g = T31 - T32;
Chris@42 799 T3h = T36 - T37;
Chris@42 800 T4z = T3g + T3h;
Chris@42 801 T3n = T2u + T2x;
Chris@42 802 T3o = T2z + T2C;
Chris@42 803 T3p = T3n + T3o;
Chris@42 804 T33 = T31 + T32;
Chris@42 805 T38 = T36 + T37;
Chris@42 806 T4K = T33 + T38;
Chris@42 807 }
Chris@42 808 {
Chris@42 809 E TO, T3H, T2j, T2Q, T1q, T3L, T2r, T2T, TV, T3I, T2m, T2R, T1b, T3K, T2o;
Chris@42 810 E T2W;
Chris@42 811 {
Chris@42 812 E TI, T2O, TN, T2P;
Chris@42 813 {
Chris@42 814 E TG, TH, TK, TM;
Chris@42 815 TG = cr[WS(rs, 4)];
Chris@42 816 TH = ci[WS(rs, 4)];
Chris@42 817 TI = FMA(Tk, TG, To * TH);
Chris@42 818 T2O = FNMS(To, TG, Tk * TH);
Chris@42 819 TK = cr[WS(rs, 14)];
Chris@42 820 TM = ci[WS(rs, 14)];
Chris@42 821 TN = FMA(TJ, TK, TL * TM);
Chris@42 822 T2P = FNMS(TL, TK, TJ * TM);
Chris@42 823 }
Chris@42 824 TO = TI + TN;
Chris@42 825 T3H = T2O + T2P;
Chris@42 826 T2j = TI - TN;
Chris@42 827 T2Q = T2O - T2P;
Chris@42 828 }
Chris@42 829 {
Chris@42 830 E T1e, T2p, T1p, T2q;
Chris@42 831 {
Chris@42 832 E T1c, T1d, T1k, T1o;
Chris@42 833 T1c = cr[WS(rs, 1)];
Chris@42 834 T1d = ci[WS(rs, 1)];
Chris@42 835 T1e = FMA(T2, T1c, T5 * T1d);
Chris@42 836 T2p = FNMS(T5, T1c, T2 * T1d);
Chris@42 837 T1k = cr[WS(rs, 11)];
Chris@42 838 T1o = ci[WS(rs, 11)];
Chris@42 839 T1p = FMA(T1j, T1k, T1n * T1o);
Chris@42 840 T2q = FNMS(T1n, T1k, T1j * T1o);
Chris@42 841 }
Chris@42 842 T1q = T1e + T1p;
Chris@42 843 T3L = T2p + T2q;
Chris@42 844 T2r = T2p - T2q;
Chris@42 845 T2T = T1p - T1e;
Chris@42 846 }
Chris@42 847 {
Chris@42 848 E TR, T2k, TU, T2l;
Chris@42 849 {
Chris@42 850 E TP, TQ, TS, TT;
Chris@42 851 TP = cr[WS(rs, 9)];
Chris@42 852 TQ = ci[WS(rs, 9)];
Chris@42 853 TR = FMA(T3, TP, T6 * TQ);
Chris@42 854 T2k = FNMS(T6, TP, T3 * TQ);
Chris@42 855 TS = cr[WS(rs, 19)];
Chris@42 856 TT = ci[WS(rs, 19)];
Chris@42 857 TU = FMA(Tx, TS, Ty * TT);
Chris@42 858 T2l = FNMS(Ty, TS, Tx * TT);
Chris@42 859 }
Chris@42 860 TV = TR + TU;
Chris@42 861 T3I = T2k + T2l;
Chris@42 862 T2m = T2k - T2l;
Chris@42 863 T2R = TR - TU;
Chris@42 864 }
Chris@42 865 {
Chris@42 866 E T11, T2U, T1a, T2V;
Chris@42 867 {
Chris@42 868 E TY, T10, T15, T19;
Chris@42 869 TY = cr[WS(rs, 16)];
Chris@42 870 T10 = ci[WS(rs, 16)];
Chris@42 871 T11 = FMA(TX, TY, TZ * T10);
Chris@42 872 T2U = FNMS(TZ, TY, TX * T10);
Chris@42 873 T15 = cr[WS(rs, 6)];
Chris@42 874 T19 = ci[WS(rs, 6)];
Chris@42 875 T1a = FMA(T14, T15, T18 * T19);
Chris@42 876 T2V = FNMS(T18, T15, T14 * T19);
Chris@42 877 }
Chris@42 878 T1b = T11 + T1a;
Chris@42 879 T3K = T2U + T2V;
Chris@42 880 T2o = T11 - T1a;
Chris@42 881 T2W = T2U - T2V;
Chris@42 882 }
Chris@42 883 TW = TO - TV;
Chris@42 884 T1r = T1b - T1q;
Chris@42 885 T1s = TW + T1r;
Chris@42 886 T3J = T3H - T3I;
Chris@42 887 T3M = T3K - T3L;
Chris@42 888 T44 = T3J + T3M;
Chris@42 889 T3U = T3H + T3I;
Chris@42 890 T3V = T3K + T3L;
Chris@42 891 T4n = T3U + T3V;
Chris@42 892 T2c = TO + TV;
Chris@42 893 T2d = T1b + T1q;
Chris@42 894 T2e = T2c + T2d;
Chris@42 895 T2n = T2j - T2m;
Chris@42 896 T2s = T2o - T2r;
Chris@42 897 T2t = T2n + T2s;
Chris@42 898 T3d = T2Q - T2R;
Chris@42 899 T3e = T2W + T2T;
Chris@42 900 T4y = T3d + T3e;
Chris@42 901 T3k = T2j + T2m;
Chris@42 902 T3l = T2o + T2r;
Chris@42 903 T3m = T3k + T3l;
Chris@42 904 T2S = T2Q + T2R;
Chris@42 905 T2X = T2T - T2W;
Chris@42 906 T4J = T2X - T2S;
Chris@42 907 }
Chris@42 908 {
Chris@42 909 E T3y, T2a, T3x, T3O, T3Q, T3G, T3N, T3P, T3z;
Chris@42 910 T3y = KP559016994 * (T1s - T29);
Chris@42 911 T2a = T1s + T29;
Chris@42 912 T3x = FNMS(KP250000000, T2a, TF);
Chris@42 913 T3G = T3C + T3F;
Chris@42 914 T3N = T3J - T3M;
Chris@42 915 T3O = FNMS(KP587785252, T3N, KP951056516 * T3G);
Chris@42 916 T3Q = FMA(KP951056516, T3N, KP587785252 * T3G);
Chris@42 917 ci[WS(rs, 9)] = TF + T2a;
Chris@42 918 T3P = T3y + T3x;
Chris@42 919 ci[WS(rs, 5)] = T3P - T3Q;
Chris@42 920 cr[WS(rs, 6)] = T3P + T3Q;
Chris@42 921 T3z = T3x - T3y;
Chris@42 922 cr[WS(rs, 2)] = T3z - T3O;
Chris@42 923 ci[WS(rs, 1)] = T3z + T3O;
Chris@42 924 }
Chris@42 925 {
Chris@42 926 E T3q, T3s, T3t, T3j, T3w, T3f, T3i, T3v, T3u;
Chris@42 927 T3q = KP559016994 * (T3m - T3p);
Chris@42 928 T3s = T3m + T3p;
Chris@42 929 T3t = FNMS(KP250000000, T3s, T3r);
Chris@42 930 T3f = T3d - T3e;
Chris@42 931 T3i = T3g - T3h;
Chris@42 932 T3j = FMA(KP951056516, T3f, KP587785252 * T3i);
Chris@42 933 T3w = FNMS(KP587785252, T3f, KP951056516 * T3i);
Chris@42 934 cr[WS(rs, 5)] = T3r + T3s;
Chris@42 935 T3v = T3t - T3q;
Chris@42 936 ci[WS(rs, 2)] = T3v - T3w;
Chris@42 937 ci[WS(rs, 6)] = T3w + T3v;
Chris@42 938 T3u = T3q + T3t;
Chris@42 939 cr[WS(rs, 1)] = T3j + T3u;
Chris@42 940 cr[WS(rs, 9)] = T3u - T3j;
Chris@42 941 }
Chris@42 942 {
Chris@42 943 E T3R, T2i, T3S, T40, T42, T3W, T3Z, T41, T3T;
Chris@42 944 T3R = KP559016994 * (T2e - T2h);
Chris@42 945 T2i = T2e + T2h;
Chris@42 946 T3S = FNMS(KP250000000, T2i, T2b);
Chris@42 947 T3W = T3U - T3V;
Chris@42 948 T3Z = T3X - T3Y;
Chris@42 949 T40 = FMA(KP951056516, T3W, KP587785252 * T3Z);
Chris@42 950 T42 = FNMS(KP587785252, T3W, KP951056516 * T3Z);
Chris@42 951 cr[0] = T2b + T2i;
Chris@42 952 T41 = T3S - T3R;
Chris@42 953 ci[WS(rs, 7)] = T41 - T42;
Chris@42 954 cr[WS(rs, 8)] = T41 + T42;
Chris@42 955 T3T = T3R + T3S;
Chris@42 956 cr[WS(rs, 4)] = T3T - T40;
Chris@42 957 ci[WS(rs, 3)] = T3T + T40;
Chris@42 958 }
Chris@42 959 {
Chris@42 960 E T2F, T2L, T2M, T3a, T3b, T2Y, T39, T3c, T2N;
Chris@42 961 T2F = KP559016994 * (T2t - T2E);
Chris@42 962 T2L = T2t + T2E;
Chris@42 963 T2M = FNMS(KP250000000, T2L, T2K);
Chris@42 964 T2Y = T2S + T2X;
Chris@42 965 T39 = T33 - T38;
Chris@42 966 T3a = FMA(KP951056516, T2Y, KP587785252 * T39);
Chris@42 967 T3b = FNMS(KP587785252, T2Y, KP951056516 * T39);
Chris@42 968 ci[WS(rs, 4)] = T2K + T2L;
Chris@42 969 T3c = T2M - T2F;
Chris@42 970 cr[WS(rs, 3)] = T3b + T3c;
Chris@42 971 cr[WS(rs, 7)] = T3c - T3b;
Chris@42 972 T2N = T2F + T2M;
Chris@42 973 ci[0] = T2N - T3a;
Chris@42 974 ci[WS(rs, 8)] = T3a + T2N;
Chris@42 975 }
Chris@42 976 {
Chris@42 977 E T4e, T45, T4f, T4d, T4h, T4b, T4c, T4i, T4g;
Chris@42 978 T4e = KP559016994 * (T44 + T43);
Chris@42 979 T45 = T43 - T44;
Chris@42 980 T4f = FMA(KP250000000, T45, T4a);
Chris@42 981 T4b = T1r - TW;
Chris@42 982 T4c = T1N - T28;
Chris@42 983 T4d = FNMS(KP587785252, T4c, KP951056516 * T4b);
Chris@42 984 T4h = FMA(KP587785252, T4b, KP951056516 * T4c);
Chris@42 985 cr[WS(rs, 10)] = T45 - T4a;
Chris@42 986 T4i = T4f - T4e;
Chris@42 987 cr[WS(rs, 18)] = T4h - T4i;
Chris@42 988 ci[WS(rs, 17)] = T4h + T4i;
Chris@42 989 T4g = T4e + T4f;
Chris@42 990 cr[WS(rs, 14)] = T4d - T4g;
Chris@42 991 ci[WS(rs, 13)] = T4d + T4g;
Chris@42 992 }
Chris@42 993 {
Chris@42 994 E T4A, T4E, T4F, T4x, T4H, T4v, T4w, T4I, T4G;
Chris@42 995 T4A = KP559016994 * (T4y - T4z);
Chris@42 996 T4E = T4y + T4z;
Chris@42 997 T4F = FNMS(KP250000000, T4E, T4D);
Chris@42 998 T4v = T3n - T3o;
Chris@42 999 T4w = T3k - T3l;
Chris@42 1000 T4x = FNMS(KP587785252, T4w, KP951056516 * T4v);
Chris@42 1001 T4H = FMA(KP951056516, T4w, KP587785252 * T4v);
Chris@42 1002 ci[WS(rs, 14)] = T4E + T4D;
Chris@42 1003 T4I = T4A + T4F;
Chris@42 1004 ci[WS(rs, 10)] = T4H + T4I;
Chris@42 1005 ci[WS(rs, 18)] = T4I - T4H;
Chris@42 1006 T4G = T4A - T4F;
Chris@42 1007 cr[WS(rs, 13)] = T4x + T4G;
Chris@42 1008 cr[WS(rs, 17)] = T4G - T4x;
Chris@42 1009 }
Chris@42 1010 {
Chris@42 1011 E T4r, T4p, T4q, T4l, T4t, T4j, T4k, T4u, T4s;
Chris@42 1012 T4r = KP559016994 * (T4n - T4o);
Chris@42 1013 T4p = T4n + T4o;
Chris@42 1014 T4q = FNMS(KP250000000, T4p, T4m);
Chris@42 1015 T4j = T2c - T2d;
Chris@42 1016 T4k = T2f - T2g;
Chris@42 1017 T4l = FNMS(KP951056516, T4k, KP587785252 * T4j);
Chris@42 1018 T4t = FMA(KP951056516, T4j, KP587785252 * T4k);
Chris@42 1019 ci[WS(rs, 19)] = T4p + T4m;
Chris@42 1020 T4u = T4r + T4q;
Chris@42 1021 cr[WS(rs, 16)] = T4t - T4u;
Chris@42 1022 ci[WS(rs, 15)] = T4t + T4u;
Chris@42 1023 T4s = T4q - T4r;
Chris@42 1024 cr[WS(rs, 12)] = T4l - T4s;
Chris@42 1025 ci[WS(rs, 11)] = T4l + T4s;
Chris@42 1026 }
Chris@42 1027 {
Chris@42 1028 E T4Q, T4L, T4R, T4P, T4T, T4N, T4O, T4U, T4S;
Chris@42 1029 T4Q = KP559016994 * (T4J + T4K);
Chris@42 1030 T4L = T4J - T4K;
Chris@42 1031 T4R = FMA(KP250000000, T4L, T4M);
Chris@42 1032 T4N = T2n - T2s;
Chris@42 1033 T4O = T2y - T2D;
Chris@42 1034 T4P = FMA(KP951056516, T4N, KP587785252 * T4O);
Chris@42 1035 T4T = FNMS(KP587785252, T4N, KP951056516 * T4O);
Chris@42 1036 cr[WS(rs, 15)] = T4L - T4M;
Chris@42 1037 T4U = T4Q + T4R;
Chris@42 1038 ci[WS(rs, 12)] = T4T + T4U;
Chris@42 1039 ci[WS(rs, 16)] = T4U - T4T;
Chris@42 1040 T4S = T4Q - T4R;
Chris@42 1041 cr[WS(rs, 11)] = T4P + T4S;
Chris@42 1042 cr[WS(rs, 19)] = T4S - T4P;
Chris@42 1043 }
Chris@42 1044 }
Chris@42 1045 }
Chris@42 1046 }
Chris@42 1047 }
Chris@42 1048
Chris@42 1049 static const tw_instr twinstr[] = { /* Twiddle instruction table referenced by the hc2hc descriptor `desc` below. */
Chris@42 1050 {TW_CEXP, 1, 1}, /* NOTE(review): field meanings are defined by tw_instr elsewhere; last field presumably selects the twiddle exponent -- confirm. */
Chris@42 1051 {TW_CEXP, 1, 3},
Chris@42 1052 {TW_CEXP, 1, 9},
Chris@42 1053 {TW_CEXP, 1, 19}, /* Only four TW_CEXP entries; presumably tied to the -twiddle-log3 generator option -- confirm. */
Chris@42 1054 {TW_NEXT, 1, 0} /* Final entry; presumably marks the end of the instruction list -- confirm. */
Chris@42 1055 };
Chris@42 1056
Chris@42 1057 static const hc2hc_desc desc = { 20, "hf2_20", twinstr, &GENUS, {204, 92, 72, 0} }; /* Descriptor handed to the registration call below: 20 and "hf2_20" match the generator's -n and -name options, twinstr is the table above; the trailing {204, 92, 72, 0} is presumably a set of operation counts -- confirm against the hc2hc_desc definition. */
Chris@42 1058
Chris@42 1059 void X(codelet_hf2_20) (planner *p) { /* Entry point for this codelet: takes the planner `p` and forwards it, with the codelet and its descriptor, to the registration routine. */
Chris@42 1060 X(khc2hc_register) (p, hf2_20, &desc); /* Passes the hf2_20 function and the static descriptor `desc` defined above. */
Chris@42 1061 }
Chris@42 1062 #endif /* HAVE_FMA */