annotate src/fftw-3.3.5/rdft/scalar/r2cb/hb2_20.c @ 83:ae30d91d2ffe

Replace these with versions built using an older toolset (so as to avoid ABI incompatibilities when linking on Ubuntu 14.04 for packaging purposes)
author Chris Cannam
date Fri, 07 Feb 2020 11:51:13 +0000
parents 2cd0e3b3e1fd
children
rev   line source
Chris@42 1 /*
Chris@42 2 * Copyright (c) 2003, 2007-14 Matteo Frigo
Chris@42 3 * Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
Chris@42 4 *
Chris@42 5 * This program is free software; you can redistribute it and/or modify
Chris@42 6 * it under the terms of the GNU General Public License as published by
Chris@42 7 * the Free Software Foundation; either version 2 of the License, or
Chris@42 8 * (at your option) any later version.
Chris@42 9 *
Chris@42 10 * This program is distributed in the hope that it will be useful,
Chris@42 11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
Chris@42 12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
Chris@42 13 * GNU General Public License for more details.
Chris@42 14 *
Chris@42 15 * You should have received a copy of the GNU General Public License
Chris@42 16 * along with this program; if not, write to the Free Software
Chris@42 17 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
Chris@42 18 *
Chris@42 19 */
Chris@42 20
Chris@42 21 /* This file was automatically generated --- DO NOT EDIT */
Chris@42 22 /* Generated on Sat Jul 30 16:50:22 EDT 2016 */
Chris@42 23
Chris@42 24 #include "codelet-rdft.h"
Chris@42 25
Chris@42 26 #ifdef HAVE_FMA
Chris@42 27
Chris@42 28 /* Generated by: ../../../genfft/gen_hc2hc.native -fma -reorder-insns -schedule-for-pipeline -compact -variables 4 -pipeline-latency 4 -sign 1 -twiddle-log3 -precompute-twiddles -n 20 -dif -name hb2_20 -include hb.h */
Chris@42 29
Chris@42 30 /*
Chris@42 31 * This function contains 276 FP additions, 198 FP multiplications,
Chris@42 32 * (or, 136 additions, 58 multiplications, 140 fused multiply/add),
Chris@42 33 * 153 stack variables, 4 constants, and 80 memory accesses
Chris@42 34 */
Chris@42 35 #include "hb.h"
Chris@42 36
/*
 * hb2_20 (HAVE_FMA variant): machine-generated size-20 codelet (see the
 * genfft command line above: -n 20 -dif -sign 1 -twiddle-log3).
 *
 * cr, ci : data arrays; each iteration loads 20 values from cr and 20 from
 *          ci at offsets WS(rs, k), k = 0..19, and stores 20 results back
 *          into each array (the transform is done in place).
 * W      : twiddle table; 8 values W[0..7] are read per iteration and the
 *          pointer advances by 8. It is pre-offset by (mb - 1) * 8.
 * rs     : stride passed to WS() for element addressing.
 * mb, me : loop bounds; m runs over [mb, me).
 * ms     : per-iteration step; cr moves forward by ms, ci moves backward.
 *
 * E, R, INT, DK, WS, FMA, FNMS and MAKE_VOLATILE_STRIDE come from the
 * included project headers and are not visible in this file.
 * NOTE(review): the statement order was scheduled by the generator
 * (-reorder-insns -schedule-for-pipeline); do not reorder by hand.
 */
Chris@42 37 static void hb2_20(R *cr, R *ci, const R *W, stride rs, INT mb, INT me, INT ms)
Chris@42 38 {
Chris@42 39 DK(KP951056516, +0.951056516295153572116439333379382143405698634);
Chris@42 40 DK(KP559016994, +0.559016994374947424102293417182819058860154590);
Chris@42 41 DK(KP618033988, +0.618033988749894848204586834365638117720309180);
Chris@42 42 DK(KP250000000, +0.250000000000000000000000000000000000000000000);
Chris@42 43 {
Chris@42 44 INT m;
Chris@42 45 for (m = mb, W = W + ((mb - 1) * 8); m < me; m = m + 1, cr = cr + ms, ci = ci - ms, W = W + 8, MAKE_VOLATILE_STRIDE(40, rs)) {
Chris@42 46 E T1S, T1O, T1s, TI, T24, T1Y, T2g, T2k, TS, TR, T1I, T26, T1o, T20, T1F;
Chris@42 47 E T25, TT, T1Z;
Chris@42 48 {
Chris@42 49 E TD, TH, TE, T1L, T1N, T1X, TG, T1V, T2Y, T2b, T29, T2s, T36, T3e, T31;
Chris@42 50 E T2o, T3b, T5b, T2c, T2U, T4y, T4u, T2f, T5g, T47, T5p, T4b, T5l;
Chris@42 51 {
Chris@42 52 E T1r, TF, T2T, T1M, T1R, T2X, T2r, T4x;
/* Load the eight twiddle values W[0..7] and form the derived products
 * that are later used as multipliers when the outputs are stored. */
Chris@42 53 TD = W[0];
Chris@42 54 TH = W[3];
Chris@42 55 TE = W[2];
Chris@42 56 T1L = W[6];
Chris@42 57 T1N = W[7];
Chris@42 58 T1r = TD * TH;
Chris@42 59 TF = TD * TE;
Chris@42 60 T2T = TE * T1L;
Chris@42 61 T1M = TD * T1L;
Chris@42 62 T1R = TD * T1N;
Chris@42 63 T2X = TE * T1N;
Chris@42 64 T1X = W[5];
Chris@42 65 TG = W[1];
Chris@42 66 T1V = W[4];
Chris@42 67 T2Y = FNMS(TH, T1L, T2X);
Chris@42 68 T2r = TD * T1X;
Chris@42 69 {
Chris@42 70 E T23, T2n, T1W, T2a;
Chris@42 71 T23 = TE * T1X;
Chris@42 72 T1S = FNMS(TG, T1L, T1R);
Chris@42 73 T1O = FMA(TG, T1N, T1M);
Chris@42 74 T2b = FMA(TG, TE, T1r);
Chris@42 75 T1s = FNMS(TG, TE, T1r);
Chris@42 76 T29 = FNMS(TG, TH, TF);
Chris@42 77 TI = FMA(TG, TH, TF);
Chris@42 78 T2n = TD * T1V;
Chris@42 79 T1W = TE * T1V;
Chris@42 80 T2s = FMA(TG, T1V, T2r);
Chris@42 81 T36 = FNMS(TG, T1V, T2r);
Chris@42 82 T3e = FMA(TH, T1V, T23);
Chris@42 83 T24 = FNMS(TH, T1V, T23);
Chris@42 84 T2a = T29 * T1V;
Chris@42 85 T31 = FMA(TG, T1X, T2n);
Chris@42 86 T2o = FNMS(TG, T1X, T2n);
Chris@42 87 T3b = FNMS(TH, T1X, T1W);
Chris@42 88 T1Y = FMA(TH, T1X, T1W);
Chris@42 89 T5b = FNMS(T2b, T1X, T2a);
Chris@42 90 T2c = FMA(T2b, T1X, T2a);
Chris@42 91 T2U = FMA(TH, T1N, T2T);
Chris@42 92 }
Chris@42 93 T4x = T29 * T1N;
Chris@42 94 {
Chris@42 95 E T4t, T2d, T2j, T2e;
Chris@42 96 T4t = T29 * T1L;
Chris@42 97 T2e = T29 * T1X;
Chris@42 98 T4y = FNMS(T2b, T1L, T4x);
Chris@42 99 T4u = FMA(T2b, T1N, T4t);
Chris@42 100 T2f = FNMS(T2b, T1V, T2e);
Chris@42 101 T5g = FMA(T2b, T1V, T2e);
Chris@42 102 T2d = T2c * T1L;
Chris@42 103 T2j = T2c * T1N;
Chris@42 104 T47 = TI * T1V;
Chris@42 105 T2g = FMA(T2f, T1N, T2d);
Chris@42 106 T2k = FNMS(T2f, T1L, T2j);
Chris@42 107 T5p = TI * T1N;
Chris@42 108 T4b = TI * T1X;
Chris@42 109 T5l = TI * T1L;
Chris@42 110 }
Chris@42 111 }
Chris@42 112 {
Chris@42 113 E T4f, T48, T4c, T4k, T5m, T5q, T3j, T4B, T7, TJ, T4V, T3V, T1z, T2H, T3x;
Chris@42 114 E T42, T18, T3q, T43, T1n, T2D, T53, T52, T2A, T1H, T4R, T4X, T4W, T4O, T1G;
Chris@42 115 E T2O, T3I, T2P, T3P, T2K, T2M, T1C, T1E, TC, T2w, T40, T3Y, T4K, T4I, TQ;
Chris@42 116 {
Chris@42 117 E T1y, T3U, T1v, T3T;
Chris@42 118 {
Chris@42 119 E T3h, T3, T1t, T3i, T6, T1u;
Chris@42 120 {
Chris@42 121 E T1w, T1x, T1, T2, T4, T5;
/* Input loads from cr/ci start here; the remaining twiddle products
 * (T4f, T48, T4c, T4k, T5m, T5q) are interleaved with the first loads. */
Chris@42 122 T1 = cr[0];
Chris@42 123 T2 = ci[WS(rs, 9)];
Chris@42 124 T1w = ci[WS(rs, 14)];
Chris@42 125 T4f = FNMS(T1s, T1X, T47);
Chris@42 126 T48 = FMA(T1s, T1X, T47);
Chris@42 127 T4c = FNMS(T1s, T1V, T4b);
Chris@42 128 T4k = FMA(T1s, T1V, T4b);
Chris@42 129 T5m = FMA(T1s, T1N, T5l);
Chris@42 130 T5q = FNMS(T1s, T1L, T5p);
Chris@42 131 T3h = T1 - T2;
Chris@42 132 T3 = T1 + T2;
Chris@42 133 T1x = cr[WS(rs, 15)];
Chris@42 134 T4 = cr[WS(rs, 5)];
Chris@42 135 T5 = ci[WS(rs, 4)];
Chris@42 136 T1t = ci[WS(rs, 19)];
Chris@42 137 T3i = T1w + T1x;
Chris@42 138 T1y = T1w - T1x;
Chris@42 139 T3U = T4 - T5;
Chris@42 140 T6 = T4 + T5;
Chris@42 141 T1u = cr[WS(rs, 10)];
Chris@42 142 }
Chris@42 143 T3j = T3h + T3i;
Chris@42 144 T4B = T3h - T3i;
Chris@42 145 T7 = T3 + T6;
Chris@42 146 TJ = T3 - T6;
Chris@42 147 T1v = T1t - T1u;
Chris@42 148 T3T = T1t + T1u;
Chris@42 149 }
Chris@42 150 {
Chris@42 151 E T3m, T4C, Te, TK, T4M, T3L, T1f, T2y, TO, TA, T4Q, T3H, T3w, T4G, T2C;
Chris@42 152 E T17, T3p, T4D, Tl, TL, T3O, T4N, T1m, T2z, T3t, T4F, Tt, TN, T3E, T4P;
Chris@42 153 E T10, T2B;
Chris@42 154 {
Chris@42 155 E T3u, T13, T3v, T16;
Chris@42 156 {
Chris@42 157 E T1e, T3K, T1b, T3J;
Chris@42 158 {
Chris@42 159 E T3k, Ta, T19, T3l, Td, T1a;
Chris@42 160 {
Chris@42 161 E T1c, T1d, T8, T9, Tb, Tc;
Chris@42 162 T8 = cr[WS(rs, 4)];
Chris@42 163 T9 = ci[WS(rs, 5)];
Chris@42 164 T4V = T3U + T3T;
Chris@42 165 T3V = T3T - T3U;
Chris@42 166 T1z = T1v - T1y;
Chris@42 167 T2H = T1v + T1y;
Chris@42 168 T3k = T8 - T9;
Chris@42 169 Ta = T8 + T9;
Chris@42 170 T1c = ci[WS(rs, 10)];
Chris@42 171 T1d = cr[WS(rs, 19)];
Chris@42 172 Tb = cr[WS(rs, 9)];
Chris@42 173 Tc = ci[0];
Chris@42 174 T19 = ci[WS(rs, 15)];
Chris@42 175 T3l = T1c + T1d;
Chris@42 176 T1e = T1c - T1d;
Chris@42 177 T3K = Tb - Tc;
Chris@42 178 Td = Tb + Tc;
Chris@42 179 T1a = cr[WS(rs, 14)];
Chris@42 180 }
Chris@42 181 T3m = T3k + T3l;
Chris@42 182 T4C = T3k - T3l;
Chris@42 183 Te = Ta + Td;
Chris@42 184 TK = Ta - Td;
Chris@42 185 T1b = T19 - T1a;
Chris@42 186 T3J = T19 + T1a;
Chris@42 187 }
Chris@42 188 {
Chris@42 189 E Tw, T14, T3F, Tz, T3G, T15;
Chris@42 190 {
Chris@42 191 E Tx, Ty, Tu, Tv, T11, T12;
Chris@42 192 Tu = ci[WS(rs, 7)];
Chris@42 193 Tv = cr[WS(rs, 2)];
Chris@42 194 T4M = T3K + T3J;
Chris@42 195 T3L = T3J - T3K;
Chris@42 196 T1f = T1b - T1e;
Chris@42 197 T2y = T1b + T1e;
Chris@42 198 T3u = Tu - Tv;
Chris@42 199 Tw = Tu + Tv;
Chris@42 200 Tx = ci[WS(rs, 2)];
Chris@42 201 Ty = cr[WS(rs, 7)];
Chris@42 202 T11 = ci[WS(rs, 17)];
Chris@42 203 T12 = cr[WS(rs, 12)];
Chris@42 204 T14 = ci[WS(rs, 12)];
Chris@42 205 T3F = Tx - Ty;
Chris@42 206 Tz = Tx + Ty;
Chris@42 207 T3G = T11 + T12;
Chris@42 208 T13 = T11 - T12;
Chris@42 209 T15 = cr[WS(rs, 17)];
Chris@42 210 }
Chris@42 211 TO = Tw - Tz;
Chris@42 212 TA = Tw + Tz;
Chris@42 213 T4Q = T3F - T3G;
Chris@42 214 T3H = T3F + T3G;
Chris@42 215 T3v = T14 + T15;
Chris@42 216 T16 = T14 - T15;
Chris@42 217 }
Chris@42 218 }
Chris@42 219 {
Chris@42 220 E Ti, T3n, Th, T3o, T1l, Tj, T1g, T1h;
Chris@42 221 {
Chris@42 222 E Tf, Tg, T1j, T1k;
Chris@42 223 Tf = ci[WS(rs, 3)];
Chris@42 224 T3w = T3u - T3v;
Chris@42 225 T4G = T3u + T3v;
Chris@42 226 T2C = T13 + T16;
Chris@42 227 T17 = T13 - T16;
Chris@42 228 Tg = cr[WS(rs, 6)];
Chris@42 229 T1j = ci[WS(rs, 18)];
Chris@42 230 T1k = cr[WS(rs, 11)];
Chris@42 231 Ti = cr[WS(rs, 1)];
Chris@42 232 T3n = Tf - Tg;
Chris@42 233 Th = Tf + Tg;
Chris@42 234 T3o = T1j + T1k;
Chris@42 235 T1l = T1j - T1k;
Chris@42 236 Tj = ci[WS(rs, 8)];
Chris@42 237 T1g = ci[WS(rs, 13)];
Chris@42 238 T1h = cr[WS(rs, 16)];
Chris@42 239 }
Chris@42 240 {
Chris@42 241 E T3M, Tk, T3N, T1i;
Chris@42 242 T3p = T3n + T3o;
Chris@42 243 T4D = T3n - T3o;
Chris@42 244 T3M = Ti - Tj;
Chris@42 245 Tk = Ti + Tj;
Chris@42 246 T3N = T1g + T1h;
Chris@42 247 T1i = T1g - T1h;
Chris@42 248 Tl = Th + Tk;
Chris@42 249 TL = Th - Tk;
Chris@42 250 T3O = T3M + T3N;
Chris@42 251 T4N = T3M - T3N;
Chris@42 252 T1m = T1i - T1l;
Chris@42 253 T2z = T1i + T1l;
Chris@42 254 }
Chris@42 255 }
Chris@42 256 {
Chris@42 257 E Tq, T3r, Tp, T3s, TZ, Tr, TU, TV;
Chris@42 258 {
Chris@42 259 E Tn, To, TX, TY;
Chris@42 260 Tn = cr[WS(rs, 8)];
Chris@42 261 To = ci[WS(rs, 1)];
Chris@42 262 TX = ci[WS(rs, 16)];
Chris@42 263 TY = cr[WS(rs, 13)];
Chris@42 264 Tq = ci[WS(rs, 6)];
Chris@42 265 T3r = Tn - To;
Chris@42 266 Tp = Tn + To;
Chris@42 267 T3s = TX + TY;
Chris@42 268 TZ = TX - TY;
Chris@42 269 Tr = cr[WS(rs, 3)];
Chris@42 270 TU = ci[WS(rs, 11)];
Chris@42 271 TV = cr[WS(rs, 18)];
Chris@42 272 }
Chris@42 273 {
Chris@42 274 E T3D, Ts, T3C, TW;
Chris@42 275 T3t = T3r - T3s;
Chris@42 276 T4F = T3r + T3s;
Chris@42 277 T3D = Tq - Tr;
Chris@42 278 Ts = Tq + Tr;
Chris@42 279 T3C = TU + TV;
Chris@42 280 TW = TU - TV;
Chris@42 281 Tt = Tp + Ts;
Chris@42 282 TN = Tp - Ts;
Chris@42 283 T3E = T3C - T3D;
Chris@42 284 T4P = T3D + T3C;
Chris@42 285 T10 = TW - TZ;
Chris@42 286 T2B = TW + TZ;
Chris@42 287 }
Chris@42 288 }
Chris@42 289 }
Chris@42 290 {
Chris@42 291 E T1B, T1A, T2J, T4H, T4E, T2I, TM, TP;
Chris@42 292 T3x = T3t + T3w;
Chris@42 293 T42 = T3t - T3w;
Chris@42 294 T18 = T10 - T17;
Chris@42 295 T1B = T10 + T17;
Chris@42 296 T3q = T3m + T3p;
Chris@42 297 T43 = T3m - T3p;
Chris@42 298 T1n = T1f - T1m;
Chris@42 299 T1A = T1f + T1m;
Chris@42 300 T2J = T2B + T2C;
Chris@42 301 T2D = T2B - T2C;
Chris@42 302 T53 = T4F - T4G;
Chris@42 303 T4H = T4F + T4G;
Chris@42 304 T4E = T4C + T4D;
Chris@42 305 T52 = T4C - T4D;
Chris@42 306 T2A = T2y - T2z;
Chris@42 307 T2I = T2y + T2z;
Chris@42 308 TM = TK + TL;
Chris@42 309 T1H = TK - TL;
Chris@42 310 T4R = T4P - T4Q;
Chris@42 311 T4X = T4P + T4Q;
Chris@42 312 T4W = T4M + T4N;
Chris@42 313 T4O = T4M - T4N;
Chris@42 314 T1G = TN - TO;
Chris@42 315 TP = TN + TO;
Chris@42 316 {
Chris@42 317 E Tm, T3X, TB, T3W;
Chris@42 318 Tm = Te + Tl;
Chris@42 319 T2O = Te - Tl;
Chris@42 320 T3I = T3E + T3H;
Chris@42 321 T3X = T3E - T3H;
Chris@42 322 TB = Tt + TA;
Chris@42 323 T2P = Tt - TA;
Chris@42 324 T3P = T3L + T3O;
Chris@42 325 T3W = T3L - T3O;
Chris@42 326 T2K = T2I + T2J;
Chris@42 327 T2M = T2I - T2J;
Chris@42 328 T1C = T1A + T1B;
Chris@42 329 T1E = T1A - T1B;
Chris@42 330 TC = Tm + TB;
Chris@42 331 T2w = Tm - TB;
Chris@42 332 T40 = T3W - T3X;
Chris@42 333 T3Y = T3W + T3X;
Chris@42 334 T4K = T4E - T4H;
Chris@42 335 T4I = T4E + T4H;
Chris@42 336 TS = TM - TP;
Chris@42 337 TQ = TM + TP;
Chris@42 338 }
Chris@42 339 }
Chris@42 340 }
Chris@42 341 }
Chris@42 342 {
Chris@42 343 E T3A, T3y, T50, T1D, T2t, T2p, T4J, T5t, T5v, T4Z, T4Y;
/* From here on results are stored back into cr/ci (in place), each
 * non-zero index pair being scaled by the twiddle products formed above. */
Chris@42 344 cr[0] = T7 + TC;
Chris@42 345 T3A = T3q - T3x;
Chris@42 346 T3y = T3q + T3x;
Chris@42 347 T50 = T4W - T4X;
Chris@42 348 T4Y = T4W + T4X;
Chris@42 349 ci[0] = T2H + T2K;
Chris@42 350 T1D = FNMS(KP250000000, T1C, T1z);
Chris@42 351 T2t = T1z + T1C;
Chris@42 352 T2p = TJ + TQ;
Chris@42 353 TR = FNMS(KP250000000, TQ, TJ);
Chris@42 354 T4J = FNMS(KP250000000, T4I, T4B);
Chris@42 355 T5t = T4B + T4I;
Chris@42 356 T5v = T4V + T4Y;
Chris@42 357 T4Z = FNMS(KP250000000, T4Y, T4V);
Chris@42 358 {
Chris@42 359 E T4m, T44, T4i, T4p, T49, T3R, T4j, T4a, T3S, T4l, T41, T4q;
Chris@42 360 {
Chris@42 361 E T3z, T4v, T4w, T3Z, T4z;
Chris@42 362 T3z = FNMS(KP250000000, T3y, T3j);
Chris@42 363 T4v = T3j + T3y;
Chris@42 364 {
Chris@42 365 E T2u, T2q, T5u, T5w;
Chris@42 366 T2u = T2s * T2p;
Chris@42 367 T2q = T2o * T2p;
Chris@42 368 T5u = T2c * T5t;
Chris@42 369 T5w = T2c * T5v;
Chris@42 370 ci[WS(rs, 10)] = FMA(T2o, T2t, T2u);
Chris@42 371 cr[WS(rs, 10)] = FNMS(T2s, T2t, T2q);
Chris@42 372 cr[WS(rs, 5)] = FNMS(T2f, T5v, T5u);
Chris@42 373 ci[WS(rs, 5)] = FMA(T2f, T5t, T5w);
Chris@42 374 T4w = T4u * T4v;
Chris@42 375 }
Chris@42 376 T3Z = FNMS(KP250000000, T3Y, T3V);
Chris@42 377 T4z = T3V + T3Y;
Chris@42 378 {
Chris@42 379 E T3Q, T4h, T4A, T4g, T3B;
Chris@42 380 T3Q = FNMS(KP618033988, T3P, T3I);
Chris@42 381 T4h = FMA(KP618033988, T3I, T3P);
Chris@42 382 cr[WS(rs, 15)] = FNMS(T4y, T4z, T4w);
Chris@42 383 T4A = T4u * T4z;
Chris@42 384 T4m = FMA(KP618033988, T42, T43);
Chris@42 385 T44 = FNMS(KP618033988, T43, T42);
Chris@42 386 T4g = FMA(KP559016994, T3A, T3z);
Chris@42 387 T3B = FNMS(KP559016994, T3A, T3z);
Chris@42 388 ci[WS(rs, 15)] = FMA(T4y, T4v, T4A);
Chris@42 389 T4i = FNMS(KP951056516, T4h, T4g);
Chris@42 390 T4p = FMA(KP951056516, T4h, T4g);
Chris@42 391 T49 = FMA(KP951056516, T3Q, T3B);
Chris@42 392 T3R = FNMS(KP951056516, T3Q, T3B);
Chris@42 393 }
Chris@42 394 T4j = T4f * T4i;
Chris@42 395 T4a = T48 * T49;
Chris@42 396 T3S = TE * T3R;
Chris@42 397 T4l = FMA(KP559016994, T40, T3Z);
Chris@42 398 T41 = FNMS(KP559016994, T40, T3Z);
Chris@42 399 T4q = T1L * T4p;
Chris@42 400 }
Chris@42 401 {
Chris@42 402 E T5d, T4S, T54, T5i, T4L, T5c;
Chris@42 403 T5d = FNMS(KP618033988, T4O, T4R);
Chris@42 404 T4S = FMA(KP618033988, T4R, T4O);
Chris@42 405 {
Chris@42 406 E T4n, T4r, T4d, T45;
Chris@42 407 T4n = FMA(KP951056516, T4m, T4l);
Chris@42 408 T4r = FNMS(KP951056516, T4m, T4l);
Chris@42 409 T4d = FNMS(KP951056516, T44, T41);
Chris@42 410 T45 = FMA(KP951056516, T44, T41);
Chris@42 411 {
Chris@42 412 E T4o, T4s, T4e, T46;
Chris@42 413 T4o = T4f * T4n;
Chris@42 414 cr[WS(rs, 11)] = FNMS(T4k, T4n, T4j);
Chris@42 415 T4s = T1L * T4r;
Chris@42 416 cr[WS(rs, 19)] = FNMS(T1N, T4r, T4q);
Chris@42 417 T4e = T48 * T4d;
Chris@42 418 cr[WS(rs, 7)] = FNMS(T4c, T4d, T4a);
Chris@42 419 T46 = TE * T45;
Chris@42 420 cr[WS(rs, 3)] = FNMS(TH, T45, T3S);
Chris@42 421 ci[WS(rs, 11)] = FMA(T4k, T4i, T4o);
Chris@42 422 ci[WS(rs, 19)] = FMA(T1N, T4p, T4s);
Chris@42 423 ci[WS(rs, 7)] = FMA(T4c, T49, T4e);
Chris@42 424 ci[WS(rs, 3)] = FMA(TH, T3R, T46);
Chris@42 425 }
Chris@42 426 }
Chris@42 427 T54 = FMA(KP618033988, T53, T52);
Chris@42 428 T5i = FNMS(KP618033988, T52, T53);
Chris@42 429 T4L = FMA(KP559016994, T4K, T4J);
Chris@42 430 T5c = FNMS(KP559016994, T4K, T4J);
Chris@42 431 {
Chris@42 432 E T38, T2Q, T33, T2E, T2v, T37, T2N, T5h, T51, T2L, T2x, T32;
Chris@42 433 T38 = FNMS(KP618033988, T2O, T2P);
Chris@42 434 T2Q = FMA(KP618033988, T2P, T2O);
Chris@42 435 T5h = FNMS(KP559016994, T50, T4Z);
Chris@42 436 T51 = FMA(KP559016994, T50, T4Z);
Chris@42 437 {
Chris@42 438 E T5e, T5n, T57, T4T;
Chris@42 439 T5e = FNMS(KP951056516, T5d, T5c);
Chris@42 440 T5n = FMA(KP951056516, T5d, T5c);
Chris@42 441 T57 = FMA(KP951056516, T4S, T4L);
Chris@42 442 T4T = FNMS(KP951056516, T4S, T4L);
Chris@42 443 {
Chris@42 444 E T5j, T5r, T59, T55;
Chris@42 445 T5j = FMA(KP951056516, T5i, T5h);
Chris@42 446 T5r = FNMS(KP951056516, T5i, T5h);
Chris@42 447 T59 = FNMS(KP951056516, T54, T51);
Chris@42 448 T55 = FMA(KP951056516, T54, T51);
Chris@42 449 {
Chris@42 450 E T5f, T5o, T58, T4U;
Chris@42 451 T5f = T5b * T5e;
Chris@42 452 T5o = T5m * T5n;
Chris@42 453 T58 = T1V * T57;
Chris@42 454 T4U = TD * T4T;
Chris@42 455 {
Chris@42 456 E T5k, T5s, T5a, T56;
Chris@42 457 T5k = T5b * T5j;
Chris@42 458 T5s = T5m * T5r;
Chris@42 459 T5a = T1V * T59;
Chris@42 460 T56 = TD * T55;
Chris@42 461 cr[WS(rs, 13)] = FNMS(T5g, T5j, T5f);
Chris@42 462 cr[WS(rs, 17)] = FNMS(T5q, T5r, T5o);
Chris@42 463 cr[WS(rs, 9)] = FNMS(T1X, T59, T58);
Chris@42 464 cr[WS(rs, 1)] = FNMS(TG, T55, T4U);
Chris@42 465 ci[WS(rs, 13)] = FMA(T5g, T5e, T5k);
Chris@42 466 ci[WS(rs, 17)] = FMA(T5q, T5n, T5s);
Chris@42 467 ci[WS(rs, 9)] = FMA(T1X, T57, T5a);
Chris@42 468 ci[WS(rs, 1)] = FMA(TG, T4T, T56);
Chris@42 469 }
Chris@42 470 }
Chris@42 471 }
Chris@42 472 }
Chris@42 473 T2L = FNMS(KP250000000, T2K, T2H);
Chris@42 474 T33 = FNMS(KP618033988, T2A, T2D);
Chris@42 475 T2E = FMA(KP618033988, T2D, T2A);
Chris@42 476 T2v = FNMS(KP250000000, TC, T7);
Chris@42 477 T37 = FNMS(KP559016994, T2M, T2L);
Chris@42 478 T2N = FMA(KP559016994, T2M, T2L);
Chris@42 479 T1I = FNMS(KP618033988, T1H, T1G);
Chris@42 480 T26 = FMA(KP618033988, T1G, T1H);
Chris@42 481 T2x = FMA(KP559016994, T2w, T2v);
Chris@42 482 T32 = FNMS(KP559016994, T2w, T2v);
Chris@42 483 {
Chris@42 484 E T3f, T39, T2R, T2Z;
Chris@42 485 T3f = FNMS(KP951056516, T38, T37);
Chris@42 486 T39 = FMA(KP951056516, T38, T37);
Chris@42 487 T2R = FNMS(KP951056516, T2Q, T2N);
Chris@42 488 T2Z = FMA(KP951056516, T2Q, T2N);
Chris@42 489 {
Chris@42 490 E T3c, T34, T2F, T2V;
Chris@42 491 T3c = FMA(KP951056516, T33, T32);
Chris@42 492 T34 = FNMS(KP951056516, T33, T32);
Chris@42 493 T2F = FMA(KP951056516, T2E, T2x);
Chris@42 494 T2V = FNMS(KP951056516, T2E, T2x);
Chris@42 495 {
Chris@42 496 E T3a, T35, T3g, T3d;
Chris@42 497 T3a = T36 * T34;
Chris@42 498 T35 = T31 * T34;
Chris@42 499 T3g = T3e * T3c;
Chris@42 500 T3d = T3b * T3c;
Chris@42 501 {
Chris@42 502 E T30, T2W, T2S, T2G;
Chris@42 503 T30 = T2Y * T2V;
Chris@42 504 T2W = T2U * T2V;
Chris@42 505 T2S = T2b * T2F;
Chris@42 506 T2G = T29 * T2F;
Chris@42 507 ci[WS(rs, 8)] = FMA(T31, T39, T3a);
Chris@42 508 cr[WS(rs, 8)] = FNMS(T36, T39, T35);
Chris@42 509 ci[WS(rs, 12)] = FMA(T3b, T3f, T3g);
Chris@42 510 cr[WS(rs, 12)] = FNMS(T3e, T3f, T3d);
Chris@42 511 ci[WS(rs, 16)] = FMA(T2U, T2Z, T30);
Chris@42 512 cr[WS(rs, 16)] = FNMS(T2Y, T2Z, T2W);
Chris@42 513 ci[WS(rs, 4)] = FMA(T29, T2R, T2S);
Chris@42 514 cr[WS(rs, 4)] = FNMS(T2b, T2R, T2G);
Chris@42 515 }
Chris@42 516 }
Chris@42 517 }
Chris@42 518 }
Chris@42 519 T1o = FNMS(KP618033988, T1n, T18);
Chris@42 520 T20 = FMA(KP618033988, T18, T1n);
Chris@42 521 T1F = FNMS(KP559016994, T1E, T1D);
Chris@42 522 T25 = FMA(KP559016994, T1E, T1D);
Chris@42 523 }
Chris@42 524 }
Chris@42 525 }
Chris@42 526 }
Chris@42 527 }
Chris@42 528 }
Chris@42 529 TT = FNMS(KP559016994, TS, TR);
Chris@42 530 T1Z = FMA(KP559016994, TS, TR);
Chris@42 531 {
Chris@42 532 E T2l, T27, T1J, T1T;
Chris@42 533 T2l = FNMS(KP951056516, T26, T25);
Chris@42 534 T27 = FMA(KP951056516, T26, T25);
Chris@42 535 T1J = FNMS(KP951056516, T1I, T1F);
Chris@42 536 T1T = FMA(KP951056516, T1I, T1F);
Chris@42 537 {
Chris@42 538 E T2h, T21, T1p, T1P;
Chris@42 539 T2h = FMA(KP951056516, T20, T1Z);
Chris@42 540 T21 = FNMS(KP951056516, T20, T1Z);
Chris@42 541 T1p = FMA(KP951056516, T1o, TT);
Chris@42 542 T1P = FNMS(KP951056516, T1o, TT);
Chris@42 543 {
Chris@42 544 E T28, T22, T2m, T2i;
Chris@42 545 T28 = T24 * T21;
Chris@42 546 T22 = T1Y * T21;
Chris@42 547 T2m = T2k * T2h;
Chris@42 548 T2i = T2g * T2h;
Chris@42 549 {
Chris@42 550 E T1U, T1Q, T1K, T1q;
Chris@42 551 T1U = T1S * T1P;
Chris@42 552 T1Q = T1O * T1P;
Chris@42 553 T1K = T1s * T1p;
Chris@42 554 T1q = TI * T1p;
Chris@42 555 ci[WS(rs, 6)] = FMA(T1Y, T27, T28);
Chris@42 556 cr[WS(rs, 6)] = FNMS(T24, T27, T22);
Chris@42 557 ci[WS(rs, 14)] = FMA(T2g, T2l, T2m);
Chris@42 558 cr[WS(rs, 14)] = FNMS(T2k, T2l, T2i);
Chris@42 559 ci[WS(rs, 18)] = FMA(T1O, T1T, T1U);
Chris@42 560 cr[WS(rs, 18)] = FNMS(T1S, T1T, T1Q);
Chris@42 561 ci[WS(rs, 2)] = FMA(TI, T1J, T1K);
Chris@42 562 cr[WS(rs, 2)] = FNMS(T1s, T1J, T1q);
Chris@42 563 }
Chris@42 564 }
Chris@42 565 }
Chris@42 566 }
Chris@42 567 }
Chris@42 568 }
Chris@42 569 }
Chris@42 570
/*
 * Twiddle instruction table referenced by the codelet descriptor `desc`.
 * It holds four TW_CEXP entries (last field 1, 3, 9, 19) followed by a
 * TW_NEXT terminator entry.
 * NOTE(review): presumably each TW_CEXP entry supplies one complex twiddle
 * (two reals), which would match the 8 values W[0..7] consumed per loop
 * iteration in hb2_20 -- confirm against the tw_instr definition.
 */
Chris@42 571 static const tw_instr twinstr[] = {
Chris@42 572 {TW_CEXP, 1, 1},
Chris@42 573 {TW_CEXP, 1, 3},
Chris@42 574 {TW_CEXP, 1, 9},
Chris@42 575 {TW_CEXP, 1, 19},
Chris@42 576 {TW_NEXT, 1, 0}
Chris@42 577 };
Chris@42 578
/*
 * Codelet descriptor passed to the planner at registration: the value 20
 * (the -n 20 of the generator command line), the codelet name, the twiddle
 * instruction table, the genus, and the operation counts {136, 58, 140, 0},
 * which match the additions / multiplications / fused multiply-adds quoted
 * in the generator comment for the HAVE_FMA variant.
 */
Chris@42 579 static const hc2hc_desc desc = { 20, "hb2_20", twinstr, &GENUS, {136, 58, 140, 0} };
Chris@42 580
/*
 * Registration entry point: hands the hb2_20 kernel and its descriptor to
 * planner `p` through X(khc2hc_register). X() is a project macro applied to
 * both this function's name and the registrar's name; it is defined outside
 * this file.
 */
Chris@42 581 void X(codelet_hb2_20) (planner *p) {
Chris@42 582 X(khc2hc_register) (p, hb2_20, &desc);
Chris@42 583 }
Chris@42 584 #else /* HAVE_FMA */
Chris@42 585
Chris@42 586 /* Generated by: ../../../genfft/gen_hc2hc.native -compact -variables 4 -pipeline-latency 4 -sign 1 -twiddle-log3 -precompute-twiddles -n 20 -dif -name hb2_20 -include hb.h */
Chris@42 587
Chris@42 588 /*
Chris@42 589 * This function contains 276 FP additions, 164 FP multiplications,
Chris@42 590 * (or, 204 additions, 92 multiplications, 72 fused multiply/add),
Chris@42 591 * 137 stack variables, 4 constants, and 80 memory accesses
Chris@42 592 */
Chris@42 593 #include "hb.h"
Chris@42 594
Chris@42 595 static void hb2_20(R *cr, R *ci, const R *W, stride rs, INT mb, INT me, INT ms)
Chris@42 596 {
Chris@42 597 DK(KP250000000, +0.250000000000000000000000000000000000000000000);
Chris@42 598 DK(KP559016994, +0.559016994374947424102293417182819058860154590);
Chris@42 599 DK(KP587785252, +0.587785252292473129168705954639072768597652438);
Chris@42 600 DK(KP951056516, +0.951056516295153572116439333379382143405698634);
Chris@42 601 {
Chris@42 602 INT m;
Chris@42 603 for (m = mb, W = W + ((mb - 1) * 8); m < me; m = m + 1, cr = cr + ms, ci = ci - ms, W = W + 8, MAKE_VOLATILE_STRIDE(40, rs)) {
Chris@42 604 E TD, TG, TE, TH, TJ, T1t, T27, T25, T1T, T1R, T1V, T2j, T2Z, T21, T2X;
Chris@42 605 E T2T, T2n, T2P, T3V, T41, T3R, T3X, T29, T2c, T4H, T4L, T1L, T1M, T1N, T2d;
Chris@42 606 E T4R, T1P, T4P, T49, T2N, T2f, T47, T2L;
Chris@42 607 {
Chris@42 608 E T1U, T2l, T1Z, T2i, T1S, T2m, T20, T2h;
Chris@42 609 {
Chris@42 610 E TF, T1s, TI, T1r;
Chris@42 611 TD = W[0];
Chris@42 612 TG = W[1];
Chris@42 613 TE = W[2];
Chris@42 614 TH = W[3];
Chris@42 615 TF = TD * TE;
Chris@42 616 T1s = TG * TE;
Chris@42 617 TI = TG * TH;
Chris@42 618 T1r = TD * TH;
Chris@42 619 TJ = TF + TI;
Chris@42 620 T1t = T1r - T1s;
Chris@42 621 T27 = T1r + T1s;
Chris@42 622 T25 = TF - TI;
Chris@42 623 T1T = W[5];
Chris@42 624 T1U = TH * T1T;
Chris@42 625 T2l = TD * T1T;
Chris@42 626 T1Z = TE * T1T;
Chris@42 627 T2i = TG * T1T;
Chris@42 628 T1R = W[4];
Chris@42 629 T1S = TE * T1R;
Chris@42 630 T2m = TG * T1R;
Chris@42 631 T20 = TH * T1R;
Chris@42 632 T2h = TD * T1R;
Chris@42 633 }
Chris@42 634 T1V = T1S + T1U;
Chris@42 635 T2j = T2h - T2i;
Chris@42 636 T2Z = T1Z + T20;
Chris@42 637 T21 = T1Z - T20;
Chris@42 638 T2X = T1S - T1U;
Chris@42 639 T2T = T2l - T2m;
Chris@42 640 T2n = T2l + T2m;
Chris@42 641 T2P = T2h + T2i;
Chris@42 642 {
Chris@42 643 E T3T, T3U, T3P, T3Q;
Chris@42 644 T3T = TJ * T1T;
Chris@42 645 T3U = T1t * T1R;
Chris@42 646 T3V = T3T - T3U;
Chris@42 647 T41 = T3T + T3U;
Chris@42 648 T3P = TJ * T1R;
Chris@42 649 T3Q = T1t * T1T;
Chris@42 650 T3R = T3P + T3Q;
Chris@42 651 T3X = T3P - T3Q;
Chris@42 652 {
Chris@42 653 E T26, T28, T2a, T2b;
Chris@42 654 T26 = T25 * T1R;
Chris@42 655 T28 = T27 * T1T;
Chris@42 656 T29 = T26 + T28;
Chris@42 657 T2a = T25 * T1T;
Chris@42 658 T2b = T27 * T1R;
Chris@42 659 T2c = T2a - T2b;
Chris@42 660 T4H = T26 - T28;
Chris@42 661 T4L = T2a + T2b;
Chris@42 662 T1L = W[6];
Chris@42 663 T1M = W[7];
Chris@42 664 T1N = FMA(TD, T1L, TG * T1M);
Chris@42 665 T2d = FMA(T29, T1L, T2c * T1M);
Chris@42 666 T4R = FNMS(T1t, T1L, TJ * T1M);
Chris@42 667 T1P = FNMS(TG, T1L, TD * T1M);
Chris@42 668 T4P = FMA(TJ, T1L, T1t * T1M);
Chris@42 669 T49 = FNMS(T27, T1L, T25 * T1M);
Chris@42 670 T2N = FNMS(TH, T1L, TE * T1M);
Chris@42 671 T2f = FNMS(T2c, T1L, T29 * T1M);
Chris@42 672 T47 = FMA(T25, T1L, T27 * T1M);
Chris@42 673 T2L = FMA(TE, T1L, TH * T1M);
Chris@42 674 }
Chris@42 675 }
Chris@42 676 }
Chris@42 677 {
Chris@42 678 E T7, T4i, T4x, TK, T1D, T3i, T3E, T2D, T19, T3L, T3M, T1o, T2x, T4C, T4B;
Chris@42 679 E T2u, T1v, T4r, T4o, T1u, T2H, T37, T2I, T3e, T3p, T3w, T3x, Tm, TB, TC;
Chris@42 680 E T4u, T4v, T4y, T2A, T2B, T2E, T1E, T1F, T1G, T4d, T4g, T4j, T3F, T3G, T3H;
Chris@42 681 E TN, TQ, TR, T48, T4a;
Chris@42 682 {
Chris@42 683 E T3, T3g, T1C, T3h, T6, T3D, T1z, T3C;
Chris@42 684 {
Chris@42 685 E T1, T2, T1A, T1B;
Chris@42 686 T1 = cr[0];
Chris@42 687 T2 = ci[WS(rs, 9)];
Chris@42 688 T3 = T1 + T2;
Chris@42 689 T3g = T1 - T2;
Chris@42 690 T1A = ci[WS(rs, 14)];
Chris@42 691 T1B = cr[WS(rs, 15)];
Chris@42 692 T1C = T1A - T1B;
Chris@42 693 T3h = T1A + T1B;
Chris@42 694 }
Chris@42 695 {
Chris@42 696 E T4, T5, T1x, T1y;
Chris@42 697 T4 = cr[WS(rs, 5)];
Chris@42 698 T5 = ci[WS(rs, 4)];
Chris@42 699 T6 = T4 + T5;
Chris@42 700 T3D = T4 - T5;
Chris@42 701 T1x = ci[WS(rs, 19)];
Chris@42 702 T1y = cr[WS(rs, 10)];
Chris@42 703 T1z = T1x - T1y;
Chris@42 704 T3C = T1x + T1y;
Chris@42 705 }
Chris@42 706 T7 = T3 + T6;
Chris@42 707 T4i = T3g - T3h;
Chris@42 708 T4x = T3D + T3C;
Chris@42 709 TK = T3 - T6;
Chris@42 710 T1D = T1z - T1C;
Chris@42 711 T3i = T3g + T3h;
Chris@42 712 T3E = T3C - T3D;
Chris@42 713 T2D = T1z + T1C;
Chris@42 714 }
Chris@42 715 {
Chris@42 716 E Te, T4b, T4m, TL, T11, T33, T3l, T2s, TA, T4f, T4q, TP, T1n, T3d, T3v;
Chris@42 717 E T2w, Tl, T4c, T4n, TM, T18, T36, T3o, T2t, Tt, T4e, T4p, TO, T1g, T3a;
Chris@42 718 E T3s, T2v;
Chris@42 719 {
Chris@42 720 E Ta, T3j, T10, T3k, Td, T32, TX, T31;
Chris@42 721 {
Chris@42 722 E T8, T9, TY, TZ;
Chris@42 723 T8 = cr[WS(rs, 4)];
Chris@42 724 T9 = ci[WS(rs, 5)];
Chris@42 725 Ta = T8 + T9;
Chris@42 726 T3j = T8 - T9;
Chris@42 727 TY = ci[WS(rs, 10)];
Chris@42 728 TZ = cr[WS(rs, 19)];
Chris@42 729 T10 = TY - TZ;
Chris@42 730 T3k = TY + TZ;
Chris@42 731 }
Chris@42 732 {
Chris@42 733 E Tb, Tc, TV, TW;
Chris@42 734 Tb = cr[WS(rs, 9)];
Chris@42 735 Tc = ci[0];
Chris@42 736 Td = Tb + Tc;
Chris@42 737 T32 = Tb - Tc;
Chris@42 738 TV = ci[WS(rs, 15)];
Chris@42 739 TW = cr[WS(rs, 14)];
Chris@42 740 TX = TV - TW;
Chris@42 741 T31 = TV + TW;
Chris@42 742 }
Chris@42 743 Te = Ta + Td;
Chris@42 744 T4b = T3j - T3k;
Chris@42 745 T4m = T32 + T31;
Chris@42 746 TL = Ta - Td;
Chris@42 747 T11 = TX - T10;
Chris@42 748 T33 = T31 - T32;
Chris@42 749 T3l = T3j + T3k;
Chris@42 750 T2s = TX + T10;
Chris@42 751 }
Chris@42 752 {
Chris@42 753 E Tw, T3t, Tz, T3b, T1j, T3c, T1m, T3u;
Chris@42 754 {
Chris@42 755 E Tu, Tv, Tx, Ty;
Chris@42 756 Tu = ci[WS(rs, 7)];
Chris@42 757 Tv = cr[WS(rs, 2)];
Chris@42 758 Tw = Tu + Tv;
Chris@42 759 T3t = Tu - Tv;
Chris@42 760 Tx = ci[WS(rs, 2)];
Chris@42 761 Ty = cr[WS(rs, 7)];
Chris@42 762 Tz = Tx + Ty;
Chris@42 763 T3b = Tx - Ty;
Chris@42 764 }
Chris@42 765 {
Chris@42 766 E T1h, T1i, T1k, T1l;
Chris@42 767 T1h = ci[WS(rs, 17)];
Chris@42 768 T1i = cr[WS(rs, 12)];
Chris@42 769 T1j = T1h - T1i;
Chris@42 770 T3c = T1h + T1i;
Chris@42 771 T1k = ci[WS(rs, 12)];
Chris@42 772 T1l = cr[WS(rs, 17)];
Chris@42 773 T1m = T1k - T1l;
Chris@42 774 T3u = T1k + T1l;
Chris@42 775 }
Chris@42 776 TA = Tw + Tz;
Chris@42 777 T4f = T3t + T3u;
Chris@42 778 T4q = T3b - T3c;
Chris@42 779 TP = Tw - Tz;
Chris@42 780 T1n = T1j - T1m;
Chris@42 781 T3d = T3b + T3c;
Chris@42 782 T3v = T3t - T3u;
Chris@42 783 T2w = T1j + T1m;
Chris@42 784 }
Chris@42 785 {
Chris@42 786 E Th, T3m, T17, T3n, Tk, T34, T14, T35;
Chris@42 787 {
Chris@42 788 E Tf, Tg, T15, T16;
Chris@42 789 Tf = ci[WS(rs, 3)];
Chris@42 790 Tg = cr[WS(rs, 6)];
Chris@42 791 Th = Tf + Tg;
Chris@42 792 T3m = Tf - Tg;
Chris@42 793 T15 = ci[WS(rs, 18)];
Chris@42 794 T16 = cr[WS(rs, 11)];
Chris@42 795 T17 = T15 - T16;
Chris@42 796 T3n = T15 + T16;
Chris@42 797 }
Chris@42 798 {
Chris@42 799 E Ti, Tj, T12, T13;
Chris@42 800 Ti = cr[WS(rs, 1)];
Chris@42 801 Tj = ci[WS(rs, 8)];
Chris@42 802 Tk = Ti + Tj;
Chris@42 803 T34 = Ti - Tj;
Chris@42 804 T12 = ci[WS(rs, 13)];
Chris@42 805 T13 = cr[WS(rs, 16)];
Chris@42 806 T14 = T12 - T13;
Chris@42 807 T35 = T12 + T13;
Chris@42 808 }
Chris@42 809 Tl = Th + Tk;
Chris@42 810 T4c = T3m - T3n;
Chris@42 811 T4n = T34 - T35;
Chris@42 812 TM = Th - Tk;
Chris@42 813 T18 = T14 - T17;
Chris@42 814 T36 = T34 + T35;
Chris@42 815 T3o = T3m + T3n;
Chris@42 816 T2t = T14 + T17;
Chris@42 817 }
Chris@42 818 {
Chris@42 819 E Tp, T3q, T1f, T3r, Ts, T39, T1c, T38;
Chris@42 820 {
Chris@42 821 E Tn, To, T1d, T1e;
Chris@42 822 Tn = cr[WS(rs, 8)];
Chris@42 823 To = ci[WS(rs, 1)];
Chris@42 824 Tp = Tn + To;
Chris@42 825 T3q = Tn - To;
Chris@42 826 T1d = ci[WS(rs, 16)];
Chris@42 827 T1e = cr[WS(rs, 13)];
Chris@42 828 T1f = T1d - T1e;
Chris@42 829 T3r = T1d + T1e;
Chris@42 830 }
Chris@42 831 {
Chris@42 832 E Tq, Tr, T1a, T1b;
Chris@42 833 Tq = ci[WS(rs, 6)];
Chris@42 834 Tr = cr[WS(rs, 3)];
Chris@42 835 Ts = Tq + Tr;
Chris@42 836 T39 = Tq - Tr;
Chris@42 837 T1a = ci[WS(rs, 11)];
Chris@42 838 T1b = cr[WS(rs, 18)];
Chris@42 839 T1c = T1a - T1b;
Chris@42 840 T38 = T1a + T1b;
Chris@42 841 }
Chris@42 842 Tt = Tp + Ts;
Chris@42 843 T4e = T3q + T3r;
Chris@42 844 T4p = T39 + T38;
Chris@42 845 TO = Tp - Ts;
Chris@42 846 T1g = T1c - T1f;
Chris@42 847 T3a = T38 - T39;
Chris@42 848 T3s = T3q - T3r;
Chris@42 849 T2v = T1c + T1f;
Chris@42 850 }
Chris@42 851 T19 = T11 - T18;
Chris@42 852 T3L = T3l - T3o;
Chris@42 853 T3M = T3s - T3v;
Chris@42 854 T1o = T1g - T1n;
Chris@42 855 T2x = T2v - T2w;
Chris@42 856 T4C = T4e - T4f;
Chris@42 857 T4B = T4b - T4c;
Chris@42 858 T2u = T2s - T2t;
Chris@42 859 T1v = TO - TP;
Chris@42 860 T4r = T4p - T4q;
Chris@42 861 T4o = T4m - T4n;
Chris@42 862 T1u = TL - TM;
Chris@42 863 T2H = Te - Tl;
Chris@42 864 T37 = T33 + T36;
Chris@42 865 T2I = Tt - TA;
Chris@42 866 T3e = T3a + T3d;
Chris@42 867 T3p = T3l + T3o;
Chris@42 868 T3w = T3s + T3v;
Chris@42 869 T3x = T3p + T3w;
Chris@42 870 Tm = Te + Tl;
Chris@42 871 TB = Tt + TA;
Chris@42 872 TC = Tm + TB;
Chris@42 873 T4u = T4m + T4n;
Chris@42 874 T4v = T4p + T4q;
Chris@42 875 T4y = T4u + T4v;
Chris@42 876 T2A = T2s + T2t;
Chris@42 877 T2B = T2v + T2w;
Chris@42 878 T2E = T2A + T2B;
Chris@42 879 T1E = T11 + T18;
Chris@42 880 T1F = T1g + T1n;
Chris@42 881 T1G = T1E + T1F;
Chris@42 882 T4d = T4b + T4c;
Chris@42 883 T4g = T4e + T4f;
Chris@42 884 T4j = T4d + T4g;
Chris@42 885 T3F = T33 - T36;
Chris@42 886 T3G = T3a - T3d;
Chris@42 887 T3H = T3F + T3G;
Chris@42 888 TN = TL + TM;
Chris@42 889 TQ = TO + TP;
Chris@42 890 TR = TN + TQ;
Chris@42 891 }
Chris@42 892 cr[0] = T7 + TC;
Chris@42 893 ci[0] = T2D + T2E;
Chris@42 894 {
Chris@42 895 E T2k, T2o, T4T, T4U;
Chris@42 896 T2k = TK + TR;
Chris@42 897 T2o = T1D + T1G;
Chris@42 898 cr[WS(rs, 10)] = FNMS(T2n, T2o, T2j * T2k);
Chris@42 899 ci[WS(rs, 10)] = FMA(T2n, T2k, T2j * T2o);
Chris@42 900 T4T = T4i + T4j;
Chris@42 901 T4U = T4x + T4y;
Chris@42 902 cr[WS(rs, 5)] = FNMS(T2c, T4U, T29 * T4T);
Chris@42 903 ci[WS(rs, 5)] = FMA(T29, T4U, T2c * T4T);
Chris@42 904 }
Chris@42 905 T48 = T3i + T3x;
Chris@42 906 T4a = T3E + T3H;
Chris@42 907 cr[WS(rs, 15)] = FNMS(T49, T4a, T47 * T48);
Chris@42 908 ci[WS(rs, 15)] = FMA(T47, T4a, T49 * T48);
Chris@42 909 {
Chris@42 910 E T2y, T2J, T2V, T2R, T2G, T2U, T2r, T2Q;
Chris@42 911 T2y = FMA(KP951056516, T2u, KP587785252 * T2x);
Chris@42 912 T2J = FMA(KP951056516, T2H, KP587785252 * T2I);
Chris@42 913 T2V = FNMS(KP951056516, T2I, KP587785252 * T2H);
Chris@42 914 T2R = FNMS(KP951056516, T2x, KP587785252 * T2u);
Chris@42 915 {
Chris@42 916 E T2C, T2F, T2p, T2q;
Chris@42 917 T2C = KP559016994 * (T2A - T2B);
Chris@42 918 T2F = FNMS(KP250000000, T2E, T2D);
Chris@42 919 T2G = T2C + T2F;
Chris@42 920 T2U = T2F - T2C;
Chris@42 921 T2p = KP559016994 * (Tm - TB);
Chris@42 922 T2q = FNMS(KP250000000, TC, T7);
Chris@42 923 T2r = T2p + T2q;
Chris@42 924 T2Q = T2q - T2p;
Chris@42 925 }
Chris@42 926 {
Chris@42 927 E T2z, T2K, T2Y, T30;
Chris@42 928 T2z = T2r + T2y;
Chris@42 929 T2K = T2G - T2J;
Chris@42 930 cr[WS(rs, 4)] = FNMS(T27, T2K, T25 * T2z);
Chris@42 931 ci[WS(rs, 4)] = FMA(T27, T2z, T25 * T2K);
Chris@42 932 T2Y = T2Q - T2R;
Chris@42 933 T30 = T2V + T2U;
Chris@42 934 cr[WS(rs, 12)] = FNMS(T2Z, T30, T2X * T2Y);
Chris@42 935 ci[WS(rs, 12)] = FMA(T2Z, T2Y, T2X * T30);
Chris@42 936 }
Chris@42 937 {
Chris@42 938 E T2M, T2O, T2S, T2W;
Chris@42 939 T2M = T2r - T2y;
Chris@42 940 T2O = T2J + T2G;
Chris@42 941 cr[WS(rs, 16)] = FNMS(T2N, T2O, T2L * T2M);
Chris@42 942 ci[WS(rs, 16)] = FMA(T2N, T2M, T2L * T2O);
Chris@42 943 T2S = T2Q + T2R;
Chris@42 944 T2W = T2U - T2V;
Chris@42 945 cr[WS(rs, 8)] = FNMS(T2T, T2W, T2P * T2S);
Chris@42 946 ci[WS(rs, 8)] = FMA(T2T, T2S, T2P * T2W);
Chris@42 947 }
Chris@42 948 }
Chris@42 949 {
Chris@42 950 E T4s, T4D, T4N, T4I, T4A, T4M, T4l, T4J;
Chris@42 951 T4s = FMA(KP951056516, T4o, KP587785252 * T4r);
Chris@42 952 T4D = FMA(KP951056516, T4B, KP587785252 * T4C);
Chris@42 953 T4N = FNMS(KP951056516, T4C, KP587785252 * T4B);
Chris@42 954 T4I = FNMS(KP951056516, T4r, KP587785252 * T4o);
Chris@42 955 {
Chris@42 956 E T4w, T4z, T4h, T4k;
Chris@42 957 T4w = KP559016994 * (T4u - T4v);
Chris@42 958 T4z = FNMS(KP250000000, T4y, T4x);
Chris@42 959 T4A = T4w + T4z;
Chris@42 960 T4M = T4z - T4w;
Chris@42 961 T4h = KP559016994 * (T4d - T4g);
Chris@42 962 T4k = FNMS(KP250000000, T4j, T4i);
Chris@42 963 T4l = T4h + T4k;
Chris@42 964 T4J = T4k - T4h;
Chris@42 965 }
Chris@42 966 {
Chris@42 967 E T4t, T4E, T4Q, T4S;
Chris@42 968 T4t = T4l - T4s;
Chris@42 969 T4E = T4A + T4D;
Chris@42 970 cr[WS(rs, 1)] = FNMS(TG, T4E, TD * T4t);
Chris@42 971 ci[WS(rs, 1)] = FMA(TD, T4E, TG * T4t);
Chris@42 972 T4Q = T4J - T4I;
Chris@42 973 T4S = T4M + T4N;
Chris@42 974 cr[WS(rs, 17)] = FNMS(T4R, T4S, T4P * T4Q);
Chris@42 975 ci[WS(rs, 17)] = FMA(T4P, T4S, T4R * T4Q);
Chris@42 976 }
Chris@42 977 {
Chris@42 978 E T4F, T4G, T4K, T4O;
Chris@42 979 T4F = T4s + T4l;
Chris@42 980 T4G = T4A - T4D;
Chris@42 981 cr[WS(rs, 9)] = FNMS(T1T, T4G, T1R * T4F);
Chris@42 982 ci[WS(rs, 9)] = FMA(T1R, T4G, T1T * T4F);
Chris@42 983 T4K = T4I + T4J;
Chris@42 984 T4O = T4M - T4N;
Chris@42 985 cr[WS(rs, 13)] = FNMS(T4L, T4O, T4H * T4K);
Chris@42 986 ci[WS(rs, 13)] = FMA(T4H, T4O, T4L * T4K);
Chris@42 987 }
Chris@42 988 }
Chris@42 989 {
Chris@42 990 E T1p, T1w, T22, T1X, T1J, T23, TU, T1W;
Chris@42 991 T1p = FNMS(KP951056516, T1o, KP587785252 * T19);
Chris@42 992 T1w = FNMS(KP951056516, T1v, KP587785252 * T1u);
Chris@42 993 T22 = FMA(KP951056516, T1u, KP587785252 * T1v);
Chris@42 994 T1X = FMA(KP951056516, T19, KP587785252 * T1o);
Chris@42 995 {
Chris@42 996 E T1H, T1I, TS, TT;
Chris@42 997 T1H = FNMS(KP250000000, T1G, T1D);
Chris@42 998 T1I = KP559016994 * (T1E - T1F);
Chris@42 999 T1J = T1H - T1I;
Chris@42 1000 T23 = T1I + T1H;
Chris@42 1001 TS = FNMS(KP250000000, TR, TK);
Chris@42 1002 TT = KP559016994 * (TN - TQ);
Chris@42 1003 TU = TS - TT;
Chris@42 1004 T1W = TT + TS;
Chris@42 1005 }
Chris@42 1006 {
Chris@42 1007 E T1q, T1K, T2e, T2g;
Chris@42 1008 T1q = TU - T1p;
Chris@42 1009 T1K = T1w + T1J;
Chris@42 1010 cr[WS(rs, 2)] = FNMS(T1t, T1K, TJ * T1q);
Chris@42 1011 ci[WS(rs, 2)] = FMA(T1t, T1q, TJ * T1K);
Chris@42 1012 T2e = T1W + T1X;
Chris@42 1013 T2g = T23 - T22;
Chris@42 1014 cr[WS(rs, 14)] = FNMS(T2f, T2g, T2d * T2e);
Chris@42 1015 ci[WS(rs, 14)] = FMA(T2f, T2e, T2d * T2g);
Chris@42 1016 }
Chris@42 1017 {
Chris@42 1018 E T1O, T1Q, T1Y, T24;
Chris@42 1019 T1O = TU + T1p;
Chris@42 1020 T1Q = T1J - T1w;
Chris@42 1021 cr[WS(rs, 18)] = FNMS(T1P, T1Q, T1N * T1O);
Chris@42 1022 ci[WS(rs, 18)] = FMA(T1P, T1O, T1N * T1Q);
Chris@42 1023 T1Y = T1W - T1X;
Chris@42 1024 T24 = T22 + T23;
Chris@42 1025 cr[WS(rs, 6)] = FNMS(T21, T24, T1V * T1Y);
Chris@42 1026 ci[WS(rs, 6)] = FMA(T21, T1Y, T1V * T24);
Chris@42 1027 }
Chris@42 1028 }
Chris@42 1029 {
Chris@42 1030 E T3f, T3N, T43, T3Z, T3K, T42, T3A, T3Y;
Chris@42 1031 T3f = FNMS(KP951056516, T3e, KP587785252 * T37);
Chris@42 1032 T3N = FNMS(KP951056516, T3M, KP587785252 * T3L);
Chris@42 1033 T43 = FMA(KP951056516, T3L, KP587785252 * T3M);
Chris@42 1034 T3Z = FMA(KP951056516, T37, KP587785252 * T3e);
Chris@42 1035 {
Chris@42 1036 E T3I, T3J, T3y, T3z;
Chris@42 1037 T3I = FNMS(KP250000000, T3H, T3E);
Chris@42 1038 T3J = KP559016994 * (T3F - T3G);
Chris@42 1039 T3K = T3I - T3J;
Chris@42 1040 T42 = T3J + T3I;
Chris@42 1041 T3y = FNMS(KP250000000, T3x, T3i);
Chris@42 1042 T3z = KP559016994 * (T3p - T3w);
Chris@42 1043 T3A = T3y - T3z;
Chris@42 1044 T3Y = T3z + T3y;
Chris@42 1045 }
Chris@42 1046 {
Chris@42 1047 E T3B, T3O, T45, T46;
Chris@42 1048 T3B = T3f + T3A;
Chris@42 1049 T3O = T3K - T3N;
Chris@42 1050 cr[WS(rs, 3)] = FNMS(TH, T3O, TE * T3B);
Chris@42 1051 ci[WS(rs, 3)] = FMA(TE, T3O, TH * T3B);
Chris@42 1052 T45 = T3Z + T3Y;
Chris@42 1053 T46 = T42 - T43;
Chris@42 1054 cr[WS(rs, 19)] = FNMS(T1M, T46, T1L * T45);
Chris@42 1055 ci[WS(rs, 19)] = FMA(T1L, T46, T1M * T45);
Chris@42 1056 }
Chris@42 1057 {
Chris@42 1058 E T3S, T3W, T40, T44;
Chris@42 1059 T3S = T3A - T3f;
Chris@42 1060 T3W = T3K + T3N;
Chris@42 1061 cr[WS(rs, 7)] = FNMS(T3V, T3W, T3R * T3S);
Chris@42 1062 ci[WS(rs, 7)] = FMA(T3R, T3W, T3V * T3S);
Chris@42 1063 T40 = T3Y - T3Z;
Chris@42 1064 T44 = T42 + T43;
Chris@42 1065 cr[WS(rs, 11)] = FNMS(T41, T44, T3X * T40);
Chris@42 1066 ci[WS(rs, 11)] = FMA(T3X, T44, T41 * T40);
Chris@42 1067 }
Chris@42 1068 }
Chris@42 1069 }
Chris@42 1070 }
Chris@42 1071 }
Chris@42 1072 }
Chris@42 1073
/*
 * Twiddle instruction table for hb2_20; the descriptor `desc` defined
 * after it stores a pointer to this array.
 *
 * The table holds four TW_CEXP entries whose last field is 1, 3, 9 and 19
 * respectively, followed by a single TW_NEXT entry.
 *
 * NOTE(review): the meaning of the three fields is fixed by the tw_instr
 * declaration in the project headers, which is not visible here.
 * Presumably the last field of each TW_CEXP entry is a twiddle exponent
 * and TW_NEXT marks the end of the list -- confirm against that header.
 */
Chris@42 1074 static const tw_instr twinstr[] = {
Chris@42 1075 {TW_CEXP, 1, 1},
Chris@42 1076 {TW_CEXP, 1, 3},
Chris@42 1077 {TW_CEXP, 1, 9},
Chris@42 1078 {TW_CEXP, 1, 19},
Chris@42 1079 {TW_NEXT, 1, 0}
Chris@42 1080 };
Chris@42 1081
/*
 * Descriptor for the hb2_20 codelet, passed by address to the registration
 * call in X(codelet_hb2_20).  It bundles, in order: the value 20, the name
 * string "hb2_20", the twinstr table, the address of GENUS, and the
 * four-number initializer {204, 92, 72, 0}.
 *
 * NOTE(review): field semantics come from the hc2hc_desc declaration, which
 * is not visible here.  Presumably 20 is the transform radix and the four
 * numbers are operation counts (adds, multiplies, fused multiply-adds,
 * other) -- confirm against the project headers.
 */
Chris@42 1082 static const hc2hc_desc desc = { 20, "hb2_20", twinstr, &GENUS, {204, 92, 72, 0} };
Chris@42 1083
/*
 * Registration entry point for this codelet.
 *
 * Makes a single call to X(khc2hc_register), handing it the planner `p`,
 * the hb2_20 routine and the address of the static descriptor `desc`.
 * Returns nothing.
 *
 * NOTE(review): X() is a project macro not defined in this file; presumably
 * it applies the library's symbol-name prefix -- confirm in the headers.
 */
Chris@42 1084 void X(codelet_hb2_20) (planner *p) {
Chris@42 1085 X(khc2hc_register) (p, hb2_20, &desc);
Chris@42 1086 }
Chris@42 1087 #endif /* HAVE_FMA */