annotate src/fftw-3.3.5/rdft/scalar/r2cb/hb_20.c @ 83:ae30d91d2ffe

Replace these with versions built using an older toolset (so as to avoid ABI incompatibilities when linking on Ubuntu 14.04 for packaging purposes)
author Chris Cannam
date Fri, 07 Feb 2020 11:51:13 +0000
parents 2cd0e3b3e1fd
children
rev   line source
Chris@42 1 /*
Chris@42 2 * Copyright (c) 2003, 2007-14 Matteo Frigo
Chris@42 3 * Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
Chris@42 4 *
Chris@42 5 * This program is free software; you can redistribute it and/or modify
Chris@42 6 * it under the terms of the GNU General Public License as published by
Chris@42 7 * the Free Software Foundation; either version 2 of the License, or
Chris@42 8 * (at your option) any later version.
Chris@42 9 *
Chris@42 10 * This program is distributed in the hope that it will be useful,
Chris@42 11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
Chris@42 12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
Chris@42 13 * GNU General Public License for more details.
Chris@42 14 *
Chris@42 15 * You should have received a copy of the GNU General Public License
Chris@42 16 * along with this program; if not, write to the Free Software
Chris@42 17 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
Chris@42 18 *
Chris@42 19 */
Chris@42 20
Chris@42 21 /* This file was automatically generated --- DO NOT EDIT */
Chris@42 22 /* Generated on Sat Jul 30 16:50:06 EDT 2016 */
Chris@42 23
Chris@42 24 #include "codelet-rdft.h"
Chris@42 25
Chris@42 26 #ifdef HAVE_FMA
Chris@42 27
Chris@42 28 /* Generated by: ../../../genfft/gen_hc2hc.native -fma -reorder-insns -schedule-for-pipeline -compact -variables 4 -pipeline-latency 4 -sign 1 -n 20 -dif -name hb_20 -include hb.h */
Chris@42 29
Chris@42 30 /*
Chris@42 31 * This function contains 246 FP additions, 148 FP multiplications,
Chris@42 32 * (or, 136 additions, 38 multiplications, 110 fused multiply/add),
Chris@42 33 * 101 stack variables, 4 constants, and 80 memory accesses
Chris@42 34 */
Chris@42 35 #include "hb.h"
Chris@42 36
/*
 * hb_20 (HAVE_FMA variant): genfft-generated codelet (gen_hc2hc, -n 20, -dif,
 * -sign 1, per the "Generated by" comment that precedes this function).
 *
 * cr, ci : data arrays; each pass reads and then overwrites, in place, the
 *          elements at offsets WS(rs, 0) .. WS(rs, 19) of both arrays.
 * W      : twiddle table; 38 values (W[0]..W[37]) are consumed per pass and
 *          the pointer is pre-offset by (mb - 1) * 38 before the first pass.
 * rs     : stride handed to WS() for element addressing.
 * mb, me : half-open pass range [mb, me).
 * ms     : per-pass step; cr advances by +ms while ci moves by -ms.
 */
Chris@42 37 static void hb_20(R *cr, R *ci, const R *W, stride rs, INT mb, INT me, INT ms)
Chris@42 38 {
/* Numerically: KP951056516 = sin(72 deg), KP559016994 = sqrt(5)/4,
 * KP618033988 = (sqrt(5) - 1)/2, KP250000000 = 1/4. */
Chris@42 39 DK(KP951056516, +0.951056516295153572116439333379382143405698634);
Chris@42 40 DK(KP559016994, +0.559016994374947424102293417182819058860154590);
Chris@42 41 DK(KP618033988, +0.618033988749894848204586834365638117720309180);
Chris@42 42 DK(KP250000000, +0.250000000000000000000000000000000000000000000);
Chris@42 43 {
Chris@42 44 INT m;
/* One pass per m; cr walks forward, ci walks backward, W advances by one
 * 38-entry twiddle group. */
Chris@42 45 for (m = mb, W = W + ((mb - 1) * 38); m < me; m = m + 1, cr = cr + ms, ci = ci - ms, W = W + 38, MAKE_VOLATILE_STRIDE(40, rs)) {
/* Declared at loop scope because the final cr[WS(rs, 6)] store that uses
 * them sits after all nested blocks have closed. */
Chris@42 46 E T1T, T1Q, T1P;
Chris@42 47 {
Chris@42 48 E T2W, T4e, T7, TE, T3z, T4z, T1t, T2l, T3a, T3G, T13, T33, T3H, T1i, T2g;
Chris@42 49 E T4H, T4G, T2d, T1B, T4u, T4B, T4A, T4r, T1A, T2s, T3l, T2t, T3s, T2o, T2q;
Chris@42 50 E T1w, T1y, TC, T29, T3E, T3C, T4n, T4l, TN, TL;
/* Input stage: load the cr/ci elements in groups and form their sums and
 * differences. */
Chris@42 51 {
Chris@42 52 E T4, T2U, T3, T2V, T1s, T5, T1n, T1o;
Chris@42 53 {
Chris@42 54 E T1, T2, T1q, T1r;
Chris@42 55 T1 = cr[0];
Chris@42 56 T2 = ci[WS(rs, 9)];
Chris@42 57 T1q = ci[WS(rs, 14)];
Chris@42 58 T1r = cr[WS(rs, 15)];
Chris@42 59 T4 = cr[WS(rs, 5)];
Chris@42 60 T2U = T1 - T2;
Chris@42 61 T3 = T1 + T2;
Chris@42 62 T2V = T1q + T1r;
Chris@42 63 T1s = T1q - T1r;
Chris@42 64 T5 = ci[WS(rs, 4)];
Chris@42 65 T1n = ci[WS(rs, 19)];
Chris@42 66 T1o = cr[WS(rs, 10)];
Chris@42 67 }
Chris@42 68 {
Chris@42 69 E T3y, T6, T3x, T1p;
Chris@42 70 T2W = T2U + T2V;
Chris@42 71 T4e = T2U - T2V;
Chris@42 72 T3y = T4 - T5;
Chris@42 73 T6 = T4 + T5;
Chris@42 74 T3x = T1n + T1o;
Chris@42 75 T1p = T1n - T1o;
Chris@42 76 T7 = T3 + T6;
Chris@42 77 TE = T3 - T6;
Chris@42 78 T3z = T3x - T3y;
Chris@42 79 T4z = T3y + T3x;
Chris@42 80 T1t = T1p - T1s;
Chris@42 81 T2l = T1p + T1s;
Chris@42 82 }
Chris@42 83 }
Chris@42 84 {
Chris@42 85 E T2Z, T4f, Te, TF, T3o, T4p, T1a, T2b, TJ, TA, T4t, T3k, T4j, T39, T2f;
Chris@42 86 E T12, T32, T4g, Tl, TG, T3r, T4q, T1h, T2c, T36, T4i, Tt, TI, T3h, T4s;
Chris@42 87 E TV, T2e;
Chris@42 88 {
Chris@42 89 E Tb, T2X, Ta, T2Y, T19, Tc, T14, T15;
Chris@42 90 {
Chris@42 91 E T8, T9, T17, T18;
Chris@42 92 T8 = cr[WS(rs, 4)];
Chris@42 93 T9 = ci[WS(rs, 5)];
Chris@42 94 T17 = ci[WS(rs, 10)];
Chris@42 95 T18 = cr[WS(rs, 19)];
Chris@42 96 Tb = cr[WS(rs, 9)];
Chris@42 97 T2X = T8 - T9;
Chris@42 98 Ta = T8 + T9;
Chris@42 99 T2Y = T17 + T18;
Chris@42 100 T19 = T17 - T18;
Chris@42 101 Tc = ci[0];
Chris@42 102 T14 = ci[WS(rs, 15)];
Chris@42 103 T15 = cr[WS(rs, 14)];
Chris@42 104 }
Chris@42 105 {
Chris@42 106 E T3n, Td, T3m, T16;
Chris@42 107 T2Z = T2X + T2Y;
Chris@42 108 T4f = T2X - T2Y;
Chris@42 109 T3n = Tb - Tc;
Chris@42 110 Td = Tb + Tc;
Chris@42 111 T3m = T14 + T15;
Chris@42 112 T16 = T14 - T15;
Chris@42 113 Te = Ta + Td;
Chris@42 114 TF = Ta - Td;
Chris@42 115 T3o = T3m - T3n;
Chris@42 116 T4p = T3n + T3m;
Chris@42 117 T1a = T16 - T19;
Chris@42 118 T2b = T16 + T19;
Chris@42 119 }
Chris@42 120 }
Chris@42 121 {
Chris@42 122 E TW, T37, Tw, T3i, Tz, TX, TZ, T10;
Chris@42 123 {
Chris@42 124 E Tu, Tv, Tx, Ty;
Chris@42 125 Tu = ci[WS(rs, 7)];
Chris@42 126 Tv = cr[WS(rs, 2)];
Chris@42 127 Tx = ci[WS(rs, 2)];
Chris@42 128 Ty = cr[WS(rs, 7)];
Chris@42 129 TW = ci[WS(rs, 17)];
Chris@42 130 T37 = Tu - Tv;
Chris@42 131 Tw = Tu + Tv;
Chris@42 132 T3i = Tx - Ty;
Chris@42 133 Tz = Tx + Ty;
Chris@42 134 TX = cr[WS(rs, 12)];
Chris@42 135 TZ = ci[WS(rs, 12)];
Chris@42 136 T10 = cr[WS(rs, 17)];
Chris@42 137 }
Chris@42 138 {
Chris@42 139 E TY, T38, T11, T3j;
Chris@42 140 TJ = Tw - Tz;
Chris@42 141 TA = Tw + Tz;
Chris@42 142 T3j = TW + TX;
Chris@42 143 TY = TW - TX;
Chris@42 144 T38 = TZ + T10;
Chris@42 145 T11 = TZ - T10;
Chris@42 146 T4t = T3i - T3j;
Chris@42 147 T3k = T3i + T3j;
Chris@42 148 T4j = T37 + T38;
Chris@42 149 T39 = T37 - T38;
Chris@42 150 T2f = TY + T11;
Chris@42 151 T12 = TY - T11;
Chris@42 152 }
Chris@42 153 }
Chris@42 154 {
Chris@42 155 E Ti, T30, Th, T31, T1g, Tj, T1b, T1c;
Chris@42 156 {
Chris@42 157 E Tf, Tg, T1e, T1f;
Chris@42 158 Tf = ci[WS(rs, 3)];
Chris@42 159 Tg = cr[WS(rs, 6)];
Chris@42 160 T1e = ci[WS(rs, 18)];
Chris@42 161 T1f = cr[WS(rs, 11)];
Chris@42 162 Ti = cr[WS(rs, 1)];
Chris@42 163 T30 = Tf - Tg;
Chris@42 164 Th = Tf + Tg;
Chris@42 165 T31 = T1e + T1f;
Chris@42 166 T1g = T1e - T1f;
Chris@42 167 Tj = ci[WS(rs, 8)];
Chris@42 168 T1b = ci[WS(rs, 13)];
Chris@42 169 T1c = cr[WS(rs, 16)];
Chris@42 170 }
Chris@42 171 {
Chris@42 172 E T3p, Tk, T3q, T1d;
Chris@42 173 T32 = T30 + T31;
Chris@42 174 T4g = T30 - T31;
Chris@42 175 T3p = Ti - Tj;
Chris@42 176 Tk = Ti + Tj;
Chris@42 177 T3q = T1b + T1c;
Chris@42 178 T1d = T1b - T1c;
Chris@42 179 Tl = Th + Tk;
Chris@42 180 TG = Th - Tk;
Chris@42 181 T3r = T3p + T3q;
Chris@42 182 T4q = T3p - T3q;
Chris@42 183 T1h = T1d - T1g;
Chris@42 184 T2c = T1d + T1g;
Chris@42 185 }
Chris@42 186 }
Chris@42 187 {
Chris@42 188 E Tq, T34, Tp, T35, TU, Tr, TP, TQ;
Chris@42 189 {
Chris@42 190 E Tn, To, TS, TT;
Chris@42 191 Tn = cr[WS(rs, 8)];
Chris@42 192 To = ci[WS(rs, 1)];
Chris@42 193 TS = ci[WS(rs, 16)];
Chris@42 194 TT = cr[WS(rs, 13)];
Chris@42 195 Tq = ci[WS(rs, 6)];
Chris@42 196 T34 = Tn - To;
Chris@42 197 Tp = Tn + To;
Chris@42 198 T35 = TS + TT;
Chris@42 199 TU = TS - TT;
Chris@42 200 Tr = cr[WS(rs, 3)];
Chris@42 201 TP = ci[WS(rs, 11)];
Chris@42 202 TQ = cr[WS(rs, 18)];
Chris@42 203 }
Chris@42 204 {
Chris@42 205 E T3g, Ts, T3f, TR;
Chris@42 206 T36 = T34 - T35;
Chris@42 207 T4i = T34 + T35;
Chris@42 208 T3g = Tq - Tr;
Chris@42 209 Ts = Tq + Tr;
Chris@42 210 T3f = TP + TQ;
Chris@42 211 TR = TP - TQ;
Chris@42 212 Tt = Tp + Ts;
Chris@42 213 TI = Tp - Ts;
Chris@42 214 T3h = T3f - T3g;
Chris@42 215 T4s = T3g + T3f;
Chris@42 216 TV = TR - TU;
Chris@42 217 T2e = TR + TU;
Chris@42 218 }
Chris@42 219 }
/* Combine the per-group partial sums/differences into the terms consumed by
 * the output stage. */
Chris@42 220 {
Chris@42 221 E T1v, T1u, T2n, T4k, T4h, T2m, TH, TK;
Chris@42 222 T3a = T36 + T39;
Chris@42 223 T3G = T36 - T39;
Chris@42 224 T13 = TV - T12;
Chris@42 225 T1v = TV + T12;
Chris@42 226 T33 = T2Z + T32;
Chris@42 227 T3H = T2Z - T32;
Chris@42 228 T1i = T1a - T1h;
Chris@42 229 T1u = T1a + T1h;
Chris@42 230 T2n = T2e + T2f;
Chris@42 231 T2g = T2e - T2f;
Chris@42 232 T4H = T4i - T4j;
Chris@42 233 T4k = T4i + T4j;
Chris@42 234 T4h = T4f + T4g;
Chris@42 235 T4G = T4f - T4g;
Chris@42 236 T2d = T2b - T2c;
Chris@42 237 T2m = T2b + T2c;
Chris@42 238 TH = TF + TG;
Chris@42 239 T1B = TF - TG;
Chris@42 240 T4u = T4s - T4t;
Chris@42 241 T4B = T4s + T4t;
Chris@42 242 T4A = T4p + T4q;
Chris@42 243 T4r = T4p - T4q;
Chris@42 244 T1A = TI - TJ;
Chris@42 245 TK = TI + TJ;
Chris@42 246 {
Chris@42 247 E Tm, T3B, TB, T3A;
Chris@42 248 Tm = Te + Tl;
Chris@42 249 T2s = Te - Tl;
Chris@42 250 T3l = T3h + T3k;
Chris@42 251 T3B = T3h - T3k;
Chris@42 252 TB = Tt + TA;
Chris@42 253 T2t = Tt - TA;
Chris@42 254 T3s = T3o + T3r;
Chris@42 255 T3A = T3o - T3r;
Chris@42 256 T2o = T2m + T2n;
Chris@42 257 T2q = T2m - T2n;
Chris@42 258 T1w = T1u + T1v;
Chris@42 259 T1y = T1u - T1v;
Chris@42 260 TC = Tm + TB;
Chris@42 261 T29 = Tm - TB;
Chris@42 262 T3E = T3A - T3B;
Chris@42 263 T3C = T3A + T3B;
Chris@42 264 T4n = T4h - T4k;
Chris@42 265 T4l = T4h + T4k;
Chris@42 266 TN = TH - TK;
Chris@42 267 TL = TH + TK;
Chris@42 268 }
Chris@42 269 }
Chris@42 270 }
/* Output stage. cr[0] and ci[0] are stored without a twiddle multiply.
 * Every other index k = 1..19 is stored as the pair
 *   cr[WS(rs, k)] = W[2k-2] * a - W[2k-1] * b
 *   ci[WS(rs, k)] = W[2k-2] * b + W[2k-1] * a
 * NOTE(review): this reading assumes FMA(a, b, c) = a*b + c and
 * FNMS(a, b, c) = c - a*b as defined in the project headers -- confirm. */
Chris@42 271 {
Chris@42 272 E T3d, T3b, T4E, T1x, TM, T4m, T58, T5b, T4D, T5a, T5c, T59, T4C;
Chris@42 273 cr[0] = T7 + TC;
Chris@42 274 T3d = T33 - T3a;
Chris@42 275 T3b = T33 + T3a;
Chris@42 276 T4E = T4A - T4B;
Chris@42 277 T4C = T4A + T4B;
Chris@42 278 ci[0] = T2l + T2o;
Chris@42 279 {
Chris@42 280 E T25, T22, T21, T24, T23, T26, T57;
Chris@42 281 T1x = FNMS(KP250000000, T1w, T1t);
Chris@42 282 T25 = T1t + T1w;
Chris@42 283 T22 = TE + TL;
Chris@42 284 TM = FNMS(KP250000000, TL, TE);
Chris@42 285 T21 = W[18];
Chris@42 286 T24 = W[19];
Chris@42 287 T4m = FNMS(KP250000000, T4l, T4e);
Chris@42 288 T58 = T4e + T4l;
Chris@42 289 T5b = T4z + T4C;
Chris@42 290 T4D = FNMS(KP250000000, T4C, T4z);
Chris@42 291 T23 = T21 * T22;
Chris@42 292 T26 = T24 * T22;
Chris@42 293 T57 = W[8];
Chris@42 294 T5a = W[9];
Chris@42 295 cr[WS(rs, 10)] = FNMS(T24, T25, T23);
Chris@42 296 ci[WS(rs, 10)] = FMA(T21, T25, T26);
Chris@42 297 T5c = T57 * T5b;
Chris@42 298 T59 = T57 * T58;
Chris@42 299 }
Chris@42 300 {
Chris@42 301 E T3U, T3Z, T3W, T40, T3V;
Chris@42 302 {
Chris@42 303 E T3c, T48, T4b, T3D, T47, T4a;
Chris@42 304 T3c = FNMS(KP250000000, T3b, T2W);
Chris@42 305 T48 = T2W + T3b;
Chris@42 306 T4b = T3z + T3C;
Chris@42 307 T3D = FNMS(KP250000000, T3C, T3z);
Chris@42 308 ci[WS(rs, 5)] = FMA(T5a, T58, T5c);
Chris@42 309 cr[WS(rs, 5)] = FNMS(T5a, T5b, T59);
Chris@42 310 T47 = W[28];
Chris@42 311 T4a = W[29];
Chris@42 312 {
Chris@42 313 E T3I, T3Y, T42, T3u, T3M, T3X, T3F;
Chris@42 314 {
Chris@42 315 E T3T, T3t, T4c, T49, T3e, T3S;
Chris@42 316 T3T = FMA(KP618033988, T3l, T3s);
Chris@42 317 T3t = FNMS(KP618033988, T3s, T3l);
Chris@42 318 T4c = T47 * T4b;
Chris@42 319 T49 = T47 * T48;
Chris@42 320 T3I = FNMS(KP618033988, T3H, T3G);
Chris@42 321 T3Y = FMA(KP618033988, T3G, T3H);
Chris@42 322 ci[WS(rs, 15)] = FMA(T4a, T48, T4c);
Chris@42 323 cr[WS(rs, 15)] = FNMS(T4a, T4b, T49);
Chris@42 324 T3e = FNMS(KP559016994, T3d, T3c);
Chris@42 325 T3S = FMA(KP559016994, T3d, T3c);
Chris@42 326 T42 = FMA(KP951056516, T3T, T3S);
Chris@42 327 T3U = FNMS(KP951056516, T3T, T3S);
Chris@42 328 T3u = FNMS(KP951056516, T3t, T3e);
Chris@42 329 T3M = FMA(KP951056516, T3t, T3e);
Chris@42 330 T3X = FMA(KP559016994, T3E, T3D);
Chris@42 331 T3F = FNMS(KP559016994, T3E, T3D);
Chris@42 332 }
Chris@42 333 {
Chris@42 334 E T3P, T45, T44, T46, T43;
Chris@42 335 {
Chris@42 336 E T3w, T3J, T3v, T3K, T2T, T41;
Chris@42 337 T2T = W[4];
Chris@42 338 T3w = W[5];
Chris@42 339 T3J = FMA(KP951056516, T3I, T3F);
Chris@42 340 T3P = FNMS(KP951056516, T3I, T3F);
Chris@42 341 T45 = FNMS(KP951056516, T3Y, T3X);
Chris@42 342 T3Z = FMA(KP951056516, T3Y, T3X);
Chris@42 343 T3v = T2T * T3u;
Chris@42 344 T3K = T2T * T3J;
Chris@42 345 T41 = W[36];
Chris@42 346 T44 = W[37];
Chris@42 347 cr[WS(rs, 3)] = FNMS(T3w, T3J, T3v);
Chris@42 348 ci[WS(rs, 3)] = FMA(T3w, T3u, T3K);
Chris@42 349 T46 = T41 * T45;
Chris@42 350 T43 = T41 * T42;
Chris@42 351 }
Chris@42 352 {
Chris@42 353 E T3O, T3Q, T3N, T3L, T3R;
Chris@42 354 T3L = W[12];
Chris@42 355 T3O = W[13];
Chris@42 356 ci[WS(rs, 19)] = FMA(T44, T42, T46);
Chris@42 357 cr[WS(rs, 19)] = FNMS(T44, T45, T43);
Chris@42 358 T3Q = T3L * T3P;
Chris@42 359 T3N = T3L * T3M;
Chris@42 360 T3R = W[20];
Chris@42 361 T3W = W[21];
Chris@42 362 ci[WS(rs, 7)] = FMA(T3O, T3M, T3Q);
Chris@42 363 cr[WS(rs, 7)] = FNMS(T3O, T3P, T3N);
Chris@42 364 T40 = T3R * T3Z;
Chris@42 365 T3V = T3R * T3U;
Chris@42 366 }
Chris@42 367 }
Chris@42 368 }
Chris@42 369 }
Chris@42 370 {
Chris@42 371 E T4U, T4Z, T4W, T50, T4V, T2L, T2I, T2H;
Chris@42 372 {
Chris@42 373 E T4T, T4v, T4I, T4Y, T4o, T4S;
Chris@42 374 T4T = FNMS(KP618033988, T4r, T4u);
Chris@42 375 T4v = FMA(KP618033988, T4u, T4r);
Chris@42 376 ci[WS(rs, 11)] = FMA(T3W, T3U, T40);
Chris@42 377 cr[WS(rs, 11)] = FNMS(T3W, T3Z, T3V);
Chris@42 378 T4I = FMA(KP618033988, T4H, T4G);
Chris@42 379 T4Y = FNMS(KP618033988, T4G, T4H);
Chris@42 380 T4o = FMA(KP559016994, T4n, T4m);
Chris@42 381 T4S = FNMS(KP559016994, T4n, T4m);
Chris@42 382 {
Chris@42 383 E T52, T4M, T55, T4P, T54, T56, T53;
Chris@42 384 {
Chris@42 385 E T4d, T4w, T4J, T4x, T4y, T4X, T4F, T51, T4K;
Chris@42 386 T4d = W[0];
Chris@42 387 T4X = FNMS(KP559016994, T4E, T4D);
Chris@42 388 T4F = FMA(KP559016994, T4E, T4D);
Chris@42 389 T4U = FNMS(KP951056516, T4T, T4S);
Chris@42 390 T52 = FMA(KP951056516, T4T, T4S);
Chris@42 391 T4M = FMA(KP951056516, T4v, T4o);
Chris@42 392 T4w = FNMS(KP951056516, T4v, T4o);
Chris@42 393 T4Z = FMA(KP951056516, T4Y, T4X);
Chris@42 394 T55 = FNMS(KP951056516, T4Y, T4X);
Chris@42 395 T4P = FNMS(KP951056516, T4I, T4F);
Chris@42 396 T4J = FMA(KP951056516, T4I, T4F);
Chris@42 397 T4x = T4d * T4w;
Chris@42 398 T4y = W[1];
Chris@42 399 T51 = W[32];
Chris@42 400 T4K = T4d * T4J;
Chris@42 401 T54 = W[33];
Chris@42 402 cr[WS(rs, 1)] = FNMS(T4y, T4J, T4x);
Chris@42 403 T56 = T51 * T55;
Chris@42 404 T53 = T51 * T52;
Chris@42 405 ci[WS(rs, 1)] = FMA(T4y, T4w, T4K);
Chris@42 406 }
Chris@42 407 {
Chris@42 408 E T4O, T4Q, T4N, T4L, T4R;
Chris@42 409 T4L = W[16];
Chris@42 410 ci[WS(rs, 17)] = FMA(T54, T52, T56);
Chris@42 411 cr[WS(rs, 17)] = FNMS(T54, T55, T53);
Chris@42 412 T4O = W[17];
Chris@42 413 T4Q = T4L * T4P;
Chris@42 414 T4N = T4L * T4M;
Chris@42 415 T4R = W[24];
Chris@42 416 T4W = W[25];
Chris@42 417 ci[WS(rs, 9)] = FMA(T4O, T4M, T4Q);
Chris@42 418 cr[WS(rs, 9)] = FNMS(T4O, T4P, T4N);
Chris@42 419 T50 = T4R * T4Z;
Chris@42 420 T4V = T4R * T4U;
Chris@42 421 }
Chris@42 422 }
Chris@42 423 }
Chris@42 424 {
Chris@42 425 E T2K, T2u, T2F, T2h, T28, T2J, T2r, T2p;
Chris@42 426 T2K = FNMS(KP618033988, T2s, T2t);
Chris@42 427 T2u = FMA(KP618033988, T2t, T2s);
Chris@42 428 ci[WS(rs, 13)] = FMA(T4W, T4U, T50);
Chris@42 429 cr[WS(rs, 13)] = FNMS(T4W, T4Z, T4V);
Chris@42 430 T2p = FNMS(KP250000000, T2o, T2l);
Chris@42 431 T2F = FNMS(KP618033988, T2d, T2g);
Chris@42 432 T2h = FMA(KP618033988, T2g, T2d);
Chris@42 433 T28 = FNMS(KP250000000, TC, T7);
Chris@42 434 T2J = FNMS(KP559016994, T2q, T2p);
Chris@42 435 T2r = FMA(KP559016994, T2q, T2p);
Chris@42 436 {
Chris@42 437 E T2B, T2G, T2y, T2R, T2Q, T2P, T2A, T2x;
Chris@42 438 {
Chris@42 439 E T2k, T2v, T27, T2O, T2i, T2a, T2E;
Chris@42 440 T2k = W[7];
Chris@42 441 T2a = FMA(KP559016994, T29, T28);
Chris@42 442 T2E = FNMS(KP559016994, T29, T28);
Chris@42 443 T2B = FMA(KP951056516, T2u, T2r);
Chris@42 444 T2v = FNMS(KP951056516, T2u, T2r);
Chris@42 445 T27 = W[6];
Chris@42 446 T2O = FMA(KP951056516, T2F, T2E);
Chris@42 447 T2G = FNMS(KP951056516, T2F, T2E);
Chris@42 448 T2i = FMA(KP951056516, T2h, T2a);
Chris@42 449 T2y = FNMS(KP951056516, T2h, T2a);
Chris@42 450 {
Chris@42 451 E T2N, T2j, T2w, T2S;
Chris@42 452 T2L = FMA(KP951056516, T2K, T2J);
Chris@42 453 T2R = FNMS(KP951056516, T2K, T2J);
Chris@42 454 T2Q = W[23];
Chris@42 455 T2N = W[22];
Chris@42 456 T2j = T27 * T2i;
Chris@42 457 T2w = T2k * T2i;
Chris@42 458 T2S = T2Q * T2O;
Chris@42 459 T2P = T2N * T2O;
Chris@42 460 cr[WS(rs, 4)] = FNMS(T2k, T2v, T2j);
Chris@42 461 ci[WS(rs, 4)] = FMA(T27, T2v, T2w);
Chris@42 462 ci[WS(rs, 12)] = FMA(T2N, T2R, T2S);
Chris@42 463 }
Chris@42 464 }
Chris@42 465 cr[WS(rs, 12)] = FNMS(T2Q, T2R, T2P);
Chris@42 466 T2A = W[31];
Chris@42 467 T2x = W[30];
Chris@42 468 {
Chris@42 469 E T2D, T2M, T2C, T2z;
Chris@42 470 T2I = W[15];
Chris@42 471 T2C = T2A * T2y;
Chris@42 472 T2z = T2x * T2y;
Chris@42 473 T2D = W[14];
Chris@42 474 T2M = T2I * T2G;
Chris@42 475 ci[WS(rs, 16)] = FMA(T2x, T2B, T2C);
Chris@42 476 cr[WS(rs, 16)] = FNMS(T2A, T2B, T2z);
Chris@42 477 T2H = T2D * T2G;
Chris@42 478 ci[WS(rs, 8)] = FMA(T2D, T2L, T2M);
Chris@42 479 }
Chris@42 480 }
Chris@42 481 }
Chris@42 482 {
Chris@42 483 E T1S, T1C, T1j, T1N, T1z, T1R;
Chris@42 484 T1S = FMA(KP618033988, T1A, T1B);
Chris@42 485 T1C = FNMS(KP618033988, T1B, T1A);
Chris@42 486 cr[WS(rs, 8)] = FNMS(T2I, T2L, T2H);
Chris@42 487 T1j = FNMS(KP618033988, T1i, T13);
Chris@42 488 T1N = FMA(KP618033988, T13, T1i);
Chris@42 489 T1z = FNMS(KP559016994, T1y, T1x);
Chris@42 490 T1R = FMA(KP559016994, T1y, T1x);
Chris@42 491 {
Chris@42 492 E T1J, T1O, T1G, T1Z, T1Y, T1X, T1I, T1F;
Chris@42 493 {
Chris@42 494 E T1m, T1D, TD, T1W, T1k, T1M, TO;
Chris@42 495 T1m = W[3];
Chris@42 496 T1M = FMA(KP559016994, TN, TM);
Chris@42 497 TO = FNMS(KP559016994, TN, TM);
Chris@42 498 T1D = FNMS(KP951056516, T1C, T1z);
Chris@42 499 T1J = FMA(KP951056516, T1C, T1z);
Chris@42 500 TD = W[2];
Chris@42 501 T1O = FNMS(KP951056516, T1N, T1M);
Chris@42 502 T1W = FMA(KP951056516, T1N, T1M);
Chris@42 503 T1G = FNMS(KP951056516, T1j, TO);
Chris@42 504 T1k = FMA(KP951056516, T1j, TO);
Chris@42 505 {
Chris@42 506 E T1V, T1l, T1E, T20;
Chris@42 507 T1Z = FNMS(KP951056516, T1S, T1R);
Chris@42 508 T1T = FMA(KP951056516, T1S, T1R);
Chris@42 509 T1Y = W[27];
Chris@42 510 T1V = W[26];
Chris@42 511 T1l = TD * T1k;
Chris@42 512 T1E = T1m * T1k;
Chris@42 513 T20 = T1Y * T1W;
Chris@42 514 T1X = T1V * T1W;
Chris@42 515 cr[WS(rs, 2)] = FNMS(T1m, T1D, T1l);
Chris@42 516 ci[WS(rs, 2)] = FMA(TD, T1D, T1E);
Chris@42 517 ci[WS(rs, 14)] = FMA(T1V, T1Z, T20);
Chris@42 518 }
Chris@42 519 }
Chris@42 520 cr[WS(rs, 14)] = FNMS(T1Y, T1Z, T1X);
Chris@42 521 T1I = W[35];
Chris@42 522 T1F = W[34];
Chris@42 523 {
Chris@42 524 E T1L, T1U, T1K, T1H;
Chris@42 525 T1Q = W[11];
Chris@42 526 T1K = T1I * T1G;
Chris@42 527 T1H = T1F * T1G;
Chris@42 528 T1L = W[10];
Chris@42 529 T1U = T1Q * T1O;
Chris@42 530 ci[WS(rs, 18)] = FMA(T1F, T1J, T1K);
Chris@42 531 cr[WS(rs, 18)] = FNMS(T1I, T1J, T1H);
Chris@42 532 T1P = T1L * T1O;
Chris@42 533 ci[WS(rs, 6)] = FMA(T1L, T1T, T1U);
Chris@42 534 }
Chris@42 535 }
Chris@42 536 }
Chris@42 537 }
Chris@42 538 }
Chris@42 539 }
Chris@42 540 }
/* Last store of the pass; uses the loop-scope temporaries T1Q, T1T, T1P. */
Chris@42 541 cr[WS(rs, 6)] = FNMS(T1Q, T1T, T1P);
Chris@42 542 }
Chris@42 543 }
Chris@42 544 }
Chris@42 545
/* Twiddle instruction table referenced by desc (HAVE_FMA branch).
 * NOTE(review): presumably requests the full radix-20 twiddle set, which would
 * match the 38 W values hb_20 consumes per pass -- confirm against the
 * tw_instr / TW_FULL / TW_NEXT definitions in the project headers. */
Chris@42 546 static const tw_instr twinstr[] = {
Chris@42 547 {TW_FULL, 1, 20},
Chris@42 548 {TW_NEXT, 1, 0}
Chris@42 549 };
Chris@42 550
/* Codelet descriptor for the HAVE_FMA variant: size 20, name "hb_20", the
 * twinstr table and GENUS. The trailing {136, 38, 110, 0} matches the
 * "136 additions, 38 multiplications, 110 fused multiply/add" counts in the
 * generated comment for this variant; the meaning of the fourth entry is not
 * visible here. */
Chris@42 551 static const hc2hc_desc desc = { 20, "hb_20", twinstr, &GENUS, {136, 38, 110, 0} };
Chris@42 552
/* Registration entry point (HAVE_FMA branch): hands hb_20 and its descriptor
 * to planner p through X(khc2hc_register). */
Chris@42 553 void X(codelet_hb_20) (planner *p) {
Chris@42 554 X(khc2hc_register) (p, hb_20, &desc);
Chris@42 555 }
Chris@42 556 #else /* HAVE_FMA */
Chris@42 557
Chris@42 558 /* Generated by: ../../../genfft/gen_hc2hc.native -compact -variables 4 -pipeline-latency 4 -sign 1 -n 20 -dif -name hb_20 -include hb.h */
Chris@42 559
Chris@42 560 /*
Chris@42 561 * This function contains 246 FP additions, 124 FP multiplications,
Chris@42 562 * (or, 184 additions, 62 multiplications, 62 fused multiply/add),
Chris@42 563 * 97 stack variables, 4 constants, and 80 memory accesses
Chris@42 564 */
Chris@42 565 #include "hb.h"
Chris@42 566
/*
 * hb_20 (non-FMA variant): genfft-generated codelet (gen_hc2hc, -n 20, -dif,
 * -sign 1, per the "Generated by" comment that precedes this function).
 *
 * cr, ci : data arrays; each pass reads and then overwrites, in place, the
 *          elements at offsets WS(rs, 0) .. WS(rs, 19) of both arrays.
 * W      : twiddle table; 38 values (W[0]..W[37]) are consumed per pass and
 *          the pointer is pre-offset by (mb - 1) * 38 before the first pass.
 * rs     : stride handed to WS() for element addressing.
 * mb, me : half-open pass range [mb, me).
 * ms     : per-pass step; cr advances by +ms while ci moves by -ms.
 */
Chris@42 567 static void hb_20(R *cr, R *ci, const R *W, stride rs, INT mb, INT me, INT ms)
Chris@42 568 {
/* Numerically: KP250000000 = 1/4, KP559016994 = sqrt(5)/4,
 * KP587785252 = sin(36 deg), KP951056516 = sin(72 deg). */
Chris@42 569 DK(KP250000000, +0.250000000000000000000000000000000000000000000);
Chris@42 570 DK(KP559016994, +0.559016994374947424102293417182819058860154590);
Chris@42 571 DK(KP587785252, +0.587785252292473129168705954639072768597652438);
Chris@42 572 DK(KP951056516, +0.951056516295153572116439333379382143405698634);
Chris@42 573 {
Chris@42 574 INT m;
/* One pass per m; cr walks forward, ci walks backward, W advances by one
 * 38-entry twiddle group. */
Chris@42 575 for (m = mb, W = W + ((mb - 1) * 38); m < me; m = m + 1, cr = cr + ms, ci = ci - ms, W = W + 38, MAKE_VOLATILE_STRIDE(40, rs)) {
Chris@42 576 E T7, T3T, T49, TE, T1v, T2T, T3g, T2d, T13, T3n, T3o, T1i, T26, T4e, T4d;
Chris@42 577 E T23, T1n, T42, T3Z, T1m, T2h, T2I, T2i, T2P, T30, T37, T38, Tm, TB, TC;
Chris@42 578 E T46, T47, T4a, T2a, T2b, T2e, T1w, T1x, T1y, T3O, T3R, T3U, T3h, T3i, T3j;
Chris@42 579 E TH, TK, TL;
/* Input stage: load the cr/ci elements in groups and form their sums and
 * differences. */
Chris@42 580 {
Chris@42 581 E T3, T2R, T1u, T2S, T6, T3f, T1r, T3e;
Chris@42 582 {
Chris@42 583 E T1, T2, T1s, T1t;
Chris@42 584 T1 = cr[0];
Chris@42 585 T2 = ci[WS(rs, 9)];
Chris@42 586 T3 = T1 + T2;
Chris@42 587 T2R = T1 - T2;
Chris@42 588 T1s = ci[WS(rs, 14)];
Chris@42 589 T1t = cr[WS(rs, 15)];
Chris@42 590 T1u = T1s - T1t;
Chris@42 591 T2S = T1s + T1t;
Chris@42 592 }
Chris@42 593 {
Chris@42 594 E T4, T5, T1p, T1q;
Chris@42 595 T4 = cr[WS(rs, 5)];
Chris@42 596 T5 = ci[WS(rs, 4)];
Chris@42 597 T6 = T4 + T5;
Chris@42 598 T3f = T4 - T5;
Chris@42 599 T1p = ci[WS(rs, 19)];
Chris@42 600 T1q = cr[WS(rs, 10)];
Chris@42 601 T1r = T1p - T1q;
Chris@42 602 T3e = T1p + T1q;
Chris@42 603 }
Chris@42 604 T7 = T3 + T6;
Chris@42 605 T3T = T2R - T2S;
Chris@42 606 T49 = T3f + T3e;
Chris@42 607 TE = T3 - T6;
Chris@42 608 T1v = T1r - T1u;
Chris@42 609 T2T = T2R + T2S;
Chris@42 610 T3g = T3e - T3f;
Chris@42 611 T2d = T1r + T1u;
Chris@42 612 }
Chris@42 613 {
Chris@42 614 E Te, T3M, T3X, TF, TV, T2E, T2W, T21, TA, T3Q, T41, TJ, T1h, T2O, T36;
Chris@42 615 E T25, Tl, T3N, T3Y, TG, T12, T2H, T2Z, T22, Tt, T3P, T40, TI, T1a, T2L;
Chris@42 616 E T33, T24;
Chris@42 617 {
Chris@42 618 E Ta, T2U, TU, T2V, Td, T2D, TR, T2C;
Chris@42 619 {
Chris@42 620 E T8, T9, TS, TT;
Chris@42 621 T8 = cr[WS(rs, 4)];
Chris@42 622 T9 = ci[WS(rs, 5)];
Chris@42 623 Ta = T8 + T9;
Chris@42 624 T2U = T8 - T9;
Chris@42 625 TS = ci[WS(rs, 10)];
Chris@42 626 TT = cr[WS(rs, 19)];
Chris@42 627 TU = TS - TT;
Chris@42 628 T2V = TS + TT;
Chris@42 629 }
Chris@42 630 {
Chris@42 631 E Tb, Tc, TP, TQ;
Chris@42 632 Tb = cr[WS(rs, 9)];
Chris@42 633 Tc = ci[0];
Chris@42 634 Td = Tb + Tc;
Chris@42 635 T2D = Tb - Tc;
Chris@42 636 TP = ci[WS(rs, 15)];
Chris@42 637 TQ = cr[WS(rs, 14)];
Chris@42 638 TR = TP - TQ;
Chris@42 639 T2C = TP + TQ;
Chris@42 640 }
Chris@42 641 Te = Ta + Td;
Chris@42 642 T3M = T2U - T2V;
Chris@42 643 T3X = T2D + T2C;
Chris@42 644 TF = Ta - Td;
Chris@42 645 TV = TR - TU;
Chris@42 646 T2E = T2C - T2D;
Chris@42 647 T2W = T2U + T2V;
Chris@42 648 T21 = TR + TU;
Chris@42 649 }
Chris@42 650 {
Chris@42 651 E Tw, T34, Tz, T2M, T1d, T2N, T1g, T35;
Chris@42 652 {
Chris@42 653 E Tu, Tv, Tx, Ty;
Chris@42 654 Tu = ci[WS(rs, 7)];
Chris@42 655 Tv = cr[WS(rs, 2)];
Chris@42 656 Tw = Tu + Tv;
Chris@42 657 T34 = Tu - Tv;
Chris@42 658 Tx = ci[WS(rs, 2)];
Chris@42 659 Ty = cr[WS(rs, 7)];
Chris@42 660 Tz = Tx + Ty;
Chris@42 661 T2M = Tx - Ty;
Chris@42 662 }
Chris@42 663 {
Chris@42 664 E T1b, T1c, T1e, T1f;
Chris@42 665 T1b = ci[WS(rs, 17)];
Chris@42 666 T1c = cr[WS(rs, 12)];
Chris@42 667 T1d = T1b - T1c;
Chris@42 668 T2N = T1b + T1c;
Chris@42 669 T1e = ci[WS(rs, 12)];
Chris@42 670 T1f = cr[WS(rs, 17)];
Chris@42 671 T1g = T1e - T1f;
Chris@42 672 T35 = T1e + T1f;
Chris@42 673 }
Chris@42 674 TA = Tw + Tz;
Chris@42 675 T3Q = T34 + T35;
Chris@42 676 T41 = T2M - T2N;
Chris@42 677 TJ = Tw - Tz;
Chris@42 678 T1h = T1d - T1g;
Chris@42 679 T2O = T2M + T2N;
Chris@42 680 T36 = T34 - T35;
Chris@42 681 T25 = T1d + T1g;
Chris@42 682 }
Chris@42 683 {
Chris@42 684 E Th, T2X, T11, T2Y, Tk, T2F, TY, T2G;
Chris@42 685 {
Chris@42 686 E Tf, Tg, TZ, T10;
Chris@42 687 Tf = ci[WS(rs, 3)];
Chris@42 688 Tg = cr[WS(rs, 6)];
Chris@42 689 Th = Tf + Tg;
Chris@42 690 T2X = Tf - Tg;
Chris@42 691 TZ = ci[WS(rs, 18)];
Chris@42 692 T10 = cr[WS(rs, 11)];
Chris@42 693 T11 = TZ - T10;
Chris@42 694 T2Y = TZ + T10;
Chris@42 695 }
Chris@42 696 {
Chris@42 697 E Ti, Tj, TW, TX;
Chris@42 698 Ti = cr[WS(rs, 1)];
Chris@42 699 Tj = ci[WS(rs, 8)];
Chris@42 700 Tk = Ti + Tj;
Chris@42 701 T2F = Ti - Tj;
Chris@42 702 TW = ci[WS(rs, 13)];
Chris@42 703 TX = cr[WS(rs, 16)];
Chris@42 704 TY = TW - TX;
Chris@42 705 T2G = TW + TX;
Chris@42 706 }
Chris@42 707 Tl = Th + Tk;
Chris@42 708 T3N = T2X - T2Y;
Chris@42 709 T3Y = T2F - T2G;
Chris@42 710 TG = Th - Tk;
Chris@42 711 T12 = TY - T11;
Chris@42 712 T2H = T2F + T2G;
Chris@42 713 T2Z = T2X + T2Y;
Chris@42 714 T22 = TY + T11;
Chris@42 715 }
Chris@42 716 {
Chris@42 717 E Tp, T31, T19, T32, Ts, T2K, T16, T2J;
Chris@42 718 {
Chris@42 719 E Tn, To, T17, T18;
Chris@42 720 Tn = cr[WS(rs, 8)];
Chris@42 721 To = ci[WS(rs, 1)];
Chris@42 722 Tp = Tn + To;
Chris@42 723 T31 = Tn - To;
Chris@42 724 T17 = ci[WS(rs, 16)];
Chris@42 725 T18 = cr[WS(rs, 13)];
Chris@42 726 T19 = T17 - T18;
Chris@42 727 T32 = T17 + T18;
Chris@42 728 }
Chris@42 729 {
Chris@42 730 E Tq, Tr, T14, T15;
Chris@42 731 Tq = ci[WS(rs, 6)];
Chris@42 732 Tr = cr[WS(rs, 3)];
Chris@42 733 Ts = Tq + Tr;
Chris@42 734 T2K = Tq - Tr;
Chris@42 735 T14 = ci[WS(rs, 11)];
Chris@42 736 T15 = cr[WS(rs, 18)];
Chris@42 737 T16 = T14 - T15;
Chris@42 738 T2J = T14 + T15;
Chris@42 739 }
Chris@42 740 Tt = Tp + Ts;
Chris@42 741 T3P = T31 + T32;
Chris@42 742 T40 = T2K + T2J;
Chris@42 743 TI = Tp - Ts;
Chris@42 744 T1a = T16 - T19;
Chris@42 745 T2L = T2J - T2K;
Chris@42 746 T33 = T31 - T32;
Chris@42 747 T24 = T16 + T19;
Chris@42 748 }
/* Combine the per-group partial sums/differences into the terms consumed by
 * the output stage. */
Chris@42 749 T13 = TV - T12;
Chris@42 750 T3n = T2W - T2Z;
Chris@42 751 T3o = T33 - T36;
Chris@42 752 T1i = T1a - T1h;
Chris@42 753 T26 = T24 - T25;
Chris@42 754 T4e = T3P - T3Q;
Chris@42 755 T4d = T3M - T3N;
Chris@42 756 T23 = T21 - T22;
Chris@42 757 T1n = TI - TJ;
Chris@42 758 T42 = T40 - T41;
Chris@42 759 T3Z = T3X - T3Y;
Chris@42 760 T1m = TF - TG;
Chris@42 761 T2h = Te - Tl;
Chris@42 762 T2I = T2E + T2H;
Chris@42 763 T2i = Tt - TA;
Chris@42 764 T2P = T2L + T2O;
Chris@42 765 T30 = T2W + T2Z;
Chris@42 766 T37 = T33 + T36;
Chris@42 767 T38 = T30 + T37;
Chris@42 768 Tm = Te + Tl;
Chris@42 769 TB = Tt + TA;
Chris@42 770 TC = Tm + TB;
Chris@42 771 T46 = T3X + T3Y;
Chris@42 772 T47 = T40 + T41;
Chris@42 773 T4a = T46 + T47;
Chris@42 774 T2a = T21 + T22;
Chris@42 775 T2b = T24 + T25;
Chris@42 776 T2e = T2a + T2b;
Chris@42 777 T1w = TV + T12;
Chris@42 778 T1x = T1a + T1h;
Chris@42 779 T1y = T1w + T1x;
Chris@42 780 T3O = T3M + T3N;
Chris@42 781 T3R = T3P + T3Q;
Chris@42 782 T3U = T3O + T3R;
Chris@42 783 T3h = T2E - T2H;
Chris@42 784 T3i = T2L - T2O;
Chris@42 785 T3j = T3h + T3i;
Chris@42 786 TH = TF + TG;
Chris@42 787 TK = TI + TJ;
Chris@42 788 TL = TH + TK;
Chris@42 789 }
/* Output stage. cr[0] and ci[0] are stored without a twiddle multiply.
 * Every other index k = 1..19 is stored as the pair
 *   cr[WS(rs, k)] = W[2k-2] * a - W[2k-1] * b
 *   ci[WS(rs, k)] = W[2k-2] * b + W[2k-1] * a
 * NOTE(review): this reading assumes FMA(a, b, c) = a*b + c and
 * FNMS(a, b, c) = c - a*b as defined in the project headers -- confirm. */
Chris@42 790 cr[0] = T7 + TC;
Chris@42 791 ci[0] = T2d + T2e;
Chris@42 792 {
Chris@42 793 E T1U, T1W, T1T, T1V;
Chris@42 794 T1U = TE + TL;
Chris@42 795 T1W = T1v + T1y;
Chris@42 796 T1T = W[18];
Chris@42 797 T1V = W[19];
Chris@42 798 cr[WS(rs, 10)] = FNMS(T1V, T1W, T1T * T1U);
Chris@42 799 ci[WS(rs, 10)] = FMA(T1V, T1U, T1T * T1W);
Chris@42 800 }
Chris@42 801 {
Chris@42 802 E T4y, T4A, T4x, T4z;
Chris@42 803 T4y = T3T + T3U;
Chris@42 804 T4A = T49 + T4a;
Chris@42 805 T4x = W[8];
Chris@42 806 T4z = W[9];
Chris@42 807 cr[WS(rs, 5)] = FNMS(T4z, T4A, T4x * T4y);
Chris@42 808 ci[WS(rs, 5)] = FMA(T4x, T4A, T4z * T4y);
Chris@42 809 }
Chris@42 810 {
Chris@42 811 E T3I, T3K, T3H, T3J;
Chris@42 812 T3I = T2T + T38;
Chris@42 813 T3K = T3g + T3j;
Chris@42 814 T3H = W[28];
Chris@42 815 T3J = W[29];
Chris@42 816 cr[WS(rs, 15)] = FNMS(T3J, T3K, T3H * T3I);
Chris@42 817 ci[WS(rs, 15)] = FMA(T3H, T3K, T3J * T3I);
Chris@42 818 }
/* Outputs 4, 12, 16 and 8. */
Chris@42 819 {
Chris@42 820 E T27, T2j, T2v, T2r, T2g, T2u, T20, T2q;
Chris@42 821 T27 = FMA(KP951056516, T23, KP587785252 * T26);
Chris@42 822 T2j = FMA(KP951056516, T2h, KP587785252 * T2i);
Chris@42 823 T2v = FNMS(KP951056516, T2i, KP587785252 * T2h);
Chris@42 824 T2r = FNMS(KP951056516, T26, KP587785252 * T23);
Chris@42 825 {
Chris@42 826 E T2c, T2f, T1Y, T1Z;
Chris@42 827 T2c = KP559016994 * (T2a - T2b);
Chris@42 828 T2f = FNMS(KP250000000, T2e, T2d);
Chris@42 829 T2g = T2c + T2f;
Chris@42 830 T2u = T2f - T2c;
Chris@42 831 T1Y = KP559016994 * (Tm - TB);
Chris@42 832 T1Z = FNMS(KP250000000, TC, T7);
Chris@42 833 T20 = T1Y + T1Z;
Chris@42 834 T2q = T1Z - T1Y;
Chris@42 835 }
Chris@42 836 {
Chris@42 837 E T28, T2k, T1X, T29;
Chris@42 838 T28 = T20 + T27;
Chris@42 839 T2k = T2g - T2j;
Chris@42 840 T1X = W[6];
Chris@42 841 T29 = W[7];
Chris@42 842 cr[WS(rs, 4)] = FNMS(T29, T2k, T1X * T28);
Chris@42 843 ci[WS(rs, 4)] = FMA(T29, T28, T1X * T2k);
Chris@42 844 }
Chris@42 845 {
Chris@42 846 E T2y, T2A, T2x, T2z;
Chris@42 847 T2y = T2q - T2r;
Chris@42 848 T2A = T2v + T2u;
Chris@42 849 T2x = W[22];
Chris@42 850 T2z = W[23];
Chris@42 851 cr[WS(rs, 12)] = FNMS(T2z, T2A, T2x * T2y);
Chris@42 852 ci[WS(rs, 12)] = FMA(T2z, T2y, T2x * T2A);
Chris@42 853 }
Chris@42 854 {
Chris@42 855 E T2m, T2o, T2l, T2n;
Chris@42 856 T2m = T20 - T27;
Chris@42 857 T2o = T2j + T2g;
Chris@42 858 T2l = W[30];
Chris@42 859 T2n = W[31];
Chris@42 860 cr[WS(rs, 16)] = FNMS(T2n, T2o, T2l * T2m);
Chris@42 861 ci[WS(rs, 16)] = FMA(T2n, T2m, T2l * T2o);
Chris@42 862 }
Chris@42 863 {
Chris@42 864 E T2s, T2w, T2p, T2t;
Chris@42 865 T2s = T2q + T2r;
Chris@42 866 T2w = T2u - T2v;
Chris@42 867 T2p = W[14];
Chris@42 868 T2t = W[15];
Chris@42 869 cr[WS(rs, 8)] = FNMS(T2t, T2w, T2p * T2s);
Chris@42 870 ci[WS(rs, 8)] = FMA(T2t, T2s, T2p * T2w);
Chris@42 871 }
Chris@42 872 }
/* Outputs 1, 17, 9 and 13. */
Chris@42 873 {
Chris@42 874 E T43, T4f, T4r, T4m, T4c, T4q, T3W, T4n;
Chris@42 875 T43 = FMA(KP951056516, T3Z, KP587785252 * T42);
Chris@42 876 T4f = FMA(KP951056516, T4d, KP587785252 * T4e);
Chris@42 877 T4r = FNMS(KP951056516, T4e, KP587785252 * T4d);
Chris@42 878 T4m = FNMS(KP951056516, T42, KP587785252 * T3Z);
Chris@42 879 {
Chris@42 880 E T48, T4b, T3S, T3V;
Chris@42 881 T48 = KP559016994 * (T46 - T47);
Chris@42 882 T4b = FNMS(KP250000000, T4a, T49);
Chris@42 883 T4c = T48 + T4b;
Chris@42 884 T4q = T4b - T48;
Chris@42 885 T3S = KP559016994 * (T3O - T3R);
Chris@42 886 T3V = FNMS(KP250000000, T3U, T3T);
Chris@42 887 T3W = T3S + T3V;
Chris@42 888 T4n = T3V - T3S;
Chris@42 889 }
Chris@42 890 {
Chris@42 891 E T44, T4g, T3L, T45;
Chris@42 892 T44 = T3W - T43;
Chris@42 893 T4g = T4c + T4f;
Chris@42 894 T3L = W[0];
Chris@42 895 T45 = W[1];
Chris@42 896 cr[WS(rs, 1)] = FNMS(T45, T4g, T3L * T44);
Chris@42 897 ci[WS(rs, 1)] = FMA(T3L, T4g, T45 * T44);
Chris@42 898 }
Chris@42 899 {
Chris@42 900 E T4u, T4w, T4t, T4v;
Chris@42 901 T4u = T4n - T4m;
Chris@42 902 T4w = T4q + T4r;
Chris@42 903 T4t = W[32];
Chris@42 904 T4v = W[33];
Chris@42 905 cr[WS(rs, 17)] = FNMS(T4v, T4w, T4t * T4u);
Chris@42 906 ci[WS(rs, 17)] = FMA(T4t, T4w, T4v * T4u);
Chris@42 907 }
Chris@42 908 {
Chris@42 909 E T4i, T4k, T4h, T4j;
Chris@42 910 T4i = T43 + T3W;
Chris@42 911 T4k = T4c - T4f;
Chris@42 912 T4h = W[16];
Chris@42 913 T4j = W[17];
Chris@42 914 cr[WS(rs, 9)] = FNMS(T4j, T4k, T4h * T4i);
Chris@42 915 ci[WS(rs, 9)] = FMA(T4h, T4k, T4j * T4i);
Chris@42 916 }
Chris@42 917 {
Chris@42 918 E T4o, T4s, T4l, T4p;
Chris@42 919 T4o = T4m + T4n;
Chris@42 920 T4s = T4q - T4r;
Chris@42 921 T4l = W[24];
Chris@42 922 T4p = W[25];
Chris@42 923 cr[WS(rs, 13)] = FNMS(T4p, T4s, T4l * T4o);
Chris@42 924 ci[WS(rs, 13)] = FMA(T4l, T4s, T4p * T4o);
Chris@42 925 }
Chris@42 926 }
/* Outputs 2, 14, 18 and 6. */
Chris@42 927 {
Chris@42 928 E T1j, T1o, T1M, T1J, T1B, T1N, TO, T1I;
Chris@42 929 T1j = FNMS(KP951056516, T1i, KP587785252 * T13);
Chris@42 930 T1o = FNMS(KP951056516, T1n, KP587785252 * T1m);
Chris@42 931 T1M = FMA(KP951056516, T1m, KP587785252 * T1n);
Chris@42 932 T1J = FMA(KP951056516, T13, KP587785252 * T1i);
Chris@42 933 {
Chris@42 934 E T1z, T1A, TM, TN;
Chris@42 935 T1z = FNMS(KP250000000, T1y, T1v);
Chris@42 936 T1A = KP559016994 * (T1w - T1x);
Chris@42 937 T1B = T1z - T1A;
Chris@42 938 T1N = T1A + T1z;
Chris@42 939 TM = FNMS(KP250000000, TL, TE);
Chris@42 940 TN = KP559016994 * (TH - TK);
Chris@42 941 TO = TM - TN;
Chris@42 942 T1I = TN + TM;
Chris@42 943 }
Chris@42 944 {
Chris@42 945 E T1k, T1C, TD, T1l;
Chris@42 946 T1k = TO - T1j;
Chris@42 947 T1C = T1o + T1B;
Chris@42 948 TD = W[2];
Chris@42 949 T1l = W[3];
Chris@42 950 cr[WS(rs, 2)] = FNMS(T1l, T1C, TD * T1k);
Chris@42 951 ci[WS(rs, 2)] = FMA(T1l, T1k, TD * T1C);
Chris@42 952 }
Chris@42 953 {
Chris@42 954 E T1Q, T1S, T1P, T1R;
Chris@42 955 T1Q = T1I + T1J;
Chris@42 956 T1S = T1N - T1M;
Chris@42 957 T1P = W[26];
Chris@42 958 T1R = W[27];
Chris@42 959 cr[WS(rs, 14)] = FNMS(T1R, T1S, T1P * T1Q);
Chris@42 960 ci[WS(rs, 14)] = FMA(T1R, T1Q, T1P * T1S);
Chris@42 961 }
Chris@42 962 {
Chris@42 963 E T1E, T1G, T1D, T1F;
Chris@42 964 T1E = TO + T1j;
Chris@42 965 T1G = T1B - T1o;
Chris@42 966 T1D = W[34];
Chris@42 967 T1F = W[35];
Chris@42 968 cr[WS(rs, 18)] = FNMS(T1F, T1G, T1D * T1E);
Chris@42 969 ci[WS(rs, 18)] = FMA(T1F, T1E, T1D * T1G);
Chris@42 970 }
Chris@42 971 {
Chris@42 972 E T1K, T1O, T1H, T1L;
Chris@42 973 T1K = T1I - T1J;
Chris@42 974 T1O = T1M + T1N;
Chris@42 975 T1H = W[10];
Chris@42 976 T1L = W[11];
Chris@42 977 cr[WS(rs, 6)] = FNMS(T1L, T1O, T1H * T1K);
Chris@42 978 ci[WS(rs, 6)] = FMA(T1L, T1K, T1H * T1O);
Chris@42 979 }
Chris@42 980 }
/* Outputs 3, 19, 7 and 11. */
Chris@42 981 {
Chris@42 982 E T2Q, T3p, T3B, T3x, T3m, T3A, T3b, T3w;
Chris@42 983 T2Q = FNMS(KP951056516, T2P, KP587785252 * T2I);
Chris@42 984 T3p = FNMS(KP951056516, T3o, KP587785252 * T3n);
Chris@42 985 T3B = FMA(KP951056516, T3n, KP587785252 * T3o);
Chris@42 986 T3x = FMA(KP951056516, T2I, KP587785252 * T2P);
Chris@42 987 {
Chris@42 988 E T3k, T3l, T39, T3a;
Chris@42 989 T3k = FNMS(KP250000000, T3j, T3g);
Chris@42 990 T3l = KP559016994 * (T3h - T3i);
Chris@42 991 T3m = T3k - T3l;
Chris@42 992 T3A = T3l + T3k;
Chris@42 993 T39 = FNMS(KP250000000, T38, T2T);
Chris@42 994 T3a = KP559016994 * (T30 - T37);
Chris@42 995 T3b = T39 - T3a;
Chris@42 996 T3w = T3a + T39;
Chris@42 997 }
Chris@42 998 {
Chris@42 999 E T3c, T3q, T2B, T3d;
Chris@42 1000 T3c = T2Q + T3b;
Chris@42 1001 T3q = T3m - T3p;
Chris@42 1002 T2B = W[4];
Chris@42 1003 T3d = W[5];
Chris@42 1004 cr[WS(rs, 3)] = FNMS(T3d, T3q, T2B * T3c);
Chris@42 1005 ci[WS(rs, 3)] = FMA(T2B, T3q, T3d * T3c);
Chris@42 1006 }
Chris@42 1007 {
Chris@42 1008 E T3E, T3G, T3D, T3F;
Chris@42 1009 T3E = T3x + T3w;
Chris@42 1010 T3G = T3A - T3B;
Chris@42 1011 T3D = W[36];
Chris@42 1012 T3F = W[37];
Chris@42 1013 cr[WS(rs, 19)] = FNMS(T3F, T3G, T3D * T3E);
Chris@42 1014 ci[WS(rs, 19)] = FMA(T3D, T3G, T3F * T3E);
Chris@42 1015 }
Chris@42 1016 {
Chris@42 1017 E T3s, T3u, T3r, T3t;
Chris@42 1018 T3s = T3b - T2Q;
Chris@42 1019 T3u = T3m + T3p;
Chris@42 1020 T3r = W[12];
Chris@42 1021 T3t = W[13];
Chris@42 1022 cr[WS(rs, 7)] = FNMS(T3t, T3u, T3r * T3s);
Chris@42 1023 ci[WS(rs, 7)] = FMA(T3r, T3u, T3t * T3s);
Chris@42 1024 }
Chris@42 1025 {
Chris@42 1026 E T3y, T3C, T3v, T3z;
Chris@42 1027 T3y = T3w - T3x;
Chris@42 1028 T3C = T3A + T3B;
Chris@42 1029 T3v = W[20];
Chris@42 1030 T3z = W[21];
Chris@42 1031 cr[WS(rs, 11)] = FNMS(T3z, T3C, T3v * T3y);
Chris@42 1032 ci[WS(rs, 11)] = FMA(T3v, T3C, T3z * T3y);
Chris@42 1033 }
Chris@42 1034 }
Chris@42 1035 }
Chris@42 1036 }
Chris@42 1037 }
Chris@42 1038
/* Twiddle instruction table referenced by desc (non-FMA branch).
 * NOTE(review): presumably requests the full radix-20 twiddle set, which would
 * match the 38 W values hb_20 consumes per pass -- confirm against the
 * tw_instr / TW_FULL / TW_NEXT definitions in the project headers. */
Chris@42 1039 static const tw_instr twinstr[] = {
Chris@42 1040 {TW_FULL, 1, 20},
Chris@42 1041 {TW_NEXT, 1, 0}
Chris@42 1042 };
Chris@42 1043
/* Codelet descriptor for the non-FMA variant: size 20, name "hb_20", the
 * twinstr table and GENUS. The trailing {184, 62, 62, 0} matches the
 * "184 additions, 62 multiplications, 62 fused multiply/add" counts in the
 * generated comment for this variant; the meaning of the fourth entry is not
 * visible here. */
Chris@42 1044 static const hc2hc_desc desc = { 20, "hb_20", twinstr, &GENUS, {184, 62, 62, 0} };
Chris@42 1045
/* Registration entry point (non-FMA branch): hands hb_20 and its descriptor
 * to planner p through X(khc2hc_register). */
Chris@42 1046 void X(codelet_hb_20) (planner *p) {
Chris@42 1047 X(khc2hc_register) (p, hb_20, &desc);
Chris@42 1048 }
Chris@42 1049 #endif /* HAVE_FMA */