annotate src/fftw-3.3.8/rdft/scalar/r2cb/hb_20.c @ 83:ae30d91d2ffe

Replace these with versions built using an older toolset (so as to avoid ABI incompatibilities when linking on Ubuntu 14.04 for packaging purposes)
author Chris Cannam
date Fri, 07 Feb 2020 11:51:13 +0000
parents d0c2a83c1364
children
rev   line source
Chris@82 1 /*
Chris@82 2 * Copyright (c) 2003, 2007-14 Matteo Frigo
Chris@82 3 * Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
Chris@82 4 *
Chris@82 5 * This program is free software; you can redistribute it and/or modify
Chris@82 6 * it under the terms of the GNU General Public License as published by
Chris@82 7 * the Free Software Foundation; either version 2 of the License, or
Chris@82 8 * (at your option) any later version.
Chris@82 9 *
Chris@82 10 * This program is distributed in the hope that it will be useful,
Chris@82 11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
Chris@82 12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
Chris@82 13 * GNU General Public License for more details.
Chris@82 14 *
Chris@82 15 * You should have received a copy of the GNU General Public License
Chris@82 16 * along with this program; if not, write to the Free Software
Chris@82 17 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
Chris@82 18 *
Chris@82 19 */
Chris@82 20
Chris@82 21 /* This file was automatically generated --- DO NOT EDIT */
Chris@82 22 /* Generated on Thu May 24 08:07:36 EDT 2018 */
Chris@82 23
Chris@82 24 #include "rdft/codelet-rdft.h"
Chris@82 25
Chris@82 26 #if defined(ARCH_PREFERS_FMA) || defined(ISA_EXTENSION_PREFERS_FMA)
Chris@82 27
Chris@82 28 /* Generated by: ../../../genfft/gen_hc2hc.native -fma -compact -variables 4 -pipeline-latency 4 -sign 1 -n 20 -dif -name hb_20 -include rdft/scalar/hb.h */
Chris@82 29
Chris@82 30 /*
Chris@82 31 * This function contains 246 FP additions, 148 FP multiplications,
Chris@82 32 * (or, 136 additions, 38 multiplications, 110 fused multiply/add),
Chris@82 33 * 91 stack variables, 4 constants, and 80 memory accesses
Chris@82 34 */
Chris@82 35 #include "rdft/scalar/hb.h"
Chris@82 36
/*
 * hb_20 (FMA variant): size-20 hc2hc codelet emitted by genfft with
 * "-fma -sign 1 -n 20 -dif" (see the "Generated by" comment above).
 * This copy is compiled only when ARCH_PREFERS_FMA or
 * ISA_EXTENSION_PREFERS_FMA is defined.
 *
 * cr, ci : data arrays, read and then overwritten in place; element k is
 *          addressed as cr[WS(rs, k)] / ci[WS(rs, k)] for k = 0..19.
 * W      : twiddle table; each loop iteration reads entries W[0..37].
 * rs     : stride passed to WS() for element addressing.
 * mb, me : the loop runs for m = mb, mb + 1, ..., me - 1.
 * ms     : per-iteration step; cr moves by +ms and ci by -ms.
 *
 * DK, E, R, INT, stride, WS, FMA, FNMS and MAKE_VOLATILE_STRIDE are not
 * defined in this file; they come from the included headers.
 * NOTE(review): FMA(a, b, c) / FNMS(a, b, c) are presumably a*b + c and
 * c - a*b -- confirm against the kernel headers.
 */
Chris@82 37 static void hb_20(R *cr, R *ci, const R *W, stride rs, INT mb, INT me, INT ms)
Chris@82 38 {
/* Numeric constants: 0.951056516... ~ sin(2*pi/5), 0.559016994... ~ sqrt(5)/4,
 * 0.25, and 0.618033988... ~ (sqrt(5) - 1)/2. */
Chris@82 39 DK(KP951056516, +0.951056516295153572116439333379382143405698634);
Chris@82 40 DK(KP559016994, +0.559016994374947424102293417182819058860154590);
Chris@82 41 DK(KP250000000, +0.250000000000000000000000000000000000000000000);
Chris@82 42 DK(KP618033988, +0.618033988749894848204586834365638117720309180);
Chris@82 43 {
Chris@82 44 INT m;
/* W is first advanced by (mb - 1) * 38 entries and then by 38 entries per
 * iteration, so every pass sees its own 38 twiddle values at W[0..37]. */
Chris@82 45 for (m = mb, W = W + ((mb - 1) * 38); m < me; m = m + 1, cr = cr + ms, ci = ci - ms, W = W + 38, MAKE_VOLATILE_STRIDE(40, rs)) {
Chris@82 46 E T7, T4e, T4z, TE, T1t, T2W, T3z, T2l, T13, T3G, T3H, T1i, T2g, T4H, T4G;
Chris@82 47 E T2d, T1B, T4u, T4r, T1A, T2s, T3l, T2t, T3s, T2m, T2n, T2o, T1u, T1v, T1w;
Chris@82 48 E TC, T29, T3C, T3E, T4l, T4n, TL, TN, T3b, T3d, T4C, T4E;
/* Load phase: all 40 inputs (20 from cr, 20 from ci) are read and combined
 * into sums and differences before any output is stored. */
Chris@82 49 {
Chris@82 50 E T3, T2U, T1s, T2V, T6, T3y, T1p, T3x;
Chris@82 51 {
Chris@82 52 E T1, T2, T1q, T1r;
Chris@82 53 T1 = cr[0];
Chris@82 54 T2 = ci[WS(rs, 9)];
Chris@82 55 T3 = T1 + T2;
Chris@82 56 T2U = T1 - T2;
Chris@82 57 T1q = ci[WS(rs, 14)];
Chris@82 58 T1r = cr[WS(rs, 15)];
Chris@82 59 T1s = T1q - T1r;
Chris@82 60 T2V = T1q + T1r;
Chris@82 61 }
Chris@82 62 {
Chris@82 63 E T4, T5, T1n, T1o;
Chris@82 64 T4 = cr[WS(rs, 5)];
Chris@82 65 T5 = ci[WS(rs, 4)];
Chris@82 66 T6 = T4 + T5;
Chris@82 67 T3y = T4 - T5;
Chris@82 68 T1n = ci[WS(rs, 19)];
Chris@82 69 T1o = cr[WS(rs, 10)];
Chris@82 70 T1p = T1n - T1o;
Chris@82 71 T3x = T1n + T1o;
Chris@82 72 }
Chris@82 73 T7 = T3 + T6;
Chris@82 74 T4e = T2U - T2V;
Chris@82 75 T4z = T3y + T3x;
Chris@82 76 TE = T3 - T6;
Chris@82 77 T1t = T1p - T1s;
Chris@82 78 T2W = T2U + T2V;
Chris@82 79 T3z = T3x - T3y;
Chris@82 80 T2l = T1p + T1s;
Chris@82 81 }
Chris@82 82 {
Chris@82 83 E Te, T4f, T4p, TF, T1a, T2Z, T3o, T2b, TA, T4j, T4t, TJ, T12, T39, T3k;
Chris@82 84 E T2f, Tl, T4g, T4q, TG, T1h, T32, T3r, T2c, Tt, T4i, T4s, TI, TV, T36;
Chris@82 85 E T3h, T2e;
Chris@82 86 {
Chris@82 87 E Ta, T2X, T19, T2Y, Td, T3n, T16, T3m;
Chris@82 88 {
Chris@82 89 E T8, T9, T17, T18;
Chris@82 90 T8 = cr[WS(rs, 4)];
Chris@82 91 T9 = ci[WS(rs, 5)];
Chris@82 92 Ta = T8 + T9;
Chris@82 93 T2X = T8 - T9;
Chris@82 94 T17 = ci[WS(rs, 10)];
Chris@82 95 T18 = cr[WS(rs, 19)];
Chris@82 96 T19 = T17 - T18;
Chris@82 97 T2Y = T17 + T18;
Chris@82 98 }
Chris@82 99 {
Chris@82 100 E Tb, Tc, T14, T15;
Chris@82 101 Tb = cr[WS(rs, 9)];
Chris@82 102 Tc = ci[0];
Chris@82 103 Td = Tb + Tc;
Chris@82 104 T3n = Tb - Tc;
Chris@82 105 T14 = ci[WS(rs, 15)];
Chris@82 106 T15 = cr[WS(rs, 14)];
Chris@82 107 T16 = T14 - T15;
Chris@82 108 T3m = T14 + T15;
Chris@82 109 }
Chris@82 110 Te = Ta + Td;
Chris@82 111 T4f = T2X - T2Y;
Chris@82 112 T4p = T3n + T3m;
Chris@82 113 TF = Ta - Td;
Chris@82 114 T1a = T16 - T19;
Chris@82 115 T2Z = T2X + T2Y;
Chris@82 116 T3o = T3m - T3n;
Chris@82 117 T2b = T16 + T19;
Chris@82 118 }
Chris@82 119 {
Chris@82 120 E Tw, T37, Tz, T3i, TY, T3j, T11, T38;
Chris@82 121 {
Chris@82 122 E Tu, Tv, Tx, Ty;
Chris@82 123 Tu = ci[WS(rs, 7)];
Chris@82 124 Tv = cr[WS(rs, 2)];
Chris@82 125 Tw = Tu + Tv;
Chris@82 126 T37 = Tu - Tv;
Chris@82 127 Tx = ci[WS(rs, 2)];
Chris@82 128 Ty = cr[WS(rs, 7)];
Chris@82 129 Tz = Tx + Ty;
Chris@82 130 T3i = Tx - Ty;
Chris@82 131 }
Chris@82 132 {
Chris@82 133 E TW, TX, TZ, T10;
Chris@82 134 TW = ci[WS(rs, 17)];
Chris@82 135 TX = cr[WS(rs, 12)];
Chris@82 136 TY = TW - TX;
Chris@82 137 T3j = TW + TX;
Chris@82 138 TZ = ci[WS(rs, 12)];
Chris@82 139 T10 = cr[WS(rs, 17)];
Chris@82 140 T11 = TZ - T10;
Chris@82 141 T38 = TZ + T10;
Chris@82 142 }
Chris@82 143 TA = Tw + Tz;
Chris@82 144 T4j = T37 + T38;
Chris@82 145 T4t = T3i - T3j;
Chris@82 146 TJ = Tw - Tz;
Chris@82 147 T12 = TY - T11;
Chris@82 148 T39 = T37 - T38;
Chris@82 149 T3k = T3i + T3j;
Chris@82 150 T2f = TY + T11;
Chris@82 151 }
Chris@82 152 {
Chris@82 153 E Th, T30, T1g, T31, Tk, T3p, T1d, T3q;
Chris@82 154 {
Chris@82 155 E Tf, Tg, T1e, T1f;
Chris@82 156 Tf = ci[WS(rs, 3)];
Chris@82 157 Tg = cr[WS(rs, 6)];
Chris@82 158 Th = Tf + Tg;
Chris@82 159 T30 = Tf - Tg;
Chris@82 160 T1e = ci[WS(rs, 18)];
Chris@82 161 T1f = cr[WS(rs, 11)];
Chris@82 162 T1g = T1e - T1f;
Chris@82 163 T31 = T1e + T1f;
Chris@82 164 }
Chris@82 165 {
Chris@82 166 E Ti, Tj, T1b, T1c;
Chris@82 167 Ti = cr[WS(rs, 1)];
Chris@82 168 Tj = ci[WS(rs, 8)];
Chris@82 169 Tk = Ti + Tj;
Chris@82 170 T3p = Ti - Tj;
Chris@82 171 T1b = ci[WS(rs, 13)];
Chris@82 172 T1c = cr[WS(rs, 16)];
Chris@82 173 T1d = T1b - T1c;
Chris@82 174 T3q = T1b + T1c;
Chris@82 175 }
Chris@82 176 Tl = Th + Tk;
Chris@82 177 T4g = T30 - T31;
Chris@82 178 T4q = T3p - T3q;
Chris@82 179 TG = Th - Tk;
Chris@82 180 T1h = T1d - T1g;
Chris@82 181 T32 = T30 + T31;
Chris@82 182 T3r = T3p + T3q;
Chris@82 183 T2c = T1d + T1g;
Chris@82 184 }
Chris@82 185 {
Chris@82 186 E Tp, T34, TU, T35, Ts, T3g, TR, T3f;
Chris@82 187 {
Chris@82 188 E Tn, To, TS, TT;
Chris@82 189 Tn = cr[WS(rs, 8)];
Chris@82 190 To = ci[WS(rs, 1)];
Chris@82 191 Tp = Tn + To;
Chris@82 192 T34 = Tn - To;
Chris@82 193 TS = ci[WS(rs, 16)];
Chris@82 194 TT = cr[WS(rs, 13)];
Chris@82 195 TU = TS - TT;
Chris@82 196 T35 = TS + TT;
Chris@82 197 }
Chris@82 198 {
Chris@82 199 E Tq, Tr, TP, TQ;
Chris@82 200 Tq = ci[WS(rs, 6)];
Chris@82 201 Tr = cr[WS(rs, 3)];
Chris@82 202 Ts = Tq + Tr;
Chris@82 203 T3g = Tq - Tr;
Chris@82 204 TP = ci[WS(rs, 11)];
Chris@82 205 TQ = cr[WS(rs, 18)];
Chris@82 206 TR = TP - TQ;
Chris@82 207 T3f = TP + TQ;
Chris@82 208 }
Chris@82 209 Tt = Tp + Ts;
Chris@82 210 T4i = T34 + T35;
Chris@82 211 T4s = T3g + T3f;
Chris@82 212 TI = Tp - Ts;
Chris@82 213 TV = TR - TU;
Chris@82 214 T36 = T34 - T35;
Chris@82 215 T3h = T3f - T3g;
Chris@82 216 T2e = TR + TU;
Chris@82 217 }
Chris@82 218 T13 = TV - T12;
Chris@82 219 T3G = T36 - T39;
Chris@82 220 T3H = T2Z - T32;
Chris@82 221 T1i = T1a - T1h;
Chris@82 222 T2g = T2e - T2f;
Chris@82 223 T4H = T4i - T4j;
Chris@82 224 T4G = T4f - T4g;
Chris@82 225 T2d = T2b - T2c;
Chris@82 226 T1B = TF - TG;
Chris@82 227 T4u = T4s - T4t;
Chris@82 228 T4r = T4p - T4q;
Chris@82 229 T1A = TI - TJ;
Chris@82 230 T2s = Te - Tl;
Chris@82 231 T3l = T3h + T3k;
Chris@82 232 T2t = Tt - TA;
Chris@82 233 T3s = T3o + T3r;
Chris@82 234 T2m = T2b + T2c;
Chris@82 235 T2n = T2e + T2f;
Chris@82 236 T2o = T2m + T2n;
Chris@82 237 T1u = T1a + T1h;
Chris@82 238 T1v = TV + T12;
Chris@82 239 T1w = T1u + T1v;
Chris@82 240 {
Chris@82 241 E Tm, TB, TH, TK;
Chris@82 242 Tm = Te + Tl;
Chris@82 243 TB = Tt + TA;
Chris@82 244 TC = Tm + TB;
Chris@82 245 T29 = Tm - TB;
Chris@82 246 {
Chris@82 247 E T3A, T3B, T4h, T4k;
Chris@82 248 T3A = T3o - T3r;
Chris@82 249 T3B = T3h - T3k;
Chris@82 250 T3C = T3A + T3B;
Chris@82 251 T3E = T3A - T3B;
Chris@82 252 T4h = T4f + T4g;
Chris@82 253 T4k = T4i + T4j;
Chris@82 254 T4l = T4h + T4k;
Chris@82 255 T4n = T4h - T4k;
Chris@82 256 }
Chris@82 257 TH = TF + TG;
Chris@82 258 TK = TI + TJ;
Chris@82 259 TL = TH + TK;
Chris@82 260 TN = TH - TK;
Chris@82 261 {
Chris@82 262 E T33, T3a, T4A, T4B;
Chris@82 263 T33 = T2Z + T32;
Chris@82 264 T3a = T36 + T39;
Chris@82 265 T3b = T33 + T3a;
Chris@82 266 T3d = T33 - T3a;
Chris@82 267 T4A = T4p + T4q;
Chris@82 268 T4B = T4s + T4t;
Chris@82 269 T4C = T4A + T4B;
Chris@82 270 T4E = T4A - T4B;
Chris@82 271 }
Chris@82 272 }
Chris@82 273 }
/* Store phase. The index-0 pair is written without any twiddle factor. */
Chris@82 274 cr[0] = T7 + TC;
Chris@82 275 ci[0] = T2l + T2o;
/* Every remaining output pair k = 1..19 is combined with the twiddle pair
 * W[2k - 2], W[2k - 1] before being stored. Outputs 10, 5 and 15 follow. */
Chris@82 276 {
Chris@82 277 E T25, T21, T23, T24, T26, T22;
Chris@82 278 T25 = T1t + T1w;
Chris@82 279 T22 = TE + TL;
Chris@82 280 T21 = W[18];
Chris@82 281 T23 = T21 * T22;
Chris@82 282 T24 = W[19];
Chris@82 283 T26 = T24 * T22;
Chris@82 284 cr[WS(rs, 10)] = FNMS(T24, T25, T23);
Chris@82 285 ci[WS(rs, 10)] = FMA(T21, T25, T26);
Chris@82 286 }
Chris@82 287 {
Chris@82 288 E T58, T5b, T59, T5c, T57, T5a;
Chris@82 289 T58 = T4e + T4l;
Chris@82 290 T5b = T4z + T4C;
Chris@82 291 T57 = W[8];
Chris@82 292 T59 = T57 * T58;
Chris@82 293 T5c = T57 * T5b;
Chris@82 294 T5a = W[9];
Chris@82 295 cr[WS(rs, 5)] = FNMS(T5a, T5b, T59);
Chris@82 296 ci[WS(rs, 5)] = FMA(T5a, T58, T5c);
Chris@82 297 }
Chris@82 298 {
Chris@82 299 E T48, T4b, T49, T4c, T47, T4a;
Chris@82 300 T48 = T2W + T3b;
Chris@82 301 T4b = T3z + T3C;
Chris@82 302 T47 = W[28];
Chris@82 303 T49 = T47 * T48;
Chris@82 304 T4c = T47 * T4b;
Chris@82 305 T4a = W[29];
Chris@82 306 cr[WS(rs, 15)] = FNMS(T4a, T4b, T49);
Chris@82 307 ci[WS(rs, 15)] = FMA(T4a, T48, T4c);
Chris@82 308 }
/* Outputs 3, 19, 7 and 11. */
Chris@82 309 {
Chris@82 310 E T3u, T42, T3M, T3U, T3J, T45, T3P, T3Z;
Chris@82 311 {
Chris@82 312 E T3t, T3T, T3e, T3S, T3c;
Chris@82 313 T3t = FNMS(KP618033988, T3s, T3l);
Chris@82 314 T3T = FMA(KP618033988, T3l, T3s);
Chris@82 315 T3c = FNMS(KP250000000, T3b, T2W);
Chris@82 316 T3e = FNMS(KP559016994, T3d, T3c);
Chris@82 317 T3S = FMA(KP559016994, T3d, T3c);
Chris@82 318 T3u = FNMS(KP951056516, T3t, T3e);
Chris@82 319 T42 = FMA(KP951056516, T3T, T3S);
Chris@82 320 T3M = FMA(KP951056516, T3t, T3e);
Chris@82 321 T3U = FNMS(KP951056516, T3T, T3S);
Chris@82 322 }
Chris@82 323 {
Chris@82 324 E T3I, T3Y, T3F, T3X, T3D;
Chris@82 325 T3I = FNMS(KP618033988, T3H, T3G);
Chris@82 326 T3Y = FMA(KP618033988, T3G, T3H);
Chris@82 327 T3D = FNMS(KP250000000, T3C, T3z);
Chris@82 328 T3F = FNMS(KP559016994, T3E, T3D);
Chris@82 329 T3X = FMA(KP559016994, T3E, T3D);
Chris@82 330 T3J = FMA(KP951056516, T3I, T3F);
Chris@82 331 T45 = FNMS(KP951056516, T3Y, T3X);
Chris@82 332 T3P = FNMS(KP951056516, T3I, T3F);
Chris@82 333 T3Z = FMA(KP951056516, T3Y, T3X);
Chris@82 334 }
Chris@82 335 {
Chris@82 336 E T3v, T3K, T2T, T3w;
Chris@82 337 T2T = W[4];
Chris@82 338 T3v = T2T * T3u;
Chris@82 339 T3K = T2T * T3J;
Chris@82 340 T3w = W[5];
Chris@82 341 cr[WS(rs, 3)] = FNMS(T3w, T3J, T3v);
Chris@82 342 ci[WS(rs, 3)] = FMA(T3w, T3u, T3K);
Chris@82 343 }
Chris@82 344 {
Chris@82 345 E T43, T46, T41, T44;
Chris@82 346 T41 = W[36];
Chris@82 347 T43 = T41 * T42;
Chris@82 348 T46 = T41 * T45;
Chris@82 349 T44 = W[37];
Chris@82 350 cr[WS(rs, 19)] = FNMS(T44, T45, T43);
Chris@82 351 ci[WS(rs, 19)] = FMA(T44, T42, T46);
Chris@82 352 }
Chris@82 353 {
Chris@82 354 E T3N, T3Q, T3L, T3O;
Chris@82 355 T3L = W[12];
Chris@82 356 T3N = T3L * T3M;
Chris@82 357 T3Q = T3L * T3P;
Chris@82 358 T3O = W[13];
Chris@82 359 cr[WS(rs, 7)] = FNMS(T3O, T3P, T3N);
Chris@82 360 ci[WS(rs, 7)] = FMA(T3O, T3M, T3Q);
Chris@82 361 }
Chris@82 362 {
Chris@82 363 E T3V, T40, T3R, T3W;
Chris@82 364 T3R = W[20];
Chris@82 365 T3V = T3R * T3U;
Chris@82 366 T40 = T3R * T3Z;
Chris@82 367 T3W = W[21];
Chris@82 368 cr[WS(rs, 11)] = FNMS(T3W, T3Z, T3V);
Chris@82 369 ci[WS(rs, 11)] = FMA(T3W, T3U, T40);
Chris@82 370 }
Chris@82 371 }
/* Outputs 1, 17, 9 and 13. */
Chris@82 372 {
Chris@82 373 E T4w, T52, T4M, T4U, T4J, T55, T4P, T4Z;
Chris@82 374 {
Chris@82 375 E T4v, T4T, T4o, T4S, T4m;
Chris@82 376 T4v = FMA(KP618033988, T4u, T4r);
Chris@82 377 T4T = FNMS(KP618033988, T4r, T4u);
Chris@82 378 T4m = FNMS(KP250000000, T4l, T4e);
Chris@82 379 T4o = FMA(KP559016994, T4n, T4m);
Chris@82 380 T4S = FNMS(KP559016994, T4n, T4m);
Chris@82 381 T4w = FNMS(KP951056516, T4v, T4o);
Chris@82 382 T52 = FMA(KP951056516, T4T, T4S);
Chris@82 383 T4M = FMA(KP951056516, T4v, T4o);
Chris@82 384 T4U = FNMS(KP951056516, T4T, T4S);
Chris@82 385 }
Chris@82 386 {
Chris@82 387 E T4I, T4Y, T4F, T4X, T4D;
Chris@82 388 T4I = FMA(KP618033988, T4H, T4G);
Chris@82 389 T4Y = FNMS(KP618033988, T4G, T4H);
Chris@82 390 T4D = FNMS(KP250000000, T4C, T4z);
Chris@82 391 T4F = FMA(KP559016994, T4E, T4D);
Chris@82 392 T4X = FNMS(KP559016994, T4E, T4D);
Chris@82 393 T4J = FMA(KP951056516, T4I, T4F);
Chris@82 394 T55 = FNMS(KP951056516, T4Y, T4X);
Chris@82 395 T4P = FNMS(KP951056516, T4I, T4F);
Chris@82 396 T4Z = FMA(KP951056516, T4Y, T4X);
Chris@82 397 }
Chris@82 398 {
Chris@82 399 E T4x, T4K, T4d, T4y;
Chris@82 400 T4d = W[0];
Chris@82 401 T4x = T4d * T4w;
Chris@82 402 T4K = T4d * T4J;
Chris@82 403 T4y = W[1];
Chris@82 404 cr[WS(rs, 1)] = FNMS(T4y, T4J, T4x);
Chris@82 405 ci[WS(rs, 1)] = FMA(T4y, T4w, T4K);
Chris@82 406 }
Chris@82 407 {
Chris@82 408 E T53, T56, T51, T54;
Chris@82 409 T51 = W[32];
Chris@82 410 T53 = T51 * T52;
Chris@82 411 T56 = T51 * T55;
Chris@82 412 T54 = W[33];
Chris@82 413 cr[WS(rs, 17)] = FNMS(T54, T55, T53);
Chris@82 414 ci[WS(rs, 17)] = FMA(T54, T52, T56);
Chris@82 415 }
Chris@82 416 {
Chris@82 417 E T4N, T4Q, T4L, T4O;
Chris@82 418 T4L = W[16];
Chris@82 419 T4N = T4L * T4M;
Chris@82 420 T4Q = T4L * T4P;
Chris@82 421 T4O = W[17];
Chris@82 422 cr[WS(rs, 9)] = FNMS(T4O, T4P, T4N);
Chris@82 423 ci[WS(rs, 9)] = FMA(T4O, T4M, T4Q);
Chris@82 424 }
Chris@82 425 {
Chris@82 426 E T4V, T50, T4R, T4W;
Chris@82 427 T4R = W[24];
Chris@82 428 T4V = T4R * T4U;
Chris@82 429 T50 = T4R * T4Z;
Chris@82 430 T4W = W[25];
Chris@82 431 cr[WS(rs, 13)] = FNMS(T4W, T4Z, T4V);
Chris@82 432 ci[WS(rs, 13)] = FMA(T4W, T4U, T50);
Chris@82 433 }
Chris@82 434 }
/* Outputs 4, 12, 16 and 8. */
Chris@82 435 {
Chris@82 436 E T2u, T2K, T2r, T2J, T2i, T2O, T2y, T2G, T2p, T2q;
Chris@82 437 T2u = FMA(KP618033988, T2t, T2s);
Chris@82 438 T2K = FNMS(KP618033988, T2s, T2t);
Chris@82 439 T2p = FNMS(KP250000000, T2o, T2l);
Chris@82 440 T2q = T2m - T2n;
Chris@82 441 T2r = FMA(KP559016994, T2q, T2p);
Chris@82 442 T2J = FNMS(KP559016994, T2q, T2p);
Chris@82 443 {
Chris@82 444 E T2h, T2F, T2a, T2E, T28;
Chris@82 445 T2h = FMA(KP618033988, T2g, T2d);
Chris@82 446 T2F = FNMS(KP618033988, T2d, T2g);
Chris@82 447 T28 = FNMS(KP250000000, TC, T7);
Chris@82 448 T2a = FMA(KP559016994, T29, T28);
Chris@82 449 T2E = FNMS(KP559016994, T29, T28);
Chris@82 450 T2i = FMA(KP951056516, T2h, T2a);
Chris@82 451 T2O = FMA(KP951056516, T2F, T2E);
Chris@82 452 T2y = FNMS(KP951056516, T2h, T2a);
Chris@82 453 T2G = FNMS(KP951056516, T2F, T2E);
Chris@82 454 }
Chris@82 455 {
Chris@82 456 E T2v, T2k, T2w, T27, T2j;
Chris@82 457 T2v = FNMS(KP951056516, T2u, T2r);
Chris@82 458 T2k = W[7];
Chris@82 459 T2w = T2k * T2i;
Chris@82 460 T27 = W[6];
Chris@82 461 T2j = T27 * T2i;
Chris@82 462 cr[WS(rs, 4)] = FNMS(T2k, T2v, T2j);
Chris@82 463 ci[WS(rs, 4)] = FMA(T27, T2v, T2w);
Chris@82 464 }
Chris@82 465 {
Chris@82 466 E T2R, T2Q, T2S, T2N, T2P;
Chris@82 467 T2R = FNMS(KP951056516, T2K, T2J);
Chris@82 468 T2Q = W[23];
Chris@82 469 T2S = T2Q * T2O;
Chris@82 470 T2N = W[22];
Chris@82 471 T2P = T2N * T2O;
Chris@82 472 cr[WS(rs, 12)] = FNMS(T2Q, T2R, T2P);
Chris@82 473 ci[WS(rs, 12)] = FMA(T2N, T2R, T2S);
Chris@82 474 }
Chris@82 475 {
Chris@82 476 E T2B, T2A, T2C, T2x, T2z;
Chris@82 477 T2B = FMA(KP951056516, T2u, T2r);
Chris@82 478 T2A = W[31];
Chris@82 479 T2C = T2A * T2y;
Chris@82 480 T2x = W[30];
Chris@82 481 T2z = T2x * T2y;
Chris@82 482 cr[WS(rs, 16)] = FNMS(T2A, T2B, T2z);
Chris@82 483 ci[WS(rs, 16)] = FMA(T2x, T2B, T2C);
Chris@82 484 }
Chris@82 485 {
Chris@82 486 E T2L, T2I, T2M, T2D, T2H;
Chris@82 487 T2L = FMA(KP951056516, T2K, T2J);
Chris@82 488 T2I = W[15];
Chris@82 489 T2M = T2I * T2G;
Chris@82 490 T2D = W[14];
Chris@82 491 T2H = T2D * T2G;
Chris@82 492 cr[WS(rs, 8)] = FNMS(T2I, T2L, T2H);
Chris@82 493 ci[WS(rs, 8)] = FMA(T2D, T2L, T2M);
Chris@82 494 }
Chris@82 495 }
/* Outputs 2, 14, 18 and 6. */
Chris@82 496 {
Chris@82 497 E T1C, T1S, T1z, T1R, T1k, T1W, T1G, T1O, T1x, T1y;
Chris@82 498 T1C = FNMS(KP618033988, T1B, T1A);
Chris@82 499 T1S = FMA(KP618033988, T1A, T1B);
Chris@82 500 T1x = FNMS(KP250000000, T1w, T1t);
Chris@82 501 T1y = T1u - T1v;
Chris@82 502 T1z = FNMS(KP559016994, T1y, T1x);
Chris@82 503 T1R = FMA(KP559016994, T1y, T1x);
Chris@82 504 {
Chris@82 505 E T1j, T1N, TO, T1M, TM;
Chris@82 506 T1j = FNMS(KP618033988, T1i, T13);
Chris@82 507 T1N = FMA(KP618033988, T13, T1i);
Chris@82 508 TM = FNMS(KP250000000, TL, TE);
Chris@82 509 TO = FNMS(KP559016994, TN, TM);
Chris@82 510 T1M = FMA(KP559016994, TN, TM);
Chris@82 511 T1k = FMA(KP951056516, T1j, TO);
Chris@82 512 T1W = FMA(KP951056516, T1N, T1M);
Chris@82 513 T1G = FNMS(KP951056516, T1j, TO);
Chris@82 514 T1O = FNMS(KP951056516, T1N, T1M);
Chris@82 515 }
Chris@82 516 {
Chris@82 517 E T1D, T1m, T1E, TD, T1l;
Chris@82 518 T1D = FNMS(KP951056516, T1C, T1z);
Chris@82 519 T1m = W[3];
Chris@82 520 T1E = T1m * T1k;
Chris@82 521 TD = W[2];
Chris@82 522 T1l = TD * T1k;
Chris@82 523 cr[WS(rs, 2)] = FNMS(T1m, T1D, T1l);
Chris@82 524 ci[WS(rs, 2)] = FMA(TD, T1D, T1E);
Chris@82 525 }
Chris@82 526 {
Chris@82 527 E T1Z, T1Y, T20, T1V, T1X;
Chris@82 528 T1Z = FNMS(KP951056516, T1S, T1R);
Chris@82 529 T1Y = W[27];
Chris@82 530 T20 = T1Y * T1W;
Chris@82 531 T1V = W[26];
Chris@82 532 T1X = T1V * T1W;
Chris@82 533 cr[WS(rs, 14)] = FNMS(T1Y, T1Z, T1X);
Chris@82 534 ci[WS(rs, 14)] = FMA(T1V, T1Z, T20);
Chris@82 535 }
Chris@82 536 {
Chris@82 537 E T1J, T1I, T1K, T1F, T1H;
Chris@82 538 T1J = FMA(KP951056516, T1C, T1z);
Chris@82 539 T1I = W[35];
Chris@82 540 T1K = T1I * T1G;
Chris@82 541 T1F = W[34];
Chris@82 542 T1H = T1F * T1G;
Chris@82 543 cr[WS(rs, 18)] = FNMS(T1I, T1J, T1H);
Chris@82 544 ci[WS(rs, 18)] = FMA(T1F, T1J, T1K);
Chris@82 545 }
Chris@82 546 {
Chris@82 547 E T1T, T1Q, T1U, T1L, T1P;
Chris@82 548 T1T = FMA(KP951056516, T1S, T1R);
Chris@82 549 T1Q = W[11];
Chris@82 550 T1U = T1Q * T1O;
Chris@82 551 T1L = W[10];
Chris@82 552 T1P = T1L * T1O;
Chris@82 553 cr[WS(rs, 6)] = FNMS(T1Q, T1T, T1P);
Chris@82 554 ci[WS(rs, 6)] = FMA(T1L, T1T, T1U);
Chris@82 555 }
Chris@82 556 }
Chris@82 557 }
Chris@82 558 }
Chris@82 559 }
Chris@82 560
/*
 * Twiddle instruction table handed to the codelet descriptor `desc`.
 * The TW_FULL entry carries the transform size 20; the TW_NEXT entry is
 * the last element of the table. The tw_instr type and the TW_* codes
 * are defined in the included headers, not in this file.
 */
Chris@82 561 static const tw_instr twinstr[] = {
Chris@82 562 {TW_FULL, 1, 20},
Chris@82 563 {TW_NEXT, 1, 0}
Chris@82 564 };
Chris@82 565
/*
 * Descriptor for the FMA variant: size 20, name "hb_20", the twiddle table
 * above, and GENUS (not defined in this file). The trailing
 * {136, 38, 110, 0} repeats the operation counts from the generator comment
 * above: 136 additions, 38 multiplications, 110 fused multiply/adds.
 */
Chris@82 566 static const hc2hc_desc desc = { 20, "hb_20", twinstr, &GENUS, {136, 38, 110, 0} };
Chris@82 567
/*
 * Registration entry point: passes hb_20 and its descriptor to
 * X(khc2hc_register) for planner p. X() is a macro from the included
 * headers (presumably a symbol-name wrapper -- confirm there).
 */
Chris@82 568 void X(codelet_hb_20) (planner *p) {
Chris@82 569 X(khc2hc_register) (p, hb_20, &desc);
Chris@82 570 }
Chris@82 571 #else
Chris@82 572
Chris@82 573 /* Generated by: ../../../genfft/gen_hc2hc.native -compact -variables 4 -pipeline-latency 4 -sign 1 -n 20 -dif -name hb_20 -include rdft/scalar/hb.h */
Chris@82 574
Chris@82 575 /*
Chris@82 576 * This function contains 246 FP additions, 124 FP multiplications,
Chris@82 577 * (or, 184 additions, 62 multiplications, 62 fused multiply/add),
Chris@82 578 * 97 stack variables, 4 constants, and 80 memory accesses
Chris@82 579 */
Chris@82 580 #include "rdft/scalar/hb.h"
Chris@82 581
/*
 * hb_20 (non-FMA variant): size-20 hc2hc codelet emitted by genfft with
 * "-sign 1 -n 20 -dif" (see the "Generated by" comment above). This copy
 * is compiled when neither ARCH_PREFERS_FMA nor ISA_EXTENSION_PREFERS_FMA
 * is defined.
 *
 * cr, ci : data arrays, read and then overwritten in place; element k is
 *          addressed as cr[WS(rs, k)] / ci[WS(rs, k)] for k = 0..19.
 * W      : twiddle table; each loop iteration reads entries W[0..37].
 * rs     : stride passed to WS() for element addressing.
 * mb, me : the loop runs for m = mb, mb + 1, ..., me - 1.
 * ms     : per-iteration step; cr moves by +ms and ci by -ms.
 *
 * DK, E, R, INT, stride, WS, FMA, FNMS and MAKE_VOLATILE_STRIDE are not
 * defined in this file; they come from the included headers.
 * NOTE(review): FMA(a, b, c) / FNMS(a, b, c) are presumably a*b + c and
 * c - a*b -- confirm against the kernel headers.
 */
Chris@82 582 static void hb_20(R *cr, R *ci, const R *W, stride rs, INT mb, INT me, INT ms)
Chris@82 583 {
/* Numeric constants: 0.25, 0.559016994... ~ sqrt(5)/4,
 * 0.587785252... ~ sin(pi/5), and 0.951056516... ~ sin(2*pi/5). */
Chris@82 584 DK(KP250000000, +0.250000000000000000000000000000000000000000000);
Chris@82 585 DK(KP559016994, +0.559016994374947424102293417182819058860154590);
Chris@82 586 DK(KP587785252, +0.587785252292473129168705954639072768597652438);
Chris@82 587 DK(KP951056516, +0.951056516295153572116439333379382143405698634);
Chris@82 588 {
Chris@82 589 INT m;
/* W is first advanced by (mb - 1) * 38 entries and then by 38 entries per
 * iteration, so every pass sees its own 38 twiddle values at W[0..37]. */
Chris@82 590 for (m = mb, W = W + ((mb - 1) * 38); m < me; m = m + 1, cr = cr + ms, ci = ci - ms, W = W + 38, MAKE_VOLATILE_STRIDE(40, rs)) {
Chris@82 591 E T7, T3T, T49, TE, T1v, T2T, T3g, T2d, T13, T3n, T3o, T1i, T26, T4e, T4d;
Chris@82 592 E T23, T1n, T42, T3Z, T1m, T2h, T2I, T2i, T2P, T30, T37, T38, Tm, TB, TC;
Chris@82 593 E T46, T47, T4a, T2a, T2b, T2e, T1w, T1x, T1y, T3O, T3R, T3U, T3h, T3i, T3j;
Chris@82 594 E TH, TK, TL;
/* Load phase: all 40 inputs (20 from cr, 20 from ci) are read and combined
 * into sums and differences before any output is stored. */
Chris@82 595 {
Chris@82 596 E T3, T2R, T1u, T2S, T6, T3f, T1r, T3e;
Chris@82 597 {
Chris@82 598 E T1, T2, T1s, T1t;
Chris@82 599 T1 = cr[0];
Chris@82 600 T2 = ci[WS(rs, 9)];
Chris@82 601 T3 = T1 + T2;
Chris@82 602 T2R = T1 - T2;
Chris@82 603 T1s = ci[WS(rs, 14)];
Chris@82 604 T1t = cr[WS(rs, 15)];
Chris@82 605 T1u = T1s - T1t;
Chris@82 606 T2S = T1s + T1t;
Chris@82 607 }
Chris@82 608 {
Chris@82 609 E T4, T5, T1p, T1q;
Chris@82 610 T4 = cr[WS(rs, 5)];
Chris@82 611 T5 = ci[WS(rs, 4)];
Chris@82 612 T6 = T4 + T5;
Chris@82 613 T3f = T4 - T5;
Chris@82 614 T1p = ci[WS(rs, 19)];
Chris@82 615 T1q = cr[WS(rs, 10)];
Chris@82 616 T1r = T1p - T1q;
Chris@82 617 T3e = T1p + T1q;
Chris@82 618 }
Chris@82 619 T7 = T3 + T6;
Chris@82 620 T3T = T2R - T2S;
Chris@82 621 T49 = T3f + T3e;
Chris@82 622 TE = T3 - T6;
Chris@82 623 T1v = T1r - T1u;
Chris@82 624 T2T = T2R + T2S;
Chris@82 625 T3g = T3e - T3f;
Chris@82 626 T2d = T1r + T1u;
Chris@82 627 }
Chris@82 628 {
Chris@82 629 E Te, T3M, T3X, TF, TV, T2E, T2W, T21, TA, T3Q, T41, TJ, T1h, T2O, T36;
Chris@82 630 E T25, Tl, T3N, T3Y, TG, T12, T2H, T2Z, T22, Tt, T3P, T40, TI, T1a, T2L;
Chris@82 631 E T33, T24;
Chris@82 632 {
Chris@82 633 E Ta, T2U, TU, T2V, Td, T2D, TR, T2C;
Chris@82 634 {
Chris@82 635 E T8, T9, TS, TT;
Chris@82 636 T8 = cr[WS(rs, 4)];
Chris@82 637 T9 = ci[WS(rs, 5)];
Chris@82 638 Ta = T8 + T9;
Chris@82 639 T2U = T8 - T9;
Chris@82 640 TS = ci[WS(rs, 10)];
Chris@82 641 TT = cr[WS(rs, 19)];
Chris@82 642 TU = TS - TT;
Chris@82 643 T2V = TS + TT;
Chris@82 644 }
Chris@82 645 {
Chris@82 646 E Tb, Tc, TP, TQ;
Chris@82 647 Tb = cr[WS(rs, 9)];
Chris@82 648 Tc = ci[0];
Chris@82 649 Td = Tb + Tc;
Chris@82 650 T2D = Tb - Tc;
Chris@82 651 TP = ci[WS(rs, 15)];
Chris@82 652 TQ = cr[WS(rs, 14)];
Chris@82 653 TR = TP - TQ;
Chris@82 654 T2C = TP + TQ;
Chris@82 655 }
Chris@82 656 Te = Ta + Td;
Chris@82 657 T3M = T2U - T2V;
Chris@82 658 T3X = T2D + T2C;
Chris@82 659 TF = Ta - Td;
Chris@82 660 TV = TR - TU;
Chris@82 661 T2E = T2C - T2D;
Chris@82 662 T2W = T2U + T2V;
Chris@82 663 T21 = TR + TU;
Chris@82 664 }
Chris@82 665 {
Chris@82 666 E Tw, T34, Tz, T2M, T1d, T2N, T1g, T35;
Chris@82 667 {
Chris@82 668 E Tu, Tv, Tx, Ty;
Chris@82 669 Tu = ci[WS(rs, 7)];
Chris@82 670 Tv = cr[WS(rs, 2)];
Chris@82 671 Tw = Tu + Tv;
Chris@82 672 T34 = Tu - Tv;
Chris@82 673 Tx = ci[WS(rs, 2)];
Chris@82 674 Ty = cr[WS(rs, 7)];
Chris@82 675 Tz = Tx + Ty;
Chris@82 676 T2M = Tx - Ty;
Chris@82 677 }
Chris@82 678 {
Chris@82 679 E T1b, T1c, T1e, T1f;
Chris@82 680 T1b = ci[WS(rs, 17)];
Chris@82 681 T1c = cr[WS(rs, 12)];
Chris@82 682 T1d = T1b - T1c;
Chris@82 683 T2N = T1b + T1c;
Chris@82 684 T1e = ci[WS(rs, 12)];
Chris@82 685 T1f = cr[WS(rs, 17)];
Chris@82 686 T1g = T1e - T1f;
Chris@82 687 T35 = T1e + T1f;
Chris@82 688 }
Chris@82 689 TA = Tw + Tz;
Chris@82 690 T3Q = T34 + T35;
Chris@82 691 T41 = T2M - T2N;
Chris@82 692 TJ = Tw - Tz;
Chris@82 693 T1h = T1d - T1g;
Chris@82 694 T2O = T2M + T2N;
Chris@82 695 T36 = T34 - T35;
Chris@82 696 T25 = T1d + T1g;
Chris@82 697 }
Chris@82 698 {
Chris@82 699 E Th, T2X, T11, T2Y, Tk, T2F, TY, T2G;
Chris@82 700 {
Chris@82 701 E Tf, Tg, TZ, T10;
Chris@82 702 Tf = ci[WS(rs, 3)];
Chris@82 703 Tg = cr[WS(rs, 6)];
Chris@82 704 Th = Tf + Tg;
Chris@82 705 T2X = Tf - Tg;
Chris@82 706 TZ = ci[WS(rs, 18)];
Chris@82 707 T10 = cr[WS(rs, 11)];
Chris@82 708 T11 = TZ - T10;
Chris@82 709 T2Y = TZ + T10;
Chris@82 710 }
Chris@82 711 {
Chris@82 712 E Ti, Tj, TW, TX;
Chris@82 713 Ti = cr[WS(rs, 1)];
Chris@82 714 Tj = ci[WS(rs, 8)];
Chris@82 715 Tk = Ti + Tj;
Chris@82 716 T2F = Ti - Tj;
Chris@82 717 TW = ci[WS(rs, 13)];
Chris@82 718 TX = cr[WS(rs, 16)];
Chris@82 719 TY = TW - TX;
Chris@82 720 T2G = TW + TX;
Chris@82 721 }
Chris@82 722 Tl = Th + Tk;
Chris@82 723 T3N = T2X - T2Y;
Chris@82 724 T3Y = T2F - T2G;
Chris@82 725 TG = Th - Tk;
Chris@82 726 T12 = TY - T11;
Chris@82 727 T2H = T2F + T2G;
Chris@82 728 T2Z = T2X + T2Y;
Chris@82 729 T22 = TY + T11;
Chris@82 730 }
Chris@82 731 {
Chris@82 732 E Tp, T31, T19, T32, Ts, T2K, T16, T2J;
Chris@82 733 {
Chris@82 734 E Tn, To, T17, T18;
Chris@82 735 Tn = cr[WS(rs, 8)];
Chris@82 736 To = ci[WS(rs, 1)];
Chris@82 737 Tp = Tn + To;
Chris@82 738 T31 = Tn - To;
Chris@82 739 T17 = ci[WS(rs, 16)];
Chris@82 740 T18 = cr[WS(rs, 13)];
Chris@82 741 T19 = T17 - T18;
Chris@82 742 T32 = T17 + T18;
Chris@82 743 }
Chris@82 744 {
Chris@82 745 E Tq, Tr, T14, T15;
Chris@82 746 Tq = ci[WS(rs, 6)];
Chris@82 747 Tr = cr[WS(rs, 3)];
Chris@82 748 Ts = Tq + Tr;
Chris@82 749 T2K = Tq - Tr;
Chris@82 750 T14 = ci[WS(rs, 11)];
Chris@82 751 T15 = cr[WS(rs, 18)];
Chris@82 752 T16 = T14 - T15;
Chris@82 753 T2J = T14 + T15;
Chris@82 754 }
Chris@82 755 Tt = Tp + Ts;
Chris@82 756 T3P = T31 + T32;
Chris@82 757 T40 = T2K + T2J;
Chris@82 758 TI = Tp - Ts;
Chris@82 759 T1a = T16 - T19;
Chris@82 760 T2L = T2J - T2K;
Chris@82 761 T33 = T31 - T32;
Chris@82 762 T24 = T16 + T19;
Chris@82 763 }
Chris@82 764 T13 = TV - T12;
Chris@82 765 T3n = T2W - T2Z;
Chris@82 766 T3o = T33 - T36;
Chris@82 767 T1i = T1a - T1h;
Chris@82 768 T26 = T24 - T25;
Chris@82 769 T4e = T3P - T3Q;
Chris@82 770 T4d = T3M - T3N;
Chris@82 771 T23 = T21 - T22;
Chris@82 772 T1n = TI - TJ;
Chris@82 773 T42 = T40 - T41;
Chris@82 774 T3Z = T3X - T3Y;
Chris@82 775 T1m = TF - TG;
Chris@82 776 T2h = Te - Tl;
Chris@82 777 T2I = T2E + T2H;
Chris@82 778 T2i = Tt - TA;
Chris@82 779 T2P = T2L + T2O;
Chris@82 780 T30 = T2W + T2Z;
Chris@82 781 T37 = T33 + T36;
Chris@82 782 T38 = T30 + T37;
Chris@82 783 Tm = Te + Tl;
Chris@82 784 TB = Tt + TA;
Chris@82 785 TC = Tm + TB;
Chris@82 786 T46 = T3X + T3Y;
Chris@82 787 T47 = T40 + T41;
Chris@82 788 T4a = T46 + T47;
Chris@82 789 T2a = T21 + T22;
Chris@82 790 T2b = T24 + T25;
Chris@82 791 T2e = T2a + T2b;
Chris@82 792 T1w = TV + T12;
Chris@82 793 T1x = T1a + T1h;
Chris@82 794 T1y = T1w + T1x;
Chris@82 795 T3O = T3M + T3N;
Chris@82 796 T3R = T3P + T3Q;
Chris@82 797 T3U = T3O + T3R;
Chris@82 798 T3h = T2E - T2H;
Chris@82 799 T3i = T2L - T2O;
Chris@82 800 T3j = T3h + T3i;
Chris@82 801 TH = TF + TG;
Chris@82 802 TK = TI + TJ;
Chris@82 803 TL = TH + TK;
Chris@82 804 }
/* Store phase. The index-0 pair is written without any twiddle factor. */
Chris@82 805 cr[0] = T7 + TC;
Chris@82 806 ci[0] = T2d + T2e;
/* Every remaining output pair k = 1..19 is combined with the twiddle pair
 * W[2k - 2], W[2k - 1] before being stored. Outputs 10, 5 and 15 follow. */
Chris@82 807 {
Chris@82 808 E T1U, T1W, T1T, T1V;
Chris@82 809 T1U = TE + TL;
Chris@82 810 T1W = T1v + T1y;
Chris@82 811 T1T = W[18];
Chris@82 812 T1V = W[19];
Chris@82 813 cr[WS(rs, 10)] = FNMS(T1V, T1W, T1T * T1U);
Chris@82 814 ci[WS(rs, 10)] = FMA(T1V, T1U, T1T * T1W);
Chris@82 815 }
Chris@82 816 {
Chris@82 817 E T4y, T4A, T4x, T4z;
Chris@82 818 T4y = T3T + T3U;
Chris@82 819 T4A = T49 + T4a;
Chris@82 820 T4x = W[8];
Chris@82 821 T4z = W[9];
Chris@82 822 cr[WS(rs, 5)] = FNMS(T4z, T4A, T4x * T4y);
Chris@82 823 ci[WS(rs, 5)] = FMA(T4x, T4A, T4z * T4y);
Chris@82 824 }
Chris@82 825 {
Chris@82 826 E T3I, T3K, T3H, T3J;
Chris@82 827 T3I = T2T + T38;
Chris@82 828 T3K = T3g + T3j;
Chris@82 829 T3H = W[28];
Chris@82 830 T3J = W[29];
Chris@82 831 cr[WS(rs, 15)] = FNMS(T3J, T3K, T3H * T3I);
Chris@82 832 ci[WS(rs, 15)] = FMA(T3H, T3K, T3J * T3I);
Chris@82 833 }
/* Outputs 4, 12, 16 and 8. */
Chris@82 834 {
Chris@82 835 E T27, T2j, T2v, T2r, T2g, T2u, T20, T2q;
Chris@82 836 T27 = FMA(KP951056516, T23, KP587785252 * T26);
Chris@82 837 T2j = FMA(KP951056516, T2h, KP587785252 * T2i);
Chris@82 838 T2v = FNMS(KP951056516, T2i, KP587785252 * T2h);
Chris@82 839 T2r = FNMS(KP951056516, T26, KP587785252 * T23);
Chris@82 840 {
Chris@82 841 E T2c, T2f, T1Y, T1Z;
Chris@82 842 T2c = KP559016994 * (T2a - T2b);
Chris@82 843 T2f = FNMS(KP250000000, T2e, T2d);
Chris@82 844 T2g = T2c + T2f;
Chris@82 845 T2u = T2f - T2c;
Chris@82 846 T1Y = KP559016994 * (Tm - TB);
Chris@82 847 T1Z = FNMS(KP250000000, TC, T7);
Chris@82 848 T20 = T1Y + T1Z;
Chris@82 849 T2q = T1Z - T1Y;
Chris@82 850 }
Chris@82 851 {
Chris@82 852 E T28, T2k, T1X, T29;
Chris@82 853 T28 = T20 + T27;
Chris@82 854 T2k = T2g - T2j;
Chris@82 855 T1X = W[6];
Chris@82 856 T29 = W[7];
Chris@82 857 cr[WS(rs, 4)] = FNMS(T29, T2k, T1X * T28);
Chris@82 858 ci[WS(rs, 4)] = FMA(T29, T28, T1X * T2k);
Chris@82 859 }
Chris@82 860 {
Chris@82 861 E T2y, T2A, T2x, T2z;
Chris@82 862 T2y = T2q - T2r;
Chris@82 863 T2A = T2v + T2u;
Chris@82 864 T2x = W[22];
Chris@82 865 T2z = W[23];
Chris@82 866 cr[WS(rs, 12)] = FNMS(T2z, T2A, T2x * T2y);
Chris@82 867 ci[WS(rs, 12)] = FMA(T2z, T2y, T2x * T2A);
Chris@82 868 }
Chris@82 869 {
Chris@82 870 E T2m, T2o, T2l, T2n;
Chris@82 871 T2m = T20 - T27;
Chris@82 872 T2o = T2j + T2g;
Chris@82 873 T2l = W[30];
Chris@82 874 T2n = W[31];
Chris@82 875 cr[WS(rs, 16)] = FNMS(T2n, T2o, T2l * T2m);
Chris@82 876 ci[WS(rs, 16)] = FMA(T2n, T2m, T2l * T2o);
Chris@82 877 }
Chris@82 878 {
Chris@82 879 E T2s, T2w, T2p, T2t;
Chris@82 880 T2s = T2q + T2r;
Chris@82 881 T2w = T2u - T2v;
Chris@82 882 T2p = W[14];
Chris@82 883 T2t = W[15];
Chris@82 884 cr[WS(rs, 8)] = FNMS(T2t, T2w, T2p * T2s);
Chris@82 885 ci[WS(rs, 8)] = FMA(T2t, T2s, T2p * T2w);
Chris@82 886 }
Chris@82 887 }
/* Outputs 1, 17, 9 and 13. */
Chris@82 888 {
Chris@82 889 E T43, T4f, T4r, T4m, T4c, T4q, T3W, T4n;
Chris@82 890 T43 = FMA(KP951056516, T3Z, KP587785252 * T42);
Chris@82 891 T4f = FMA(KP951056516, T4d, KP587785252 * T4e);
Chris@82 892 T4r = FNMS(KP951056516, T4e, KP587785252 * T4d);
Chris@82 893 T4m = FNMS(KP951056516, T42, KP587785252 * T3Z);
Chris@82 894 {
Chris@82 895 E T48, T4b, T3S, T3V;
Chris@82 896 T48 = KP559016994 * (T46 - T47);
Chris@82 897 T4b = FNMS(KP250000000, T4a, T49);
Chris@82 898 T4c = T48 + T4b;
Chris@82 899 T4q = T4b - T48;
Chris@82 900 T3S = KP559016994 * (T3O - T3R);
Chris@82 901 T3V = FNMS(KP250000000, T3U, T3T);
Chris@82 902 T3W = T3S + T3V;
Chris@82 903 T4n = T3V - T3S;
Chris@82 904 }
Chris@82 905 {
Chris@82 906 E T44, T4g, T3L, T45;
Chris@82 907 T44 = T3W - T43;
Chris@82 908 T4g = T4c + T4f;
Chris@82 909 T3L = W[0];
Chris@82 910 T45 = W[1];
Chris@82 911 cr[WS(rs, 1)] = FNMS(T45, T4g, T3L * T44);
Chris@82 912 ci[WS(rs, 1)] = FMA(T3L, T4g, T45 * T44);
Chris@82 913 }
Chris@82 914 {
Chris@82 915 E T4u, T4w, T4t, T4v;
Chris@82 916 T4u = T4n - T4m;
Chris@82 917 T4w = T4q + T4r;
Chris@82 918 T4t = W[32];
Chris@82 919 T4v = W[33];
Chris@82 920 cr[WS(rs, 17)] = FNMS(T4v, T4w, T4t * T4u);
Chris@82 921 ci[WS(rs, 17)] = FMA(T4t, T4w, T4v * T4u);
Chris@82 922 }
Chris@82 923 {
Chris@82 924 E T4i, T4k, T4h, T4j;
Chris@82 925 T4i = T43 + T3W;
Chris@82 926 T4k = T4c - T4f;
Chris@82 927 T4h = W[16];
Chris@82 928 T4j = W[17];
Chris@82 929 cr[WS(rs, 9)] = FNMS(T4j, T4k, T4h * T4i);
Chris@82 930 ci[WS(rs, 9)] = FMA(T4h, T4k, T4j * T4i);
Chris@82 931 }
Chris@82 932 {
Chris@82 933 E T4o, T4s, T4l, T4p;
Chris@82 934 T4o = T4m + T4n;
Chris@82 935 T4s = T4q - T4r;
Chris@82 936 T4l = W[24];
Chris@82 937 T4p = W[25];
Chris@82 938 cr[WS(rs, 13)] = FNMS(T4p, T4s, T4l * T4o);
Chris@82 939 ci[WS(rs, 13)] = FMA(T4l, T4s, T4p * T4o);
Chris@82 940 }
Chris@82 941 }
/* Outputs 2, 14, 18 and 6. */
Chris@82 942 {
Chris@82 943 E T1j, T1o, T1M, T1J, T1B, T1N, TO, T1I;
Chris@82 944 T1j = FNMS(KP951056516, T1i, KP587785252 * T13);
Chris@82 945 T1o = FNMS(KP951056516, T1n, KP587785252 * T1m);
Chris@82 946 T1M = FMA(KP951056516, T1m, KP587785252 * T1n);
Chris@82 947 T1J = FMA(KP951056516, T13, KP587785252 * T1i);
Chris@82 948 {
Chris@82 949 E T1z, T1A, TM, TN;
Chris@82 950 T1z = FNMS(KP250000000, T1y, T1v);
Chris@82 951 T1A = KP559016994 * (T1w - T1x);
Chris@82 952 T1B = T1z - T1A;
Chris@82 953 T1N = T1A + T1z;
Chris@82 954 TM = FNMS(KP250000000, TL, TE);
Chris@82 955 TN = KP559016994 * (TH - TK);
Chris@82 956 TO = TM - TN;
Chris@82 957 T1I = TN + TM;
Chris@82 958 }
Chris@82 959 {
Chris@82 960 E T1k, T1C, TD, T1l;
Chris@82 961 T1k = TO - T1j;
Chris@82 962 T1C = T1o + T1B;
Chris@82 963 TD = W[2];
Chris@82 964 T1l = W[3];
Chris@82 965 cr[WS(rs, 2)] = FNMS(T1l, T1C, TD * T1k);
Chris@82 966 ci[WS(rs, 2)] = FMA(T1l, T1k, TD * T1C);
Chris@82 967 }
Chris@82 968 {
Chris@82 969 E T1Q, T1S, T1P, T1R;
Chris@82 970 T1Q = T1I + T1J;
Chris@82 971 T1S = T1N - T1M;
Chris@82 972 T1P = W[26];
Chris@82 973 T1R = W[27];
Chris@82 974 cr[WS(rs, 14)] = FNMS(T1R, T1S, T1P * T1Q);
Chris@82 975 ci[WS(rs, 14)] = FMA(T1R, T1Q, T1P * T1S);
Chris@82 976 }
Chris@82 977 {
Chris@82 978 E T1E, T1G, T1D, T1F;
Chris@82 979 T1E = TO + T1j;
Chris@82 980 T1G = T1B - T1o;
Chris@82 981 T1D = W[34];
Chris@82 982 T1F = W[35];
Chris@82 983 cr[WS(rs, 18)] = FNMS(T1F, T1G, T1D * T1E);
Chris@82 984 ci[WS(rs, 18)] = FMA(T1F, T1E, T1D * T1G);
Chris@82 985 }
Chris@82 986 {
Chris@82 987 E T1K, T1O, T1H, T1L;
Chris@82 988 T1K = T1I - T1J;
Chris@82 989 T1O = T1M + T1N;
Chris@82 990 T1H = W[10];
Chris@82 991 T1L = W[11];
Chris@82 992 cr[WS(rs, 6)] = FNMS(T1L, T1O, T1H * T1K);
Chris@82 993 ci[WS(rs, 6)] = FMA(T1L, T1K, T1H * T1O);
Chris@82 994 }
Chris@82 995 }
/* Outputs 3, 19, 7 and 11. */
Chris@82 996 {
Chris@82 997 E T2Q, T3p, T3B, T3x, T3m, T3A, T3b, T3w;
Chris@82 998 T2Q = FNMS(KP951056516, T2P, KP587785252 * T2I);
Chris@82 999 T3p = FNMS(KP951056516, T3o, KP587785252 * T3n);
Chris@82 1000 T3B = FMA(KP951056516, T3n, KP587785252 * T3o);
Chris@82 1001 T3x = FMA(KP951056516, T2I, KP587785252 * T2P);
Chris@82 1002 {
Chris@82 1003 E T3k, T3l, T39, T3a;
Chris@82 1004 T3k = FNMS(KP250000000, T3j, T3g);
Chris@82 1005 T3l = KP559016994 * (T3h - T3i);
Chris@82 1006 T3m = T3k - T3l;
Chris@82 1007 T3A = T3l + T3k;
Chris@82 1008 T39 = FNMS(KP250000000, T38, T2T);
Chris@82 1009 T3a = KP559016994 * (T30 - T37);
Chris@82 1010 T3b = T39 - T3a;
Chris@82 1011 T3w = T3a + T39;
Chris@82 1012 }
Chris@82 1013 {
Chris@82 1014 E T3c, T3q, T2B, T3d;
Chris@82 1015 T3c = T2Q + T3b;
Chris@82 1016 T3q = T3m - T3p;
Chris@82 1017 T2B = W[4];
Chris@82 1018 T3d = W[5];
Chris@82 1019 cr[WS(rs, 3)] = FNMS(T3d, T3q, T2B * T3c);
Chris@82 1020 ci[WS(rs, 3)] = FMA(T2B, T3q, T3d * T3c);
Chris@82 1021 }
Chris@82 1022 {
Chris@82 1023 E T3E, T3G, T3D, T3F;
Chris@82 1024 T3E = T3x + T3w;
Chris@82 1025 T3G = T3A - T3B;
Chris@82 1026 T3D = W[36];
Chris@82 1027 T3F = W[37];
Chris@82 1028 cr[WS(rs, 19)] = FNMS(T3F, T3G, T3D * T3E);
Chris@82 1029 ci[WS(rs, 19)] = FMA(T3D, T3G, T3F * T3E);
Chris@82 1030 }
Chris@82 1031 {
Chris@82 1032 E T3s, T3u, T3r, T3t;
Chris@82 1033 T3s = T3b - T2Q;
Chris@82 1034 T3u = T3m + T3p;
Chris@82 1035 T3r = W[12];
Chris@82 1036 T3t = W[13];
Chris@82 1037 cr[WS(rs, 7)] = FNMS(T3t, T3u, T3r * T3s);
Chris@82 1038 ci[WS(rs, 7)] = FMA(T3r, T3u, T3t * T3s);
Chris@82 1039 }
Chris@82 1040 {
Chris@82 1041 E T3y, T3C, T3v, T3z;
Chris@82 1042 T3y = T3w - T3x;
Chris@82 1043 T3C = T3A + T3B;
Chris@82 1044 T3v = W[20];
Chris@82 1045 T3z = W[21];
Chris@82 1046 cr[WS(rs, 11)] = FNMS(T3z, T3C, T3v * T3y);
Chris@82 1047 ci[WS(rs, 11)] = FMA(T3v, T3C, T3z * T3y);
Chris@82 1048 }
Chris@82 1049 }
Chris@82 1050 }
Chris@82 1051 }
Chris@82 1052 }
Chris@82 1053
/*
 * Twiddle instruction table handed to the codelet descriptor `desc`.
 * The TW_FULL entry carries the transform size 20; the TW_NEXT entry is
 * the last element of the table. The tw_instr type and the TW_* codes
 * are defined in the included headers, not in this file.
 */
Chris@82 1054 static const tw_instr twinstr[] = {
Chris@82 1055 {TW_FULL, 1, 20},
Chris@82 1056 {TW_NEXT, 1, 0}
Chris@82 1057 };
Chris@82 1058
/*
 * Descriptor for the non-FMA variant: size 20, name "hb_20", the twiddle
 * table above, and GENUS (not defined in this file). The trailing
 * {184, 62, 62, 0} repeats the operation counts from the generator comment
 * above: 184 additions, 62 multiplications, 62 fused multiply/adds.
 */
Chris@82 1059 static const hc2hc_desc desc = { 20, "hb_20", twinstr, &GENUS, {184, 62, 62, 0} };
Chris@82 1060
/*
 * Registration entry point: passes hb_20 and its descriptor to
 * X(khc2hc_register) for planner p. X() is a macro from the included
 * headers (presumably a symbol-name wrapper -- confirm there).
 */
Chris@82 1061 void X(codelet_hb_20) (planner *p) {
Chris@82 1062 X(khc2hc_register) (p, hb_20, &desc);
Chris@82 1063 }
Chris@82 1064 #endif