annotate src/fftw-3.3.8/rdft/scalar/r2cb/hb2_20.c @ 83:ae30d91d2ffe

Replace these with versions built using an older toolset (so as to avoid ABI compatibilities when linking on Ubuntu 14.04 for packaging purposes)
author Chris Cannam
date Fri, 07 Feb 2020 11:51:13 +0000
parents d0c2a83c1364
children
rev   line source
Chris@82 1 /*
Chris@82 2 * Copyright (c) 2003, 2007-14 Matteo Frigo
Chris@82 3 * Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
Chris@82 4 *
Chris@82 5 * This program is free software; you can redistribute it and/or modify
Chris@82 6 * it under the terms of the GNU General Public License as published by
Chris@82 7 * the Free Software Foundation; either version 2 of the License, or
Chris@82 8 * (at your option) any later version.
Chris@82 9 *
Chris@82 10 * This program is distributed in the hope that it will be useful,
Chris@82 11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
Chris@82 12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
Chris@82 13 * GNU General Public License for more details.
Chris@82 14 *
Chris@82 15 * You should have received a copy of the GNU General Public License
Chris@82 16 * along with this program; if not, write to the Free Software
Chris@82 17 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
Chris@82 18 *
Chris@82 19 */
Chris@82 20
Chris@82 21 /* This file was automatically generated --- DO NOT EDIT */
Chris@82 22 /* Generated on Thu May 24 08:07:40 EDT 2018 */
Chris@82 23
Chris@82 24 #include "rdft/codelet-rdft.h"
Chris@82 25
Chris@82 26 #if defined(ARCH_PREFERS_FMA) || defined(ISA_EXTENSION_PREFERS_FMA)
Chris@82 27
Chris@82 28 /* Generated by: ../../../genfft/gen_hc2hc.native -fma -compact -variables 4 -pipeline-latency 4 -sign 1 -twiddle-log3 -precompute-twiddles -n 20 -dif -name hb2_20 -include rdft/scalar/hb.h */
Chris@82 29
Chris@82 30 /*
Chris@82 31 * This function contains 276 FP additions, 198 FP multiplications,
Chris@82 32 * (or, 136 additions, 58 multiplications, 140 fused multiply/add),
Chris@82 33 * 129 stack variables, 4 constants, and 80 memory accesses
Chris@82 34 */
Chris@82 35 #include "rdft/scalar/hb.h"
Chris@82 36
Chris@82 37 static void hb2_20(R *cr, R *ci, const R *W, stride rs, INT mb, INT me, INT ms)
Chris@82 38 {
Chris@82 39 DK(KP951056516, +0.951056516295153572116439333379382143405698634);
Chris@82 40 DK(KP559016994, +0.559016994374947424102293417182819058860154590);
Chris@82 41 DK(KP250000000, +0.250000000000000000000000000000000000000000000);
Chris@82 42 DK(KP618033988, +0.618033988749894848204586834365638117720309180);
Chris@82 43 {
Chris@82 44 INT m;
Chris@82 45 for (m = mb, W = W + ((mb - 1) * 8); m < me; m = m + 1, cr = cr + ms, ci = ci - ms, W = W + 8, MAKE_VOLATILE_STRIDE(40, rs)) {
Chris@82 46 E TD, TH, TE, T1L, T1N, T1X, TG, T29, TI, T2b, T1V, T1O, T24, T36, T5b;
Chris@82 47 E T1S, T1Y, T3b, T3e, T2o, T2Y, T2U, T31, T2s, T4y, T4u, T2f, T2c, T2g, T5g;
Chris@82 48 E T2k, T1s, T48, T4c, T5q, T5m, T4k, T4f;
Chris@82 49 {
Chris@82 50 E T1r, T1M, T2T, T1R, T2X, T23, T2r, T1W, T2n, T2a, TF, T4x;
Chris@82 51 TD = W[0];
Chris@82 52 TH = W[3];
Chris@82 53 TE = W[2];
Chris@82 54 TF = TD * TE;
Chris@82 55 T1r = TD * TH;
Chris@82 56 T1L = W[6];
Chris@82 57 T1M = TD * T1L;
Chris@82 58 T2T = TE * T1L;
Chris@82 59 T1N = W[7];
Chris@82 60 T1R = TD * T1N;
Chris@82 61 T2X = TE * T1N;
Chris@82 62 T1X = W[5];
Chris@82 63 T23 = TE * T1X;
Chris@82 64 T2r = TD * T1X;
Chris@82 65 TG = W[1];
Chris@82 66 T29 = FNMS(TG, TH, TF);
Chris@82 67 TI = FMA(TG, TH, TF);
Chris@82 68 T2b = FMA(TG, TE, T1r);
Chris@82 69 T1V = W[4];
Chris@82 70 T1W = TE * T1V;
Chris@82 71 T2n = TD * T1V;
Chris@82 72 T2a = T29 * T1V;
Chris@82 73 T1O = FMA(TG, T1N, T1M);
Chris@82 74 T24 = FNMS(TH, T1V, T23);
Chris@82 75 T36 = FNMS(TG, T1V, T2r);
Chris@82 76 T5b = FNMS(T2b, T1X, T2a);
Chris@82 77 T1S = FNMS(TG, T1L, T1R);
Chris@82 78 T1Y = FMA(TH, T1X, T1W);
Chris@82 79 T3b = FNMS(TH, T1X, T1W);
Chris@82 80 T3e = FMA(TH, T1V, T23);
Chris@82 81 T2o = FNMS(TG, T1X, T2n);
Chris@82 82 T2Y = FNMS(TH, T1L, T2X);
Chris@82 83 T2U = FMA(TH, T1N, T2T);
Chris@82 84 T31 = FMA(TG, T1X, T2n);
Chris@82 85 T2s = FMA(TG, T1V, T2r);
Chris@82 86 T4x = T29 * T1N;
Chris@82 87 T4y = FNMS(T2b, T1L, T4x);
Chris@82 88 {
Chris@82 89 E T4t, T2e, T2d, T2j;
Chris@82 90 T4t = T29 * T1L;
Chris@82 91 T4u = FMA(T2b, T1N, T4t);
Chris@82 92 T2e = T29 * T1X;
Chris@82 93 T2f = FNMS(T2b, T1V, T2e);
Chris@82 94 T2c = FMA(T2b, T1X, T2a);
Chris@82 95 T2d = T2c * T1L;
Chris@82 96 T2j = T2c * T1N;
Chris@82 97 T2g = FMA(T2f, T1N, T2d);
Chris@82 98 T5g = FMA(T2b, T1V, T2e);
Chris@82 99 T2k = FNMS(T2f, T1L, T2j);
Chris@82 100 {
Chris@82 101 E T47, T5p, T4b, T5l;
Chris@82 102 T47 = TI * T1V;
Chris@82 103 T5p = TI * T1N;
Chris@82 104 T4b = TI * T1X;
Chris@82 105 T5l = TI * T1L;
Chris@82 106 T1s = FNMS(TG, TE, T1r);
Chris@82 107 T48 = FMA(T1s, T1X, T47);
Chris@82 108 T4c = FNMS(T1s, T1V, T4b);
Chris@82 109 T5q = FNMS(T1s, T1L, T5p);
Chris@82 110 T5m = FMA(T1s, T1N, T5l);
Chris@82 111 T4k = FMA(T1s, T1V, T4b);
Chris@82 112 T4f = FNMS(T1s, T1X, T47);
Chris@82 113 }
Chris@82 114 }
Chris@82 115 }
Chris@82 116 {
Chris@82 117 E T7, T4B, T4V, TJ, T1z, T3j, T3V, T2H, T18, T42, T43, T1n, T2D, T53, T52;
Chris@82 118 E T2A, T1H, T4R, T4O, T1G, T2O, T3I, T2P, T3P, T2I, T2J, T2K, T1A, T1B, T1C;
Chris@82 119 E TC, T2w, T3Y, T40, T4I, T4K, TQ, TS, T3y, T3A, T4Y, T50;
Chris@82 120 {
Chris@82 121 E T3, T3h, T1y, T3i, T6, T3U, T1v, T3T;
Chris@82 122 {
Chris@82 123 E T1, T2, T1w, T1x;
Chris@82 124 T1 = cr[0];
Chris@82 125 T2 = ci[WS(rs, 9)];
Chris@82 126 T3 = T1 + T2;
Chris@82 127 T3h = T1 - T2;
Chris@82 128 T1w = ci[WS(rs, 14)];
Chris@82 129 T1x = cr[WS(rs, 15)];
Chris@82 130 T1y = T1w - T1x;
Chris@82 131 T3i = T1w + T1x;
Chris@82 132 }
Chris@82 133 {
Chris@82 134 E T4, T5, T1t, T1u;
Chris@82 135 T4 = cr[WS(rs, 5)];
Chris@82 136 T5 = ci[WS(rs, 4)];
Chris@82 137 T6 = T4 + T5;
Chris@82 138 T3U = T4 - T5;
Chris@82 139 T1t = ci[WS(rs, 19)];
Chris@82 140 T1u = cr[WS(rs, 10)];
Chris@82 141 T1v = T1t - T1u;
Chris@82 142 T3T = T1t + T1u;
Chris@82 143 }
Chris@82 144 T7 = T3 + T6;
Chris@82 145 T4B = T3h - T3i;
Chris@82 146 T4V = T3U + T3T;
Chris@82 147 TJ = T3 - T6;
Chris@82 148 T1z = T1v - T1y;
Chris@82 149 T3j = T3h + T3i;
Chris@82 150 T3V = T3T - T3U;
Chris@82 151 T2H = T1v + T1y;
Chris@82 152 }
Chris@82 153 {
Chris@82 154 E Te, T4C, T4M, TK, T1f, T3m, T3L, T2y, TA, T4G, T4Q, TO, T17, T3w, T3H;
Chris@82 155 E T2C, Tl, T4D, T4N, TL, T1m, T3p, T3O, T2z, Tt, T4F, T4P, TN, T10, T3t;
Chris@82 156 E T3E, T2B;
Chris@82 157 {
Chris@82 158 E Ta, T3k, T1e, T3l, Td, T3K, T1b, T3J;
Chris@82 159 {
Chris@82 160 E T8, T9, T1c, T1d;
Chris@82 161 T8 = cr[WS(rs, 4)];
Chris@82 162 T9 = ci[WS(rs, 5)];
Chris@82 163 Ta = T8 + T9;
Chris@82 164 T3k = T8 - T9;
Chris@82 165 T1c = ci[WS(rs, 10)];
Chris@82 166 T1d = cr[WS(rs, 19)];
Chris@82 167 T1e = T1c - T1d;
Chris@82 168 T3l = T1c + T1d;
Chris@82 169 }
Chris@82 170 {
Chris@82 171 E Tb, Tc, T19, T1a;
Chris@82 172 Tb = cr[WS(rs, 9)];
Chris@82 173 Tc = ci[0];
Chris@82 174 Td = Tb + Tc;
Chris@82 175 T3K = Tb - Tc;
Chris@82 176 T19 = ci[WS(rs, 15)];
Chris@82 177 T1a = cr[WS(rs, 14)];
Chris@82 178 T1b = T19 - T1a;
Chris@82 179 T3J = T19 + T1a;
Chris@82 180 }
Chris@82 181 Te = Ta + Td;
Chris@82 182 T4C = T3k - T3l;
Chris@82 183 T4M = T3K + T3J;
Chris@82 184 TK = Ta - Td;
Chris@82 185 T1f = T1b - T1e;
Chris@82 186 T3m = T3k + T3l;
Chris@82 187 T3L = T3J - T3K;
Chris@82 188 T2y = T1b + T1e;
Chris@82 189 }
Chris@82 190 {
Chris@82 191 E Tw, T3u, Tz, T3F, T13, T3G, T16, T3v;
Chris@82 192 {
Chris@82 193 E Tu, Tv, Tx, Ty;
Chris@82 194 Tu = ci[WS(rs, 7)];
Chris@82 195 Tv = cr[WS(rs, 2)];
Chris@82 196 Tw = Tu + Tv;
Chris@82 197 T3u = Tu - Tv;
Chris@82 198 Tx = ci[WS(rs, 2)];
Chris@82 199 Ty = cr[WS(rs, 7)];
Chris@82 200 Tz = Tx + Ty;
Chris@82 201 T3F = Tx - Ty;
Chris@82 202 }
Chris@82 203 {
Chris@82 204 E T11, T12, T14, T15;
Chris@82 205 T11 = ci[WS(rs, 17)];
Chris@82 206 T12 = cr[WS(rs, 12)];
Chris@82 207 T13 = T11 - T12;
Chris@82 208 T3G = T11 + T12;
Chris@82 209 T14 = ci[WS(rs, 12)];
Chris@82 210 T15 = cr[WS(rs, 17)];
Chris@82 211 T16 = T14 - T15;
Chris@82 212 T3v = T14 + T15;
Chris@82 213 }
Chris@82 214 TA = Tw + Tz;
Chris@82 215 T4G = T3u + T3v;
Chris@82 216 T4Q = T3F - T3G;
Chris@82 217 TO = Tw - Tz;
Chris@82 218 T17 = T13 - T16;
Chris@82 219 T3w = T3u - T3v;
Chris@82 220 T3H = T3F + T3G;
Chris@82 221 T2C = T13 + T16;
Chris@82 222 }
Chris@82 223 {
Chris@82 224 E Th, T3n, T1l, T3o, Tk, T3M, T1i, T3N;
Chris@82 225 {
Chris@82 226 E Tf, Tg, T1j, T1k;
Chris@82 227 Tf = ci[WS(rs, 3)];
Chris@82 228 Tg = cr[WS(rs, 6)];
Chris@82 229 Th = Tf + Tg;
Chris@82 230 T3n = Tf - Tg;
Chris@82 231 T1j = ci[WS(rs, 18)];
Chris@82 232 T1k = cr[WS(rs, 11)];
Chris@82 233 T1l = T1j - T1k;
Chris@82 234 T3o = T1j + T1k;
Chris@82 235 }
Chris@82 236 {
Chris@82 237 E Ti, Tj, T1g, T1h;
Chris@82 238 Ti = cr[WS(rs, 1)];
Chris@82 239 Tj = ci[WS(rs, 8)];
Chris@82 240 Tk = Ti + Tj;
Chris@82 241 T3M = Ti - Tj;
Chris@82 242 T1g = ci[WS(rs, 13)];
Chris@82 243 T1h = cr[WS(rs, 16)];
Chris@82 244 T1i = T1g - T1h;
Chris@82 245 T3N = T1g + T1h;
Chris@82 246 }
Chris@82 247 Tl = Th + Tk;
Chris@82 248 T4D = T3n - T3o;
Chris@82 249 T4N = T3M - T3N;
Chris@82 250 TL = Th - Tk;
Chris@82 251 T1m = T1i - T1l;
Chris@82 252 T3p = T3n + T3o;
Chris@82 253 T3O = T3M + T3N;
Chris@82 254 T2z = T1i + T1l;
Chris@82 255 }
Chris@82 256 {
Chris@82 257 E Tp, T3r, TZ, T3s, Ts, T3D, TW, T3C;
Chris@82 258 {
Chris@82 259 E Tn, To, TX, TY;
Chris@82 260 Tn = cr[WS(rs, 8)];
Chris@82 261 To = ci[WS(rs, 1)];
Chris@82 262 Tp = Tn + To;
Chris@82 263 T3r = Tn - To;
Chris@82 264 TX = ci[WS(rs, 16)];
Chris@82 265 TY = cr[WS(rs, 13)];
Chris@82 266 TZ = TX - TY;
Chris@82 267 T3s = TX + TY;
Chris@82 268 }
Chris@82 269 {
Chris@82 270 E Tq, Tr, TU, TV;
Chris@82 271 Tq = ci[WS(rs, 6)];
Chris@82 272 Tr = cr[WS(rs, 3)];
Chris@82 273 Ts = Tq + Tr;
Chris@82 274 T3D = Tq - Tr;
Chris@82 275 TU = ci[WS(rs, 11)];
Chris@82 276 TV = cr[WS(rs, 18)];
Chris@82 277 TW = TU - TV;
Chris@82 278 T3C = TU + TV;
Chris@82 279 }
Chris@82 280 Tt = Tp + Ts;
Chris@82 281 T4F = T3r + T3s;
Chris@82 282 T4P = T3D + T3C;
Chris@82 283 TN = Tp - Ts;
Chris@82 284 T10 = TW - TZ;
Chris@82 285 T3t = T3r - T3s;
Chris@82 286 T3E = T3C - T3D;
Chris@82 287 T2B = TW + TZ;
Chris@82 288 }
Chris@82 289 T18 = T10 - T17;
Chris@82 290 T42 = T3t - T3w;
Chris@82 291 T43 = T3m - T3p;
Chris@82 292 T1n = T1f - T1m;
Chris@82 293 T2D = T2B - T2C;
Chris@82 294 T53 = T4F - T4G;
Chris@82 295 T52 = T4C - T4D;
Chris@82 296 T2A = T2y - T2z;
Chris@82 297 T1H = TK - TL;
Chris@82 298 T4R = T4P - T4Q;
Chris@82 299 T4O = T4M - T4N;
Chris@82 300 T1G = TN - TO;
Chris@82 301 T2O = Te - Tl;
Chris@82 302 T3I = T3E + T3H;
Chris@82 303 T2P = Tt - TA;
Chris@82 304 T3P = T3L + T3O;
Chris@82 305 T2I = T2y + T2z;
Chris@82 306 T2J = T2B + T2C;
Chris@82 307 T2K = T2I + T2J;
Chris@82 308 T1A = T1f + T1m;
Chris@82 309 T1B = T10 + T17;
Chris@82 310 T1C = T1A + T1B;
Chris@82 311 {
Chris@82 312 E Tm, TB, TM, TP;
Chris@82 313 Tm = Te + Tl;
Chris@82 314 TB = Tt + TA;
Chris@82 315 TC = Tm + TB;
Chris@82 316 T2w = Tm - TB;
Chris@82 317 {
Chris@82 318 E T3W, T3X, T4E, T4H;
Chris@82 319 T3W = T3L - T3O;
Chris@82 320 T3X = T3E - T3H;
Chris@82 321 T3Y = T3W + T3X;
Chris@82 322 T40 = T3W - T3X;
Chris@82 323 T4E = T4C + T4D;
Chris@82 324 T4H = T4F + T4G;
Chris@82 325 T4I = T4E + T4H;
Chris@82 326 T4K = T4E - T4H;
Chris@82 327 }
Chris@82 328 TM = TK + TL;
Chris@82 329 TP = TN + TO;
Chris@82 330 TQ = TM + TP;
Chris@82 331 TS = TM - TP;
Chris@82 332 {
Chris@82 333 E T3q, T3x, T4W, T4X;
Chris@82 334 T3q = T3m + T3p;
Chris@82 335 T3x = T3t + T3w;
Chris@82 336 T3y = T3q + T3x;
Chris@82 337 T3A = T3q - T3x;
Chris@82 338 T4W = T4M + T4N;
Chris@82 339 T4X = T4P + T4Q;
Chris@82 340 T4Y = T4W + T4X;
Chris@82 341 T50 = T4W - T4X;
Chris@82 342 }
Chris@82 343 }
Chris@82 344 }
Chris@82 345 cr[0] = T7 + TC;
Chris@82 346 ci[0] = T2H + T2K;
Chris@82 347 {
Chris@82 348 E T2t, T2q, T2u, T2p;
Chris@82 349 T2t = T1z + T1C;
Chris@82 350 T2p = TJ + TQ;
Chris@82 351 T2q = T2o * T2p;
Chris@82 352 T2u = T2s * T2p;
Chris@82 353 cr[WS(rs, 10)] = FNMS(T2s, T2t, T2q);
Chris@82 354 ci[WS(rs, 10)] = FMA(T2o, T2t, T2u);
Chris@82 355 }
Chris@82 356 {
Chris@82 357 E T5t, T5u, T5v, T5w;
Chris@82 358 T5t = T4B + T4I;
Chris@82 359 T5u = T2c * T5t;
Chris@82 360 T5v = T4V + T4Y;
Chris@82 361 T5w = T2c * T5v;
Chris@82 362 cr[WS(rs, 5)] = FNMS(T2f, T5v, T5u);
Chris@82 363 ci[WS(rs, 5)] = FMA(T2f, T5t, T5w);
Chris@82 364 }
Chris@82 365 {
Chris@82 366 E T4v, T4w, T4z, T4A;
Chris@82 367 T4v = T3j + T3y;
Chris@82 368 T4w = T4u * T4v;
Chris@82 369 T4z = T3V + T3Y;
Chris@82 370 T4A = T4u * T4z;
Chris@82 371 cr[WS(rs, 15)] = FNMS(T4y, T4z, T4w);
Chris@82 372 ci[WS(rs, 15)] = FMA(T4y, T4v, T4A);
Chris@82 373 }
Chris@82 374 {
Chris@82 375 E T3R, T4p, T49, T4i, T45, T4r, T4d, T4n;
Chris@82 376 {
Chris@82 377 E T3Q, T4h, T3B, T4g, T3z;
Chris@82 378 T3Q = FNMS(KP618033988, T3P, T3I);
Chris@82 379 T4h = FMA(KP618033988, T3I, T3P);
Chris@82 380 T3z = FNMS(KP250000000, T3y, T3j);
Chris@82 381 T3B = FNMS(KP559016994, T3A, T3z);
Chris@82 382 T4g = FMA(KP559016994, T3A, T3z);
Chris@82 383 T3R = FNMS(KP951056516, T3Q, T3B);
Chris@82 384 T4p = FMA(KP951056516, T4h, T4g);
Chris@82 385 T49 = FMA(KP951056516, T3Q, T3B);
Chris@82 386 T4i = FNMS(KP951056516, T4h, T4g);
Chris@82 387 }
Chris@82 388 {
Chris@82 389 E T44, T4m, T41, T4l, T3Z;
Chris@82 390 T44 = FNMS(KP618033988, T43, T42);
Chris@82 391 T4m = FMA(KP618033988, T42, T43);
Chris@82 392 T3Z = FNMS(KP250000000, T3Y, T3V);
Chris@82 393 T41 = FNMS(KP559016994, T40, T3Z);
Chris@82 394 T4l = FMA(KP559016994, T40, T3Z);
Chris@82 395 T45 = FMA(KP951056516, T44, T41);
Chris@82 396 T4r = FNMS(KP951056516, T4m, T4l);
Chris@82 397 T4d = FNMS(KP951056516, T44, T41);
Chris@82 398 T4n = FMA(KP951056516, T4m, T4l);
Chris@82 399 }
Chris@82 400 {
Chris@82 401 E T3S, T46, T4a, T4e;
Chris@82 402 T3S = TE * T3R;
Chris@82 403 cr[WS(rs, 3)] = FNMS(TH, T45, T3S);
Chris@82 404 T46 = TE * T45;
Chris@82 405 ci[WS(rs, 3)] = FMA(TH, T3R, T46);
Chris@82 406 T4a = T48 * T49;
Chris@82 407 cr[WS(rs, 7)] = FNMS(T4c, T4d, T4a);
Chris@82 408 T4e = T48 * T4d;
Chris@82 409 ci[WS(rs, 7)] = FMA(T4c, T49, T4e);
Chris@82 410 }
Chris@82 411 {
Chris@82 412 E T4j, T4o, T4q, T4s;
Chris@82 413 T4j = T4f * T4i;
Chris@82 414 cr[WS(rs, 11)] = FNMS(T4k, T4n, T4j);
Chris@82 415 T4o = T4f * T4n;
Chris@82 416 ci[WS(rs, 11)] = FMA(T4k, T4i, T4o);
Chris@82 417 T4q = T1L * T4p;
Chris@82 418 cr[WS(rs, 19)] = FNMS(T1N, T4r, T4q);
Chris@82 419 T4s = T1L * T4r;
Chris@82 420 ci[WS(rs, 19)] = FMA(T1N, T4p, T4s);
Chris@82 421 }
Chris@82 422 }
Chris@82 423 {
Chris@82 424 E T4T, T5n, T57, T5e, T55, T5r, T59, T5j;
Chris@82 425 {
Chris@82 426 E T4S, T5d, T4L, T5c, T4J;
Chris@82 427 T4S = FMA(KP618033988, T4R, T4O);
Chris@82 428 T5d = FNMS(KP618033988, T4O, T4R);
Chris@82 429 T4J = FNMS(KP250000000, T4I, T4B);
Chris@82 430 T4L = FMA(KP559016994, T4K, T4J);
Chris@82 431 T5c = FNMS(KP559016994, T4K, T4J);
Chris@82 432 T4T = FNMS(KP951056516, T4S, T4L);
Chris@82 433 T5n = FMA(KP951056516, T5d, T5c);
Chris@82 434 T57 = FMA(KP951056516, T4S, T4L);
Chris@82 435 T5e = FNMS(KP951056516, T5d, T5c);
Chris@82 436 }
Chris@82 437 {
Chris@82 438 E T54, T5i, T51, T5h, T4Z;
Chris@82 439 T54 = FMA(KP618033988, T53, T52);
Chris@82 440 T5i = FNMS(KP618033988, T52, T53);
Chris@82 441 T4Z = FNMS(KP250000000, T4Y, T4V);
Chris@82 442 T51 = FMA(KP559016994, T50, T4Z);
Chris@82 443 T5h = FNMS(KP559016994, T50, T4Z);
Chris@82 444 T55 = FMA(KP951056516, T54, T51);
Chris@82 445 T5r = FNMS(KP951056516, T5i, T5h);
Chris@82 446 T59 = FNMS(KP951056516, T54, T51);
Chris@82 447 T5j = FMA(KP951056516, T5i, T5h);
Chris@82 448 }
Chris@82 449 {
Chris@82 450 E T4U, T56, T58, T5a;
Chris@82 451 T4U = TD * T4T;
Chris@82 452 cr[WS(rs, 1)] = FNMS(TG, T55, T4U);
Chris@82 453 T56 = TD * T55;
Chris@82 454 ci[WS(rs, 1)] = FMA(TG, T4T, T56);
Chris@82 455 T58 = T1V * T57;
Chris@82 456 cr[WS(rs, 9)] = FNMS(T1X, T59, T58);
Chris@82 457 T5a = T1V * T59;
Chris@82 458 ci[WS(rs, 9)] = FMA(T1X, T57, T5a);
Chris@82 459 }
Chris@82 460 {
Chris@82 461 E T5f, T5k, T5o, T5s;
Chris@82 462 T5f = T5b * T5e;
Chris@82 463 cr[WS(rs, 13)] = FNMS(T5g, T5j, T5f);
Chris@82 464 T5k = T5b * T5j;
Chris@82 465 ci[WS(rs, 13)] = FMA(T5g, T5e, T5k);
Chris@82 466 T5o = T5m * T5n;
Chris@82 467 cr[WS(rs, 17)] = FNMS(T5q, T5r, T5o);
Chris@82 468 T5s = T5m * T5r;
Chris@82 469 ci[WS(rs, 17)] = FMA(T5q, T5n, T5s);
Chris@82 470 }
Chris@82 471 }
Chris@82 472 {
Chris@82 473 E T2Q, T38, T2N, T37, T2F, T3c, T2V, T34, T2L, T2M;
Chris@82 474 T2Q = FMA(KP618033988, T2P, T2O);
Chris@82 475 T38 = FNMS(KP618033988, T2O, T2P);
Chris@82 476 T2L = FNMS(KP250000000, T2K, T2H);
Chris@82 477 T2M = T2I - T2J;
Chris@82 478 T2N = FMA(KP559016994, T2M, T2L);
Chris@82 479 T37 = FNMS(KP559016994, T2M, T2L);
Chris@82 480 {
Chris@82 481 E T2E, T33, T2x, T32, T2v;
Chris@82 482 T2E = FMA(KP618033988, T2D, T2A);
Chris@82 483 T33 = FNMS(KP618033988, T2A, T2D);
Chris@82 484 T2v = FNMS(KP250000000, TC, T7);
Chris@82 485 T2x = FMA(KP559016994, T2w, T2v);
Chris@82 486 T32 = FNMS(KP559016994, T2w, T2v);
Chris@82 487 T2F = FMA(KP951056516, T2E, T2x);
Chris@82 488 T3c = FMA(KP951056516, T33, T32);
Chris@82 489 T2V = FNMS(KP951056516, T2E, T2x);
Chris@82 490 T34 = FNMS(KP951056516, T33, T32);
Chris@82 491 }
Chris@82 492 {
Chris@82 493 E T2G, T2S, T2R, T3d, T3g, T3f;
Chris@82 494 T2G = T29 * T2F;
Chris@82 495 T2S = T2b * T2F;
Chris@82 496 T2R = FNMS(KP951056516, T2Q, T2N);
Chris@82 497 cr[WS(rs, 4)] = FNMS(T2b, T2R, T2G);
Chris@82 498 ci[WS(rs, 4)] = FMA(T29, T2R, T2S);
Chris@82 499 T3d = T3b * T3c;
Chris@82 500 T3g = T3e * T3c;
Chris@82 501 T3f = FNMS(KP951056516, T38, T37);
Chris@82 502 cr[WS(rs, 12)] = FNMS(T3e, T3f, T3d);
Chris@82 503 ci[WS(rs, 12)] = FMA(T3b, T3f, T3g);
Chris@82 504 }
Chris@82 505 {
Chris@82 506 E T2W, T30, T2Z, T35, T3a, T39;
Chris@82 507 T2W = T2U * T2V;
Chris@82 508 T30 = T2Y * T2V;
Chris@82 509 T2Z = FMA(KP951056516, T2Q, T2N);
Chris@82 510 cr[WS(rs, 16)] = FNMS(T2Y, T2Z, T2W);
Chris@82 511 ci[WS(rs, 16)] = FMA(T2U, T2Z, T30);
Chris@82 512 T35 = T31 * T34;
Chris@82 513 T3a = T36 * T34;
Chris@82 514 T39 = FMA(KP951056516, T38, T37);
Chris@82 515 cr[WS(rs, 8)] = FNMS(T36, T39, T35);
Chris@82 516 ci[WS(rs, 8)] = FMA(T31, T39, T3a);
Chris@82 517 }
Chris@82 518 }
Chris@82 519 {
Chris@82 520 E T1I, T26, T1F, T25, T1p, T2h, T1P, T21, T1D, T1E;
Chris@82 521 T1I = FNMS(KP618033988, T1H, T1G);
Chris@82 522 T26 = FMA(KP618033988, T1G, T1H);
Chris@82 523 T1D = FNMS(KP250000000, T1C, T1z);
Chris@82 524 T1E = T1A - T1B;
Chris@82 525 T1F = FNMS(KP559016994, T1E, T1D);
Chris@82 526 T25 = FMA(KP559016994, T1E, T1D);
Chris@82 527 {
Chris@82 528 E T1o, T20, TT, T1Z, TR;
Chris@82 529 T1o = FNMS(KP618033988, T1n, T18);
Chris@82 530 T20 = FMA(KP618033988, T18, T1n);
Chris@82 531 TR = FNMS(KP250000000, TQ, TJ);
Chris@82 532 TT = FNMS(KP559016994, TS, TR);
Chris@82 533 T1Z = FMA(KP559016994, TS, TR);
Chris@82 534 T1p = FMA(KP951056516, T1o, TT);
Chris@82 535 T2h = FMA(KP951056516, T20, T1Z);
Chris@82 536 T1P = FNMS(KP951056516, T1o, TT);
Chris@82 537 T21 = FNMS(KP951056516, T20, T1Z);
Chris@82 538 }
Chris@82 539 {
Chris@82 540 E T1q, T1K, T1J, T2i, T2m, T2l;
Chris@82 541 T1q = TI * T1p;
Chris@82 542 T1K = T1s * T1p;
Chris@82 543 T1J = FNMS(KP951056516, T1I, T1F);
Chris@82 544 cr[WS(rs, 2)] = FNMS(T1s, T1J, T1q);
Chris@82 545 ci[WS(rs, 2)] = FMA(TI, T1J, T1K);
Chris@82 546 T2i = T2g * T2h;
Chris@82 547 T2m = T2k * T2h;
Chris@82 548 T2l = FNMS(KP951056516, T26, T25);
Chris@82 549 cr[WS(rs, 14)] = FNMS(T2k, T2l, T2i);
Chris@82 550 ci[WS(rs, 14)] = FMA(T2g, T2l, T2m);
Chris@82 551 }
Chris@82 552 {
Chris@82 553 E T1Q, T1U, T1T, T22, T28, T27;
Chris@82 554 T1Q = T1O * T1P;
Chris@82 555 T1U = T1S * T1P;
Chris@82 556 T1T = FMA(KP951056516, T1I, T1F);
Chris@82 557 cr[WS(rs, 18)] = FNMS(T1S, T1T, T1Q);
Chris@82 558 ci[WS(rs, 18)] = FMA(T1O, T1T, T1U);
Chris@82 559 T22 = T1Y * T21;
Chris@82 560 T28 = T24 * T21;
Chris@82 561 T27 = FMA(KP951056516, T26, T25);
Chris@82 562 cr[WS(rs, 6)] = FNMS(T24, T27, T22);
Chris@82 563 ci[WS(rs, 6)] = FMA(T1Y, T27, T28);
Chris@82 564 }
Chris@82 565 }
Chris@82 566 }
Chris@82 567 }
Chris@82 568 }
Chris@82 569 }
Chris@82 570
Chris@82 571 static const tw_instr twinstr[] = {
Chris@82 572 {TW_CEXP, 1, 1},
Chris@82 573 {TW_CEXP, 1, 3},
Chris@82 574 {TW_CEXP, 1, 9},
Chris@82 575 {TW_CEXP, 1, 19},
Chris@82 576 {TW_NEXT, 1, 0}
Chris@82 577 };
Chris@82 578
Chris@82 579 static const hc2hc_desc desc = { 20, "hb2_20", twinstr, &GENUS, {136, 58, 140, 0} };
Chris@82 580
Chris@82 581 void X(codelet_hb2_20) (planner *p) {
Chris@82 582 X(khc2hc_register) (p, hb2_20, &desc);
Chris@82 583 }
Chris@82 584 #else
Chris@82 585
Chris@82 586 /* Generated by: ../../../genfft/gen_hc2hc.native -compact -variables 4 -pipeline-latency 4 -sign 1 -twiddle-log3 -precompute-twiddles -n 20 -dif -name hb2_20 -include rdft/scalar/hb.h */
Chris@82 587
Chris@82 588 /*
Chris@82 589 * This function contains 276 FP additions, 164 FP multiplications,
Chris@82 590 * (or, 204 additions, 92 multiplications, 72 fused multiply/add),
Chris@82 591 * 137 stack variables, 4 constants, and 80 memory accesses
Chris@82 592 */
Chris@82 593 #include "rdft/scalar/hb.h"
Chris@82 594
Chris@82 595 static void hb2_20(R *cr, R *ci, const R *W, stride rs, INT mb, INT me, INT ms)
Chris@82 596 {
Chris@82 597 DK(KP250000000, +0.250000000000000000000000000000000000000000000);
Chris@82 598 DK(KP559016994, +0.559016994374947424102293417182819058860154590);
Chris@82 599 DK(KP587785252, +0.587785252292473129168705954639072768597652438);
Chris@82 600 DK(KP951056516, +0.951056516295153572116439333379382143405698634);
Chris@82 601 {
Chris@82 602 INT m;
Chris@82 603 for (m = mb, W = W + ((mb - 1) * 8); m < me; m = m + 1, cr = cr + ms, ci = ci - ms, W = W + 8, MAKE_VOLATILE_STRIDE(40, rs)) {
Chris@82 604 E TD, TG, TE, TH, TJ, T1t, T27, T25, T1T, T1R, T1V, T2j, T2Z, T21, T2X;
Chris@82 605 E T2T, T2n, T2P, T3V, T41, T3R, T3X, T29, T2c, T4H, T4L, T1L, T1M, T1N, T2d;
Chris@82 606 E T4R, T1P, T4P, T49, T2N, T2f, T47, T2L;
Chris@82 607 {
Chris@82 608 E T1U, T2l, T1Z, T2i, T1S, T2m, T20, T2h;
Chris@82 609 {
Chris@82 610 E TF, T1s, TI, T1r;
Chris@82 611 TD = W[0];
Chris@82 612 TG = W[1];
Chris@82 613 TE = W[2];
Chris@82 614 TH = W[3];
Chris@82 615 TF = TD * TE;
Chris@82 616 T1s = TG * TE;
Chris@82 617 TI = TG * TH;
Chris@82 618 T1r = TD * TH;
Chris@82 619 TJ = TF + TI;
Chris@82 620 T1t = T1r - T1s;
Chris@82 621 T27 = T1r + T1s;
Chris@82 622 T25 = TF - TI;
Chris@82 623 T1T = W[5];
Chris@82 624 T1U = TH * T1T;
Chris@82 625 T2l = TD * T1T;
Chris@82 626 T1Z = TE * T1T;
Chris@82 627 T2i = TG * T1T;
Chris@82 628 T1R = W[4];
Chris@82 629 T1S = TE * T1R;
Chris@82 630 T2m = TG * T1R;
Chris@82 631 T20 = TH * T1R;
Chris@82 632 T2h = TD * T1R;
Chris@82 633 }
Chris@82 634 T1V = T1S + T1U;
Chris@82 635 T2j = T2h - T2i;
Chris@82 636 T2Z = T1Z + T20;
Chris@82 637 T21 = T1Z - T20;
Chris@82 638 T2X = T1S - T1U;
Chris@82 639 T2T = T2l - T2m;
Chris@82 640 T2n = T2l + T2m;
Chris@82 641 T2P = T2h + T2i;
Chris@82 642 {
Chris@82 643 E T3T, T3U, T3P, T3Q;
Chris@82 644 T3T = TJ * T1T;
Chris@82 645 T3U = T1t * T1R;
Chris@82 646 T3V = T3T - T3U;
Chris@82 647 T41 = T3T + T3U;
Chris@82 648 T3P = TJ * T1R;
Chris@82 649 T3Q = T1t * T1T;
Chris@82 650 T3R = T3P + T3Q;
Chris@82 651 T3X = T3P - T3Q;
Chris@82 652 {
Chris@82 653 E T26, T28, T2a, T2b;
Chris@82 654 T26 = T25 * T1R;
Chris@82 655 T28 = T27 * T1T;
Chris@82 656 T29 = T26 + T28;
Chris@82 657 T2a = T25 * T1T;
Chris@82 658 T2b = T27 * T1R;
Chris@82 659 T2c = T2a - T2b;
Chris@82 660 T4H = T26 - T28;
Chris@82 661 T4L = T2a + T2b;
Chris@82 662 T1L = W[6];
Chris@82 663 T1M = W[7];
Chris@82 664 T1N = FMA(TD, T1L, TG * T1M);
Chris@82 665 T2d = FMA(T29, T1L, T2c * T1M);
Chris@82 666 T4R = FNMS(T1t, T1L, TJ * T1M);
Chris@82 667 T1P = FNMS(TG, T1L, TD * T1M);
Chris@82 668 T4P = FMA(TJ, T1L, T1t * T1M);
Chris@82 669 T49 = FNMS(T27, T1L, T25 * T1M);
Chris@82 670 T2N = FNMS(TH, T1L, TE * T1M);
Chris@82 671 T2f = FNMS(T2c, T1L, T29 * T1M);
Chris@82 672 T47 = FMA(T25, T1L, T27 * T1M);
Chris@82 673 T2L = FMA(TE, T1L, TH * T1M);
Chris@82 674 }
Chris@82 675 }
Chris@82 676 }
Chris@82 677 {
Chris@82 678 E T7, T4i, T4x, TK, T1D, T3i, T3E, T2D, T19, T3L, T3M, T1o, T2x, T4C, T4B;
Chris@82 679 E T2u, T1v, T4r, T4o, T1u, T2H, T37, T2I, T3e, T3p, T3w, T3x, Tm, TB, TC;
Chris@82 680 E T4u, T4v, T4y, T2A, T2B, T2E, T1E, T1F, T1G, T4d, T4g, T4j, T3F, T3G, T3H;
Chris@82 681 E TN, TQ, TR, T48, T4a;
Chris@82 682 {
Chris@82 683 E T3, T3g, T1C, T3h, T6, T3D, T1z, T3C;
Chris@82 684 {
Chris@82 685 E T1, T2, T1A, T1B;
Chris@82 686 T1 = cr[0];
Chris@82 687 T2 = ci[WS(rs, 9)];
Chris@82 688 T3 = T1 + T2;
Chris@82 689 T3g = T1 - T2;
Chris@82 690 T1A = ci[WS(rs, 14)];
Chris@82 691 T1B = cr[WS(rs, 15)];
Chris@82 692 T1C = T1A - T1B;
Chris@82 693 T3h = T1A + T1B;
Chris@82 694 }
Chris@82 695 {
Chris@82 696 E T4, T5, T1x, T1y;
Chris@82 697 T4 = cr[WS(rs, 5)];
Chris@82 698 T5 = ci[WS(rs, 4)];
Chris@82 699 T6 = T4 + T5;
Chris@82 700 T3D = T4 - T5;
Chris@82 701 T1x = ci[WS(rs, 19)];
Chris@82 702 T1y = cr[WS(rs, 10)];
Chris@82 703 T1z = T1x - T1y;
Chris@82 704 T3C = T1x + T1y;
Chris@82 705 }
Chris@82 706 T7 = T3 + T6;
Chris@82 707 T4i = T3g - T3h;
Chris@82 708 T4x = T3D + T3C;
Chris@82 709 TK = T3 - T6;
Chris@82 710 T1D = T1z - T1C;
Chris@82 711 T3i = T3g + T3h;
Chris@82 712 T3E = T3C - T3D;
Chris@82 713 T2D = T1z + T1C;
Chris@82 714 }
Chris@82 715 {
Chris@82 716 E Te, T4b, T4m, TL, T11, T33, T3l, T2s, TA, T4f, T4q, TP, T1n, T3d, T3v;
Chris@82 717 E T2w, Tl, T4c, T4n, TM, T18, T36, T3o, T2t, Tt, T4e, T4p, TO, T1g, T3a;
Chris@82 718 E T3s, T2v;
Chris@82 719 {
Chris@82 720 E Ta, T3j, T10, T3k, Td, T32, TX, T31;
Chris@82 721 {
Chris@82 722 E T8, T9, TY, TZ;
Chris@82 723 T8 = cr[WS(rs, 4)];
Chris@82 724 T9 = ci[WS(rs, 5)];
Chris@82 725 Ta = T8 + T9;
Chris@82 726 T3j = T8 - T9;
Chris@82 727 TY = ci[WS(rs, 10)];
Chris@82 728 TZ = cr[WS(rs, 19)];
Chris@82 729 T10 = TY - TZ;
Chris@82 730 T3k = TY + TZ;
Chris@82 731 }
Chris@82 732 {
Chris@82 733 E Tb, Tc, TV, TW;
Chris@82 734 Tb = cr[WS(rs, 9)];
Chris@82 735 Tc = ci[0];
Chris@82 736 Td = Tb + Tc;
Chris@82 737 T32 = Tb - Tc;
Chris@82 738 TV = ci[WS(rs, 15)];
Chris@82 739 TW = cr[WS(rs, 14)];
Chris@82 740 TX = TV - TW;
Chris@82 741 T31 = TV + TW;
Chris@82 742 }
Chris@82 743 Te = Ta + Td;
Chris@82 744 T4b = T3j - T3k;
Chris@82 745 T4m = T32 + T31;
Chris@82 746 TL = Ta - Td;
Chris@82 747 T11 = TX - T10;
Chris@82 748 T33 = T31 - T32;
Chris@82 749 T3l = T3j + T3k;
Chris@82 750 T2s = TX + T10;
Chris@82 751 }
Chris@82 752 {
Chris@82 753 E Tw, T3t, Tz, T3b, T1j, T3c, T1m, T3u;
Chris@82 754 {
Chris@82 755 E Tu, Tv, Tx, Ty;
Chris@82 756 Tu = ci[WS(rs, 7)];
Chris@82 757 Tv = cr[WS(rs, 2)];
Chris@82 758 Tw = Tu + Tv;
Chris@82 759 T3t = Tu - Tv;
Chris@82 760 Tx = ci[WS(rs, 2)];
Chris@82 761 Ty = cr[WS(rs, 7)];
Chris@82 762 Tz = Tx + Ty;
Chris@82 763 T3b = Tx - Ty;
Chris@82 764 }
Chris@82 765 {
Chris@82 766 E T1h, T1i, T1k, T1l;
Chris@82 767 T1h = ci[WS(rs, 17)];
Chris@82 768 T1i = cr[WS(rs, 12)];
Chris@82 769 T1j = T1h - T1i;
Chris@82 770 T3c = T1h + T1i;
Chris@82 771 T1k = ci[WS(rs, 12)];
Chris@82 772 T1l = cr[WS(rs, 17)];
Chris@82 773 T1m = T1k - T1l;
Chris@82 774 T3u = T1k + T1l;
Chris@82 775 }
Chris@82 776 TA = Tw + Tz;
Chris@82 777 T4f = T3t + T3u;
Chris@82 778 T4q = T3b - T3c;
Chris@82 779 TP = Tw - Tz;
Chris@82 780 T1n = T1j - T1m;
Chris@82 781 T3d = T3b + T3c;
Chris@82 782 T3v = T3t - T3u;
Chris@82 783 T2w = T1j + T1m;
Chris@82 784 }
Chris@82 785 {
Chris@82 786 E Th, T3m, T17, T3n, Tk, T34, T14, T35;
Chris@82 787 {
Chris@82 788 E Tf, Tg, T15, T16;
Chris@82 789 Tf = ci[WS(rs, 3)];
Chris@82 790 Tg = cr[WS(rs, 6)];
Chris@82 791 Th = Tf + Tg;
Chris@82 792 T3m = Tf - Tg;
Chris@82 793 T15 = ci[WS(rs, 18)];
Chris@82 794 T16 = cr[WS(rs, 11)];
Chris@82 795 T17 = T15 - T16;
Chris@82 796 T3n = T15 + T16;
Chris@82 797 }
Chris@82 798 {
Chris@82 799 E Ti, Tj, T12, T13;
Chris@82 800 Ti = cr[WS(rs, 1)];
Chris@82 801 Tj = ci[WS(rs, 8)];
Chris@82 802 Tk = Ti + Tj;
Chris@82 803 T34 = Ti - Tj;
Chris@82 804 T12 = ci[WS(rs, 13)];
Chris@82 805 T13 = cr[WS(rs, 16)];
Chris@82 806 T14 = T12 - T13;
Chris@82 807 T35 = T12 + T13;
Chris@82 808 }
Chris@82 809 Tl = Th + Tk;
Chris@82 810 T4c = T3m - T3n;
Chris@82 811 T4n = T34 - T35;
Chris@82 812 TM = Th - Tk;
Chris@82 813 T18 = T14 - T17;
Chris@82 814 T36 = T34 + T35;
Chris@82 815 T3o = T3m + T3n;
Chris@82 816 T2t = T14 + T17;
Chris@82 817 }
Chris@82 818 {
Chris@82 819 E Tp, T3q, T1f, T3r, Ts, T39, T1c, T38;
Chris@82 820 {
Chris@82 821 E Tn, To, T1d, T1e;
Chris@82 822 Tn = cr[WS(rs, 8)];
Chris@82 823 To = ci[WS(rs, 1)];
Chris@82 824 Tp = Tn + To;
Chris@82 825 T3q = Tn - To;
Chris@82 826 T1d = ci[WS(rs, 16)];
Chris@82 827 T1e = cr[WS(rs, 13)];
Chris@82 828 T1f = T1d - T1e;
Chris@82 829 T3r = T1d + T1e;
Chris@82 830 }
Chris@82 831 {
Chris@82 832 E Tq, Tr, T1a, T1b;
Chris@82 833 Tq = ci[WS(rs, 6)];
Chris@82 834 Tr = cr[WS(rs, 3)];
Chris@82 835 Ts = Tq + Tr;
Chris@82 836 T39 = Tq - Tr;
Chris@82 837 T1a = ci[WS(rs, 11)];
Chris@82 838 T1b = cr[WS(rs, 18)];
Chris@82 839 T1c = T1a - T1b;
Chris@82 840 T38 = T1a + T1b;
Chris@82 841 }
Chris@82 842 Tt = Tp + Ts;
Chris@82 843 T4e = T3q + T3r;
Chris@82 844 T4p = T39 + T38;
Chris@82 845 TO = Tp - Ts;
Chris@82 846 T1g = T1c - T1f;
Chris@82 847 T3a = T38 - T39;
Chris@82 848 T3s = T3q - T3r;
Chris@82 849 T2v = T1c + T1f;
Chris@82 850 }
Chris@82 851 T19 = T11 - T18;
Chris@82 852 T3L = T3l - T3o;
Chris@82 853 T3M = T3s - T3v;
Chris@82 854 T1o = T1g - T1n;
Chris@82 855 T2x = T2v - T2w;
Chris@82 856 T4C = T4e - T4f;
Chris@82 857 T4B = T4b - T4c;
Chris@82 858 T2u = T2s - T2t;
Chris@82 859 T1v = TO - TP;
Chris@82 860 T4r = T4p - T4q;
Chris@82 861 T4o = T4m - T4n;
Chris@82 862 T1u = TL - TM;
Chris@82 863 T2H = Te - Tl;
Chris@82 864 T37 = T33 + T36;
Chris@82 865 T2I = Tt - TA;
Chris@82 866 T3e = T3a + T3d;
Chris@82 867 T3p = T3l + T3o;
Chris@82 868 T3w = T3s + T3v;
Chris@82 869 T3x = T3p + T3w;
Chris@82 870 Tm = Te + Tl;
Chris@82 871 TB = Tt + TA;
Chris@82 872 TC = Tm + TB;
Chris@82 873 T4u = T4m + T4n;
Chris@82 874 T4v = T4p + T4q;
Chris@82 875 T4y = T4u + T4v;
Chris@82 876 T2A = T2s + T2t;
Chris@82 877 T2B = T2v + T2w;
Chris@82 878 T2E = T2A + T2B;
Chris@82 879 T1E = T11 + T18;
Chris@82 880 T1F = T1g + T1n;
Chris@82 881 T1G = T1E + T1F;
Chris@82 882 T4d = T4b + T4c;
Chris@82 883 T4g = T4e + T4f;
Chris@82 884 T4j = T4d + T4g;
Chris@82 885 T3F = T33 - T36;
Chris@82 886 T3G = T3a - T3d;
Chris@82 887 T3H = T3F + T3G;
Chris@82 888 TN = TL + TM;
Chris@82 889 TQ = TO + TP;
Chris@82 890 TR = TN + TQ;
Chris@82 891 }
Chris@82 892 cr[0] = T7 + TC;
Chris@82 893 ci[0] = T2D + T2E;
Chris@82 894 {
Chris@82 895 E T2k, T2o, T4T, T4U;
Chris@82 896 T2k = TK + TR;
Chris@82 897 T2o = T1D + T1G;
Chris@82 898 cr[WS(rs, 10)] = FNMS(T2n, T2o, T2j * T2k);
Chris@82 899 ci[WS(rs, 10)] = FMA(T2n, T2k, T2j * T2o);
Chris@82 900 T4T = T4i + T4j;
Chris@82 901 T4U = T4x + T4y;
Chris@82 902 cr[WS(rs, 5)] = FNMS(T2c, T4U, T29 * T4T);
Chris@82 903 ci[WS(rs, 5)] = FMA(T29, T4U, T2c * T4T);
Chris@82 904 }
Chris@82 905 T48 = T3i + T3x;
Chris@82 906 T4a = T3E + T3H;
Chris@82 907 cr[WS(rs, 15)] = FNMS(T49, T4a, T47 * T48);
Chris@82 908 ci[WS(rs, 15)] = FMA(T47, T4a, T49 * T48);
Chris@82 909 {
Chris@82 910 E T2y, T2J, T2V, T2R, T2G, T2U, T2r, T2Q;
Chris@82 911 T2y = FMA(KP951056516, T2u, KP587785252 * T2x);
Chris@82 912 T2J = FMA(KP951056516, T2H, KP587785252 * T2I);
Chris@82 913 T2V = FNMS(KP951056516, T2I, KP587785252 * T2H);
Chris@82 914 T2R = FNMS(KP951056516, T2x, KP587785252 * T2u);
Chris@82 915 {
Chris@82 916 E T2C, T2F, T2p, T2q;
Chris@82 917 T2C = KP559016994 * (T2A - T2B);
Chris@82 918 T2F = FNMS(KP250000000, T2E, T2D);
Chris@82 919 T2G = T2C + T2F;
Chris@82 920 T2U = T2F - T2C;
Chris@82 921 T2p = KP559016994 * (Tm - TB);
Chris@82 922 T2q = FNMS(KP250000000, TC, T7);
Chris@82 923 T2r = T2p + T2q;
Chris@82 924 T2Q = T2q - T2p;
Chris@82 925 }
Chris@82 926 {
Chris@82 927 E T2z, T2K, T2Y, T30;
Chris@82 928 T2z = T2r + T2y;
Chris@82 929 T2K = T2G - T2J;
Chris@82 930 cr[WS(rs, 4)] = FNMS(T27, T2K, T25 * T2z);
Chris@82 931 ci[WS(rs, 4)] = FMA(T27, T2z, T25 * T2K);
Chris@82 932 T2Y = T2Q - T2R;
Chris@82 933 T30 = T2V + T2U;
Chris@82 934 cr[WS(rs, 12)] = FNMS(T2Z, T30, T2X * T2Y);
Chris@82 935 ci[WS(rs, 12)] = FMA(T2Z, T2Y, T2X * T30);
Chris@82 936 }
Chris@82 937 {
Chris@82 938 E T2M, T2O, T2S, T2W;
Chris@82 939 T2M = T2r - T2y;
Chris@82 940 T2O = T2J + T2G;
Chris@82 941 cr[WS(rs, 16)] = FNMS(T2N, T2O, T2L * T2M);
Chris@82 942 ci[WS(rs, 16)] = FMA(T2N, T2M, T2L * T2O);
Chris@82 943 T2S = T2Q + T2R;
Chris@82 944 T2W = T2U - T2V;
Chris@82 945 cr[WS(rs, 8)] = FNMS(T2T, T2W, T2P * T2S);
Chris@82 946 ci[WS(rs, 8)] = FMA(T2T, T2S, T2P * T2W);
Chris@82 947 }
Chris@82 948 }
Chris@82 949 {
Chris@82 950 E T4s, T4D, T4N, T4I, T4A, T4M, T4l, T4J;
Chris@82 951 T4s = FMA(KP951056516, T4o, KP587785252 * T4r);
Chris@82 952 T4D = FMA(KP951056516, T4B, KP587785252 * T4C);
Chris@82 953 T4N = FNMS(KP951056516, T4C, KP587785252 * T4B);
Chris@82 954 T4I = FNMS(KP951056516, T4r, KP587785252 * T4o);
Chris@82 955 {
Chris@82 956 E T4w, T4z, T4h, T4k;
Chris@82 957 T4w = KP559016994 * (T4u - T4v);
Chris@82 958 T4z = FNMS(KP250000000, T4y, T4x);
Chris@82 959 T4A = T4w + T4z;
Chris@82 960 T4M = T4z - T4w;
Chris@82 961 T4h = KP559016994 * (T4d - T4g);
Chris@82 962 T4k = FNMS(KP250000000, T4j, T4i);
Chris@82 963 T4l = T4h + T4k;
Chris@82 964 T4J = T4k - T4h;
Chris@82 965 }
Chris@82 966 {
Chris@82 967 E T4t, T4E, T4Q, T4S;
Chris@82 968 T4t = T4l - T4s;
Chris@82 969 T4E = T4A + T4D;
Chris@82 970 cr[WS(rs, 1)] = FNMS(TG, T4E, TD * T4t);
Chris@82 971 ci[WS(rs, 1)] = FMA(TD, T4E, TG * T4t);
Chris@82 972 T4Q = T4J - T4I;
Chris@82 973 T4S = T4M + T4N;
Chris@82 974 cr[WS(rs, 17)] = FNMS(T4R, T4S, T4P * T4Q);
Chris@82 975 ci[WS(rs, 17)] = FMA(T4P, T4S, T4R * T4Q);
Chris@82 976 }
Chris@82 977 {
Chris@82 978 E T4F, T4G, T4K, T4O;
Chris@82 979 T4F = T4s + T4l;
Chris@82 980 T4G = T4A - T4D;
Chris@82 981 cr[WS(rs, 9)] = FNMS(T1T, T4G, T1R * T4F);
Chris@82 982 ci[WS(rs, 9)] = FMA(T1R, T4G, T1T * T4F);
Chris@82 983 T4K = T4I + T4J;
Chris@82 984 T4O = T4M - T4N;
Chris@82 985 cr[WS(rs, 13)] = FNMS(T4L, T4O, T4H * T4K);
Chris@82 986 ci[WS(rs, 13)] = FMA(T4H, T4O, T4L * T4K);
Chris@82 987 }
Chris@82 988 }
Chris@82 989 {
Chris@82 990 E T1p, T1w, T22, T1X, T1J, T23, TU, T1W;
Chris@82 991 T1p = FNMS(KP951056516, T1o, KP587785252 * T19);
Chris@82 992 T1w = FNMS(KP951056516, T1v, KP587785252 * T1u);
Chris@82 993 T22 = FMA(KP951056516, T1u, KP587785252 * T1v);
Chris@82 994 T1X = FMA(KP951056516, T19, KP587785252 * T1o);
Chris@82 995 {
Chris@82 996 E T1H, T1I, TS, TT;
Chris@82 997 T1H = FNMS(KP250000000, T1G, T1D);
Chris@82 998 T1I = KP559016994 * (T1E - T1F);
Chris@82 999 T1J = T1H - T1I;
Chris@82 1000 T23 = T1I + T1H;
Chris@82 1001 TS = FNMS(KP250000000, TR, TK);
Chris@82 1002 TT = KP559016994 * (TN - TQ);
Chris@82 1003 TU = TS - TT;
Chris@82 1004 T1W = TT + TS;
Chris@82 1005 }
Chris@82 1006 {
Chris@82 1007 E T1q, T1K, T2e, T2g;
Chris@82 1008 T1q = TU - T1p;
Chris@82 1009 T1K = T1w + T1J;
Chris@82 1010 cr[WS(rs, 2)] = FNMS(T1t, T1K, TJ * T1q);
Chris@82 1011 ci[WS(rs, 2)] = FMA(T1t, T1q, TJ * T1K);
Chris@82 1012 T2e = T1W + T1X;
Chris@82 1013 T2g = T23 - T22;
Chris@82 1014 cr[WS(rs, 14)] = FNMS(T2f, T2g, T2d * T2e);
Chris@82 1015 ci[WS(rs, 14)] = FMA(T2f, T2e, T2d * T2g);
Chris@82 1016 }
Chris@82 1017 {
Chris@82 1018 E T1O, T1Q, T1Y, T24;
Chris@82 1019 T1O = TU + T1p;
Chris@82 1020 T1Q = T1J - T1w;
Chris@82 1021 cr[WS(rs, 18)] = FNMS(T1P, T1Q, T1N * T1O);
Chris@82 1022 ci[WS(rs, 18)] = FMA(T1P, T1O, T1N * T1Q);
Chris@82 1023 T1Y = T1W - T1X;
Chris@82 1024 T24 = T22 + T23;
Chris@82 1025 cr[WS(rs, 6)] = FNMS(T21, T24, T1V * T1Y);
Chris@82 1026 ci[WS(rs, 6)] = FMA(T21, T1Y, T1V * T24);
Chris@82 1027 }
Chris@82 1028 }
Chris@82 1029 {
Chris@82 1030 E T3f, T3N, T43, T3Z, T3K, T42, T3A, T3Y;
Chris@82 1031 T3f = FNMS(KP951056516, T3e, KP587785252 * T37);
Chris@82 1032 T3N = FNMS(KP951056516, T3M, KP587785252 * T3L);
Chris@82 1033 T43 = FMA(KP951056516, T3L, KP587785252 * T3M);
Chris@82 1034 T3Z = FMA(KP951056516, T37, KP587785252 * T3e);
Chris@82 1035 {
Chris@82 1036 E T3I, T3J, T3y, T3z;
Chris@82 1037 T3I = FNMS(KP250000000, T3H, T3E);
Chris@82 1038 T3J = KP559016994 * (T3F - T3G);
Chris@82 1039 T3K = T3I - T3J;
Chris@82 1040 T42 = T3J + T3I;
Chris@82 1041 T3y = FNMS(KP250000000, T3x, T3i);
Chris@82 1042 T3z = KP559016994 * (T3p - T3w);
Chris@82 1043 T3A = T3y - T3z;
Chris@82 1044 T3Y = T3z + T3y;
Chris@82 1045 }
Chris@82 1046 {
Chris@82 1047 E T3B, T3O, T45, T46;
Chris@82 1048 T3B = T3f + T3A;
Chris@82 1049 T3O = T3K - T3N;
Chris@82 1050 cr[WS(rs, 3)] = FNMS(TH, T3O, TE * T3B);
Chris@82 1051 ci[WS(rs, 3)] = FMA(TE, T3O, TH * T3B);
Chris@82 1052 T45 = T3Z + T3Y;
Chris@82 1053 T46 = T42 - T43;
Chris@82 1054 cr[WS(rs, 19)] = FNMS(T1M, T46, T1L * T45);
Chris@82 1055 ci[WS(rs, 19)] = FMA(T1L, T46, T1M * T45);
Chris@82 1056 }
Chris@82 1057 {
Chris@82 1058 E T3S, T3W, T40, T44;
Chris@82 1059 T3S = T3A - T3f;
Chris@82 1060 T3W = T3K + T3N;
Chris@82 1061 cr[WS(rs, 7)] = FNMS(T3V, T3W, T3R * T3S);
Chris@82 1062 ci[WS(rs, 7)] = FMA(T3R, T3W, T3V * T3S);
Chris@82 1063 T40 = T3Y - T3Z;
Chris@82 1064 T44 = T42 + T43;
Chris@82 1065 cr[WS(rs, 11)] = FNMS(T41, T44, T3X * T40);
Chris@82 1066 ci[WS(rs, 11)] = FMA(T3X, T44, T41 * T40);
Chris@82 1067 }
Chris@82 1068 }
Chris@82 1069 }
Chris@82 1070 }
Chris@82 1071 }
Chris@82 1072 }
Chris@82 1073
Chris@82 1074 static const tw_instr twinstr[] = {
Chris@82 1075 {TW_CEXP, 1, 1},
Chris@82 1076 {TW_CEXP, 1, 3},
Chris@82 1077 {TW_CEXP, 1, 9},
Chris@82 1078 {TW_CEXP, 1, 19},
Chris@82 1079 {TW_NEXT, 1, 0}
Chris@82 1080 };
Chris@82 1081
Chris@82 1082 static const hc2hc_desc desc = { 20, "hb2_20", twinstr, &GENUS, {204, 92, 72, 0} };
Chris@82 1083
Chris@82 1084 void X(codelet_hb2_20) (planner *p) {
Chris@82 1085 X(khc2hc_register) (p, hb2_20, &desc);
Chris@82 1086 }
Chris@82 1087 #endif