annotate src/fftw-3.3.5/rdft/scalar/r2cb/hb_12.c @ 83:ae30d91d2ffe

Replace these with versions built using an older toolset (so as to avoid ABI incompatibilities when linking on Ubuntu 14.04 for packaging purposes)
author Chris Cannam
date Fri, 07 Feb 2020 11:51:13 +0000
parents 2cd0e3b3e1fd
children
rev   line source
Chris@42 1 /*
Chris@42 2 * Copyright (c) 2003, 2007-14 Matteo Frigo
Chris@42 3 * Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
Chris@42 4 *
Chris@42 5 * This program is free software; you can redistribute it and/or modify
Chris@42 6 * it under the terms of the GNU General Public License as published by
Chris@42 7 * the Free Software Foundation; either version 2 of the License, or
Chris@42 8 * (at your option) any later version.
Chris@42 9 *
Chris@42 10 * This program is distributed in the hope that it will be useful,
Chris@42 11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
Chris@42 12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
Chris@42 13 * GNU General Public License for more details.
Chris@42 14 *
Chris@42 15 * You should have received a copy of the GNU General Public License
Chris@42 16 * along with this program; if not, write to the Free Software
Chris@42 17 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
Chris@42 18 *
Chris@42 19 */
Chris@42 20
Chris@42 21 /* This file was automatically generated --- DO NOT EDIT */
Chris@42 22 /* Generated on Sat Jul 30 16:49:44 EDT 2016 */
Chris@42 23
Chris@42 24 #include "codelet-rdft.h"
Chris@42 25
Chris@42 26 #ifdef HAVE_FMA
Chris@42 27
Chris@42 28 /* Generated by: ../../../genfft/gen_hc2hc.native -fma -reorder-insns -schedule-for-pipeline -compact -variables 4 -pipeline-latency 4 -sign 1 -n 12 -dif -name hb_12 -include hb.h */
Chris@42 29
Chris@42 30 /*
Chris@42 31 * This function contains 118 FP additions, 68 FP multiplications,
Chris@42 32 * (or, 72 additions, 22 multiplications, 46 fused multiply/add),
Chris@42 33 * 64 stack variables, 2 constants, and 48 memory accesses
Chris@42 34 */
Chris@42 35 #include "hb.h"
Chris@42 36
/*
 * hb_12 (HAVE_FMA variant): machine-generated size-12 codelet; see the
 * "Generated by" line above (gen_hc2hc, -n 12 -dif -sign 1).
 *
 * For each m in [mb, me) the body reads twelve values through cr and
 * twelve through ci at offsets WS(rs, 0) .. WS(rs, 11), combines them
 * using the two constants declared below, and stores twelve results back
 * through each pointer at the same offsets.  All loads of an iteration
 * precede its first store.
 *
 * Loop stepping: W is first offset by (mb - 1) * 22; after every
 * iteration cr moves forward by ms, ci moves backward by ms, and W moves
 * forward by 22 entries.
 *
 * Twiddles: the results stored at offset 0 are plain sums.  For
 * k = 1 .. 11 the pair stored at cr/ci[WS(rs, k)] is formed from two
 * intermediate values together with W[2k - 2] and W[2k - 1].
 *
 * NOTE(review): R, E, INT, stride, WS, DK, FMA, FNMS and
 * MAKE_VOLATILE_STRIDE come from headers outside this file.  FMA(a, b, c)
 * is presumably a * b + c and FNMS(a, b, c) presumably c - a * b, which
 * would make each twiddled store pair a complex multiplication --
 * confirm against the codelet headers.
 */
Chris@42 37 static void hb_12(R *cr, R *ci, const R *W, stride rs, INT mb, INT me, INT ms)
Chris@42 38 {
/* 0.866025403... is numerically sqrt(3)/2. */
Chris@42 39 DK(KP866025403, +0.866025403784438646763723170752936183471402627);
Chris@42 40 DK(KP500000000, +0.500000000000000000000000000000000000000000000);
Chris@42 41 {
Chris@42 42 INT m;
Chris@42 43 for (m = mb, W = W + ((mb - 1) * 22); m < me; m = m + 1, cr = cr + ms, ci = ci - ms, W = W + 22, MAKE_VOLATILE_STRIDE(24, rs)) {
/* Declared at loop scope because the stores to offset 7 at the very end
 * of the iteration use them after the nested blocks have closed. */
Chris@42 44 E T1U, T1X, T1W, T1Y, T1V;
Chris@42 45 {
Chris@42 46 E T18, T20, T2a, T1s, T21, T1b, T29, T1p, TO, T11, To, Tb, Tg, T23, T1f;
Chris@42 47 E Ty, Tl, Tt, T1z, T2d, T1i, T24, T1w, T2c;
/* Load phase: fetch all 24 inputs and form the first-level sums,
 * differences and constant-scaled combinations. */
Chris@42 48 {
Chris@42 49 E T5, TN, Ta, TI;
Chris@42 50 {
Chris@42 51 E T1, TE, TM, T6, TJ, T1o, T4, T17, TH, TK, T7, T8;
Chris@42 52 T1 = cr[0];
Chris@42 53 TE = ci[WS(rs, 11)];
Chris@42 54 TM = cr[WS(rs, 6)];
Chris@42 55 T6 = ci[WS(rs, 5)];
Chris@42 56 {
Chris@42 57 E T2, T3, TF, TG;
Chris@42 58 T2 = cr[WS(rs, 4)];
Chris@42 59 T3 = ci[WS(rs, 3)];
Chris@42 60 TF = ci[WS(rs, 7)];
Chris@42 61 TG = cr[WS(rs, 8)];
Chris@42 62 TJ = ci[WS(rs, 9)];
Chris@42 63 T1o = T2 - T3;
Chris@42 64 T4 = T2 + T3;
Chris@42 65 T17 = TF + TG;
Chris@42 66 TH = TF - TG;
Chris@42 67 TK = cr[WS(rs, 10)];
Chris@42 68 T7 = ci[WS(rs, 1)];
Chris@42 69 T8 = cr[WS(rs, 2)];
Chris@42 70 }
Chris@42 71 {
Chris@42 72 E T1a, T1r, T1q, T19, TL, T9, T16, T1n;
Chris@42 73 T5 = T1 + T4;
Chris@42 74 T16 = FNMS(KP500000000, T4, T1);
Chris@42 75 T1a = TJ + TK;
Chris@42 76 TL = TJ - TK;
Chris@42 77 T1r = T7 - T8;
Chris@42 78 T9 = T7 + T8;
Chris@42 79 T18 = FNMS(KP866025403, T17, T16);
Chris@42 80 T20 = FMA(KP866025403, T17, T16);
Chris@42 81 T1q = FMA(KP500000000, TL, TM);
Chris@42 82 TN = TL - TM;
Chris@42 83 Ta = T6 + T9;
Chris@42 84 T19 = FNMS(KP500000000, T9, T6);
Chris@42 85 T1n = FNMS(KP500000000, TH, TE);
Chris@42 86 TI = TE + TH;
Chris@42 87 T2a = FMA(KP866025403, T1r, T1q);
Chris@42 88 T1s = FNMS(KP866025403, T1r, T1q);
Chris@42 89 T21 = FNMS(KP866025403, T1a, T19);
Chris@42 90 T1b = FMA(KP866025403, T1a, T19);
Chris@42 91 T29 = FNMS(KP866025403, T1o, T1n);
Chris@42 92 T1p = FMA(KP866025403, T1o, T1n);
Chris@42 93 }
Chris@42 94 }
Chris@42 95 {
Chris@42 96 E Tc, Tp, Tx, Th, Tu, Tf, T1v, Ts, T1e, Tv, Ti, Tj;
Chris@42 97 Tc = cr[WS(rs, 3)];
Chris@42 98 TO = TI - TN;
Chris@42 99 T11 = TI + TN;
Chris@42 100 Tp = ci[WS(rs, 8)];
Chris@42 101 To = T5 - Ta;
Chris@42 102 Tb = T5 + Ta;
Chris@42 103 Tx = cr[WS(rs, 9)];
Chris@42 104 Th = ci[WS(rs, 2)];
Chris@42 105 {
Chris@42 106 E Td, Te, Tq, Tr;
Chris@42 107 Td = ci[WS(rs, 4)];
Chris@42 108 Te = ci[0];
Chris@42 109 Tq = cr[WS(rs, 7)];
Chris@42 110 Tr = cr[WS(rs, 11)];
Chris@42 111 Tu = ci[WS(rs, 10)];
Chris@42 112 Tf = Td + Te;
Chris@42 113 T1v = Td - Te;
Chris@42 114 Ts = Tq + Tr;
Chris@42 115 T1e = Tq - Tr;
Chris@42 116 Tv = ci[WS(rs, 6)];
Chris@42 117 Ti = cr[WS(rs, 1)];
Chris@42 118 Tj = cr[WS(rs, 5)];
Chris@42 119 }
Chris@42 120 {
Chris@42 121 E T1h, T1y, T1x, T1g, Tw, Tk, T1d, T1u;
Chris@42 122 T1d = FNMS(KP500000000, Tf, Tc);
Chris@42 123 Tg = Tc + Tf;
Chris@42 124 Tw = Tu + Tv;
Chris@42 125 T1h = Tv - Tu;
Chris@42 126 Tk = Ti + Tj;
Chris@42 127 T1y = Ti - Tj;
Chris@42 128 T23 = FNMS(KP866025403, T1e, T1d);
Chris@42 129 T1f = FMA(KP866025403, T1e, T1d);
Chris@42 130 Ty = Tw - Tx;
Chris@42 131 T1x = FMA(KP500000000, Tw, Tx);
Chris@42 132 T1g = FNMS(KP500000000, Tk, Th);
Chris@42 133 Tl = Th + Tk;
Chris@42 134 Tt = Tp - Ts;
Chris@42 135 T1u = FMA(KP500000000, Ts, Tp);
Chris@42 136 T1z = FNMS(KP866025403, T1y, T1x);
Chris@42 137 T2d = FMA(KP866025403, T1y, T1x);
Chris@42 138 T1i = FMA(KP866025403, T1h, T1g);
Chris@42 139 T24 = FNMS(KP866025403, T1h, T1g);
Chris@42 140 T1w = FMA(KP866025403, T1v, T1u);
Chris@42 141 T2c = FNMS(KP866025403, T1v, T1u);
Chris@42 142 }
Chris@42 143 }
Chris@42 144 }
/* Store phase: from here on every input has been read; results are
 * written back, interleaved with the remaining arithmetic. */
Chris@42 145 {
Chris@42 146 E TY, T13, TX, T10;
Chris@42 147 {
Chris@42 148 E Tn, T12, TC, Tm, TD, TS, TA, Tz;
Chris@42 149 Tn = W[16];
Chris@42 150 T12 = Tt + Ty;
Chris@42 151 Tz = Tt - Ty;
Chris@42 152 TC = W[17];
Chris@42 153 Tm = Tg + Tl;
Chris@42 154 TD = Tg - Tl;
Chris@42 155 TS = To + Tz;
Chris@42 156 TA = To - Tz;
Chris@42 157 {
Chris@42 158 E TV, TU, TW, TT;
Chris@42 159 {
Chris@42 160 E TQ, TR, TP, TB;
Chris@42 161 TV = TO - TD;
Chris@42 162 TP = TD + TO;
/* Offset 0 of cr is stored without any twiddle factor. */
Chris@42 163 cr[0] = Tb + Tm;
Chris@42 164 TB = Tn * TA;
Chris@42 165 TQ = Tn * TP;
Chris@42 166 TR = W[4];
/* Offset 9 uses W[16], W[17]. */
Chris@42 167 cr[WS(rs, 9)] = FNMS(TC, TP, TB);
Chris@42 168 TU = W[5];
Chris@42 169 ci[WS(rs, 9)] = FMA(TC, TA, TQ);
Chris@42 170 TW = TR * TV;
Chris@42 171 TT = TR * TS;
Chris@42 172 }
/* Offset 3 uses W[4], W[5]. */
Chris@42 173 ci[WS(rs, 3)] = FMA(TU, TS, TW);
Chris@42 174 cr[WS(rs, 3)] = FNMS(TU, TV, TT);
Chris@42 175 TY = Tb - Tm;
Chris@42 176 T13 = T11 - T12;
Chris@42 177 TX = W[10];
Chris@42 178 T10 = W[11];
/* Offset 0 of ci is stored without any twiddle factor. */
Chris@42 179 ci[0] = T11 + T12;
Chris@42 180 }
Chris@42 181 }
Chris@42 182 {
Chris@42 183 E T1K, T1Q, T1P, T1L, T2o, T2u, T2t, T2p;
Chris@42 184 {
Chris@42 185 E T1E, T1D, T1H, T1F, T1G, T1t, T1k, T1A;
Chris@42 186 {
Chris@42 187 E T1c, TZ, T14, T1j;
Chris@42 188 T1K = T18 - T1b;
Chris@42 189 T1c = T18 + T1b;
Chris@42 190 TZ = TX * TY;
Chris@42 191 T14 = T10 * TY;
Chris@42 192 T1j = T1f + T1i;
Chris@42 193 T1Q = T1f - T1i;
Chris@42 194 T1P = T1p + T1s;
Chris@42 195 T1t = T1p - T1s;
/* Offset 6 uses W[10], W[11]. */
Chris@42 196 cr[WS(rs, 6)] = FNMS(T10, T13, TZ);
Chris@42 197 ci[WS(rs, 6)] = FMA(TX, T13, T14);
Chris@42 198 T1E = T1c + T1j;
Chris@42 199 T1k = T1c - T1j;
Chris@42 200 T1A = T1w - T1z;
Chris@42 201 T1L = T1w + T1z;
Chris@42 202 }
Chris@42 203 {
Chris@42 204 E T15, T1m, T1B, T1l, T1C;
Chris@42 205 T15 = W[18];
Chris@42 206 T1m = W[19];
Chris@42 207 T1D = W[6];
Chris@42 208 T1H = T1t + T1A;
Chris@42 209 T1B = T1t - T1A;
Chris@42 210 T1l = T15 * T1k;
Chris@42 211 T1C = T1m * T1k;
Chris@42 212 T1F = T1D * T1E;
Chris@42 213 T1G = W[7];
/* Offset 10 uses W[18], W[19]. */
Chris@42 214 cr[WS(rs, 10)] = FNMS(T1m, T1B, T1l);
Chris@42 215 ci[WS(rs, 10)] = FMA(T15, T1B, T1C);
Chris@42 216 }
Chris@42 217 {
Chris@42 218 E T26, T2i, T2l, T2f, T1Z, T28;
Chris@42 219 {
Chris@42 220 E T22, T1I, T25, T2b, T2e;
Chris@42 221 T22 = T20 + T21;
Chris@42 222 T2o = T20 - T21;
/* Offset 4 uses W[6], W[7]. */
Chris@42 223 cr[WS(rs, 4)] = FNMS(T1G, T1H, T1F);
Chris@42 224 T1I = T1G * T1E;
Chris@42 225 T2u = T23 - T24;
Chris@42 226 T25 = T23 + T24;
Chris@42 227 T2b = T29 - T2a;
Chris@42 228 T2t = T29 + T2a;
Chris@42 229 T2p = T2c + T2d;
Chris@42 230 T2e = T2c - T2d;
Chris@42 231 ci[WS(rs, 4)] = FMA(T1D, T1H, T1I);
Chris@42 232 T26 = T22 - T25;
Chris@42 233 T2i = T22 + T25;
Chris@42 234 T2l = T2b + T2e;
Chris@42 235 T2f = T2b - T2e;
Chris@42 236 }
Chris@42 237 T1Z = W[2];
Chris@42 238 T28 = W[3];
Chris@42 239 {
Chris@42 240 E T2h, T2k, T27, T2g, T2j, T2m;
Chris@42 241 T2h = W[14];
Chris@42 242 T2k = W[15];
Chris@42 243 T27 = T1Z * T26;
Chris@42 244 T2g = T28 * T26;
Chris@42 245 T2j = T2h * T2i;
Chris@42 246 T2m = T2k * T2i;
/* Offset 2 uses W[2], W[3]; offset 8 uses W[14], W[15]. */
Chris@42 247 cr[WS(rs, 2)] = FNMS(T28, T2f, T27);
Chris@42 248 ci[WS(rs, 2)] = FMA(T1Z, T2f, T2g);
Chris@42 249 cr[WS(rs, 8)] = FNMS(T2k, T2l, T2j);
Chris@42 250 ci[WS(rs, 8)] = FMA(T2h, T2l, T2m);
Chris@42 251 }
Chris@42 252 }
Chris@42 253 }
Chris@42 254 {
Chris@42 255 E T2y, T2B, T2A, T2C, T2z;
Chris@42 256 {
Chris@42 257 E T2n, T2q, T2v, T2s, T2r, T2x, T2w;
Chris@42 258 T2n = W[8];
Chris@42 259 T2y = T2o + T2p;
Chris@42 260 T2q = T2o - T2p;
Chris@42 261 T2B = T2t - T2u;
Chris@42 262 T2v = T2t + T2u;
Chris@42 263 T2s = W[9];
Chris@42 264 T2r = T2n * T2q;
Chris@42 265 T2x = W[20];
Chris@42 266 T2w = T2n * T2v;
Chris@42 267 T2A = W[21];
/* Offset 5 uses W[8], W[9]. */
Chris@42 268 cr[WS(rs, 5)] = FNMS(T2s, T2v, T2r);
Chris@42 269 T2C = T2x * T2B;
Chris@42 270 T2z = T2x * T2y;
Chris@42 271 ci[WS(rs, 5)] = FMA(T2s, T2q, T2w);
Chris@42 272 }
/* Offset 11 uses W[20], W[21]. */
Chris@42 273 ci[WS(rs, 11)] = FMA(T2A, T2y, T2C);
Chris@42 274 cr[WS(rs, 11)] = FNMS(T2A, T2B, T2z);
Chris@42 275 {
Chris@42 276 E T1J, T1M, T1R, T1O, T1N, T1T, T1S;
Chris@42 277 T1J = W[0];
Chris@42 278 T1U = T1K + T1L;
Chris@42 279 T1M = T1K - T1L;
Chris@42 280 T1X = T1P - T1Q;
Chris@42 281 T1R = T1P + T1Q;
Chris@42 282 T1O = W[1];
Chris@42 283 T1N = T1J * T1M;
Chris@42 284 T1T = W[12];
Chris@42 285 T1S = T1J * T1R;
Chris@42 286 T1W = W[13];
/* Offset 1 uses W[0], W[1]. */
Chris@42 287 cr[WS(rs, 1)] = FNMS(T1O, T1R, T1N);
Chris@42 288 T1Y = T1T * T1X;
Chris@42 289 T1V = T1T * T1U;
Chris@42 290 ci[WS(rs, 1)] = FMA(T1O, T1M, T1S);
Chris@42 291 }
Chris@42 292 }
Chris@42 293 }
Chris@42 294 }
Chris@42 295 }
/* Offset 7 uses W[12], W[13]; the operands were prepared inside the
 * nested blocks above. */
Chris@42 296 ci[WS(rs, 7)] = FMA(T1W, T1U, T1Y);
Chris@42 297 cr[WS(rs, 7)] = FNMS(T1W, T1X, T1V);
Chris@42 298 }
Chris@42 299 }
Chris@42 300 }
Chris@42 301
/*
 * Twiddle-instruction table referenced by the codelet descriptor `desc`.
 * NOTE(review): TW_FULL / TW_NEXT and the tw_instr layout are defined
 * outside this file; the 12 matches this codelet's size and TW_NEXT
 * presumably terminates the list -- confirm against the tw_instr
 * definition.
 */
Chris@42 302 static const tw_instr twinstr[] = {
Chris@42 303 {TW_FULL, 1, 12},
Chris@42 304 {TW_NEXT, 1, 0}
Chris@42 305 };
Chris@42 306
/*
 * Descriptor passed to the planner when the FMA variant is registered:
 * size 12, the name "hb_12", the twinstr table and &GENUS.  The trailing
 * {72, 22, 46, 0} matches the "72 additions, 22 multiplications, 46 fused
 * multiply/add" figures in this variant's generated header comment.
 * NOTE(review): hc2hc_desc field meanings are presumed from that match;
 * the struct is declared elsewhere.
 */
Chris@42 307 static const hc2hc_desc desc = { 12, "hb_12", twinstr, &GENUS, {72, 22, 46, 0} };
Chris@42 308
/*
 * Registration entry point: hands the hb_12 kernel and its descriptor to
 * X(khc2hc_register) for planner p.  X() is a name-wrapping macro defined
 * outside this file.
 */
Chris@42 309 void X(codelet_hb_12) (planner *p) {
Chris@42 310 X(khc2hc_register) (p, hb_12, &desc);
Chris@42 311 }
Chris@42 312 #else /* HAVE_FMA */
Chris@42 313
Chris@42 314 /* Generated by: ../../../genfft/gen_hc2hc.native -compact -variables 4 -pipeline-latency 4 -sign 1 -n 12 -dif -name hb_12 -include hb.h */
Chris@42 315
Chris@42 316 /*
Chris@42 317 * This function contains 118 FP additions, 60 FP multiplications,
Chris@42 318 * (or, 88 additions, 30 multiplications, 30 fused multiply/add),
Chris@42 319 * 39 stack variables, 2 constants, and 48 memory accesses
Chris@42 320 */
Chris@42 321 #include "hb.h"
Chris@42 322
/*
 * hb_12 (non-FMA variant): machine-generated size-12 codelet; see the
 * "Generated by" line above (gen_hc2hc, -n 12 -dif -sign 1, without -fma).
 *
 * For each m in [mb, me) the body reads twelve values through cr and
 * twelve through ci at offsets WS(rs, 0) .. WS(rs, 11), combines them
 * using the two constants declared below, and stores twelve results back
 * through each pointer at the same offsets.  All loads of an iteration
 * precede its first store.
 *
 * Loop stepping: W is first offset by (mb - 1) * 22; after every
 * iteration cr moves forward by ms, ci moves backward by ms, and W moves
 * forward by 22 entries.
 *
 * Twiddles: the results stored at offset 0 are plain sums.  For
 * k = 1 .. 11 the pair stored at cr/ci[WS(rs, k)] is formed from two
 * intermediate values together with W[2k - 2] and W[2k - 1].
 *
 * NOTE(review): R, E, INT, stride, WS, DK, FMA, FNMS and
 * MAKE_VOLATILE_STRIDE come from headers outside this file.  FMA(a, b, c)
 * is presumably a * b + c and FNMS(a, b, c) presumably c - a * b, which
 * would make each twiddled store pair a complex multiplication --
 * confirm against the codelet headers.
 */
Chris@42 323 static void hb_12(R *cr, R *ci, const R *W, stride rs, INT mb, INT me, INT ms)
Chris@42 324 {
Chris@42 325 DK(KP500000000, +0.500000000000000000000000000000000000000000000);
/* 0.866025403... is numerically sqrt(3)/2. */
Chris@42 326 DK(KP866025403, +0.866025403784438646763723170752936183471402627);
Chris@42 327 {
Chris@42 328 INT m;
Chris@42 329 for (m = mb, W = W + ((mb - 1) * 22); m < me; m = m + 1, cr = cr + ms, ci = ci - ms, W = W + 22, MAKE_VOLATILE_STRIDE(24, rs)) {
/* Intermediates that survive from the four load groups into the store
 * groups further down. */
Chris@42 330 E T5, TH, T12, T1M, T1i, T1U, Tg, Tt, T19, T1X, T1p, T1P, Ta, TM, T15;
Chris@42 331 E T1N, T1l, T1V, Tl, Ty, T1c, T1Y, T1s, T1Q;
/* Load group 1: cr[0], ci[11], cr[4], ci[3], ci[7], cr[8]. */
Chris@42 332 {
Chris@42 333 E T1, TD, T4, T1g, TG, T11, T10, T1h;
Chris@42 334 T1 = cr[0];
Chris@42 335 TD = ci[WS(rs, 11)];
Chris@42 336 {
Chris@42 337 E T2, T3, TE, TF;
Chris@42 338 T2 = cr[WS(rs, 4)];
Chris@42 339 T3 = ci[WS(rs, 3)];
Chris@42 340 T4 = T2 + T3;
Chris@42 341 T1g = KP866025403 * (T2 - T3);
Chris@42 342 TE = ci[WS(rs, 7)];
Chris@42 343 TF = cr[WS(rs, 8)];
Chris@42 344 TG = TE - TF;
Chris@42 345 T11 = KP866025403 * (TE + TF);
Chris@42 346 }
Chris@42 347 T5 = T1 + T4;
Chris@42 348 TH = TD + TG;
Chris@42 349 T10 = FNMS(KP500000000, T4, T1);
Chris@42 350 T12 = T10 - T11;
Chris@42 351 T1M = T10 + T11;
Chris@42 352 T1h = FNMS(KP500000000, TG, TD);
Chris@42 353 T1i = T1g + T1h;
Chris@42 354 T1U = T1h - T1g;
Chris@42 355 }
/* Load group 2: cr[3], ci[8], ci[4], ci[0], cr[7], cr[11]. */
Chris@42 356 {
Chris@42 357 E Tc, Tp, Tf, T17, Ts, T1o, T18, T1n;
Chris@42 358 Tc = cr[WS(rs, 3)];
Chris@42 359 Tp = ci[WS(rs, 8)];
Chris@42 360 {
Chris@42 361 E Td, Te, Tq, Tr;
Chris@42 362 Td = ci[WS(rs, 4)];
Chris@42 363 Te = ci[0];
Chris@42 364 Tf = Td + Te;
Chris@42 365 T17 = KP866025403 * (Td - Te);
Chris@42 366 Tq = cr[WS(rs, 7)];
Chris@42 367 Tr = cr[WS(rs, 11)];
Chris@42 368 Ts = Tq + Tr;
Chris@42 369 T1o = KP866025403 * (Tq - Tr);
Chris@42 370 }
Chris@42 371 Tg = Tc + Tf;
Chris@42 372 Tt = Tp - Ts;
Chris@42 373 T18 = FMA(KP500000000, Ts, Tp);
Chris@42 374 T19 = T17 + T18;
Chris@42 375 T1X = T18 - T17;
Chris@42 376 T1n = FNMS(KP500000000, Tf, Tc);
Chris@42 377 T1p = T1n + T1o;
Chris@42 378 T1P = T1n - T1o;
Chris@42 379 }
/* Load group 3: ci[5], cr[6], ci[1], cr[2], ci[9], cr[10]. */
Chris@42 380 {
Chris@42 381 E T6, TL, T9, T1j, TK, T14, T13, T1k;
Chris@42 382 T6 = ci[WS(rs, 5)];
Chris@42 383 TL = cr[WS(rs, 6)];
Chris@42 384 {
Chris@42 385 E T7, T8, TI, TJ;
Chris@42 386 T7 = ci[WS(rs, 1)];
Chris@42 387 T8 = cr[WS(rs, 2)];
Chris@42 388 T9 = T7 + T8;
Chris@42 389 T1j = KP866025403 * (T7 - T8);
Chris@42 390 TI = ci[WS(rs, 9)];
Chris@42 391 TJ = cr[WS(rs, 10)];
Chris@42 392 TK = TI - TJ;
Chris@42 393 T14 = KP866025403 * (TI + TJ);
Chris@42 394 }
Chris@42 395 Ta = T6 + T9;
Chris@42 396 TM = TK - TL;
Chris@42 397 T13 = FNMS(KP500000000, T9, T6);
Chris@42 398 T15 = T13 + T14;
Chris@42 399 T1N = T13 - T14;
Chris@42 400 T1k = FMA(KP500000000, TK, TL);
Chris@42 401 T1l = T1j - T1k;
Chris@42 402 T1V = T1j + T1k;
Chris@42 403 }
/* Load group 4: ci[2], cr[9], cr[1], cr[5], ci[10], ci[6]. */
Chris@42 404 {
Chris@42 405 E Th, Tx, Tk, T1a, Tw, T1r, T1b, T1q;
Chris@42 406 Th = ci[WS(rs, 2)];
Chris@42 407 Tx = cr[WS(rs, 9)];
Chris@42 408 {
Chris@42 409 E Ti, Tj, Tu, Tv;
Chris@42 410 Ti = cr[WS(rs, 1)];
Chris@42 411 Tj = cr[WS(rs, 5)];
Chris@42 412 Tk = Ti + Tj;
Chris@42 413 T1a = KP866025403 * (Ti - Tj);
Chris@42 414 Tu = ci[WS(rs, 10)];
Chris@42 415 Tv = ci[WS(rs, 6)];
Chris@42 416 Tw = Tu + Tv;
Chris@42 417 T1r = KP866025403 * (Tv - Tu);
Chris@42 418 }
Chris@42 419 Tl = Th + Tk;
Chris@42 420 Ty = Tw - Tx;
Chris@42 421 T1b = FMA(KP500000000, Tw, Tx);
Chris@42 422 T1c = T1a - T1b;
Chris@42 423 T1Y = T1a + T1b;
Chris@42 424 T1q = FNMS(KP500000000, Tk, Th);
Chris@42 425 T1s = T1q + T1r;
Chris@42 426 T1Q = T1q - T1r;
Chris@42 427 }
/* Stores for offset 0 (no twiddle) and offset 6 (W[10], W[11]). */
Chris@42 428 {
Chris@42 429 E Tb, Tm, TU, TW, TX, TY, TT, TV;
Chris@42 430 Tb = T5 + Ta;
Chris@42 431 Tm = Tg + Tl;
Chris@42 432 TU = Tb - Tm;
Chris@42 433 TW = TH + TM;
Chris@42 434 TX = Tt + Ty;
Chris@42 435 TY = TW - TX;
Chris@42 436 cr[0] = Tb + Tm;
Chris@42 437 ci[0] = TW + TX;
Chris@42 438 TT = W[10];
Chris@42 439 TV = W[11];
Chris@42 440 cr[WS(rs, 6)] = FNMS(TV, TY, TT * TU);
Chris@42 441 ci[WS(rs, 6)] = FMA(TV, TU, TT * TY);
Chris@42 442 }
/* Stores for offset 9 (W[16], W[17]) and offset 3 (W[4], W[5]). */
Chris@42 443 {
Chris@42 444 E TA, TQ, TO, TS;
Chris@42 445 {
Chris@42 446 E To, Tz, TC, TN;
Chris@42 447 To = T5 - Ta;
Chris@42 448 Tz = Tt - Ty;
Chris@42 449 TA = To - Tz;
Chris@42 450 TQ = To + Tz;
Chris@42 451 TC = Tg - Tl;
Chris@42 452 TN = TH - TM;
Chris@42 453 TO = TC + TN;
Chris@42 454 TS = TN - TC;
Chris@42 455 }
Chris@42 456 {
Chris@42 457 E Tn, TB, TP, TR;
Chris@42 458 Tn = W[16];
Chris@42 459 TB = W[17];
Chris@42 460 cr[WS(rs, 9)] = FNMS(TB, TO, Tn * TA);
Chris@42 461 ci[WS(rs, 9)] = FMA(Tn, TO, TB * TA);
Chris@42 462 TP = W[4];
Chris@42 463 TR = W[5];
Chris@42 464 cr[WS(rs, 3)] = FNMS(TR, TS, TP * TQ);
Chris@42 465 ci[WS(rs, 3)] = FMA(TP, TS, TR * TQ);
Chris@42 466 }
Chris@42 467 }
/* Stores for offset 5 (W[8], W[9]) and offset 11 (W[20], W[21]). */
Chris@42 468 {
Chris@42 469 E T28, T2e, T2c, T2g;
Chris@42 470 {
Chris@42 471 E T26, T27, T2a, T2b;
Chris@42 472 T26 = T1M - T1N;
Chris@42 473 T27 = T1X + T1Y;
Chris@42 474 T28 = T26 - T27;
Chris@42 475 T2e = T26 + T27;
Chris@42 476 T2a = T1U + T1V;
Chris@42 477 T2b = T1P - T1Q;
Chris@42 478 T2c = T2a + T2b;
Chris@42 479 T2g = T2a - T2b;
Chris@42 480 }
Chris@42 481 {
Chris@42 482 E T25, T29, T2d, T2f;
Chris@42 483 T25 = W[8];
Chris@42 484 T29 = W[9];
Chris@42 485 cr[WS(rs, 5)] = FNMS(T29, T2c, T25 * T28);
Chris@42 486 ci[WS(rs, 5)] = FMA(T25, T2c, T29 * T28);
Chris@42 487 T2d = W[20];
Chris@42 488 T2f = W[21];
Chris@42 489 cr[WS(rs, 11)] = FNMS(T2f, T2g, T2d * T2e);
Chris@42 490 ci[WS(rs, 11)] = FMA(T2d, T2g, T2f * T2e);
Chris@42 491 }
Chris@42 492 }
/* Stores for offset 2 (W[2], W[3]) and offset 8 (W[14], W[15]). */
Chris@42 493 {
Chris@42 494 E T1S, T22, T20, T24;
Chris@42 495 {
Chris@42 496 E T1O, T1R, T1W, T1Z;
Chris@42 497 T1O = T1M + T1N;
Chris@42 498 T1R = T1P + T1Q;
Chris@42 499 T1S = T1O - T1R;
Chris@42 500 T22 = T1O + T1R;
Chris@42 501 T1W = T1U - T1V;
Chris@42 502 T1Z = T1X - T1Y;
Chris@42 503 T20 = T1W - T1Z;
Chris@42 504 T24 = T1W + T1Z;
Chris@42 505 }
Chris@42 506 {
Chris@42 507 E T1L, T1T, T21, T23;
Chris@42 508 T1L = W[2];
Chris@42 509 T1T = W[3];
Chris@42 510 cr[WS(rs, 2)] = FNMS(T1T, T20, T1L * T1S);
Chris@42 511 ci[WS(rs, 2)] = FMA(T1T, T1S, T1L * T20);
Chris@42 512 T21 = W[14];
Chris@42 513 T23 = W[15];
Chris@42 514 cr[WS(rs, 8)] = FNMS(T23, T24, T21 * T22);
Chris@42 515 ci[WS(rs, 8)] = FMA(T23, T22, T21 * T24);
Chris@42 516 }
Chris@42 517 }
/* Stores for offset 10 (W[18], W[19]) and offset 4 (W[6], W[7]). */
Chris@42 518 {
Chris@42 519 E T1C, T1I, T1G, T1K;
Chris@42 520 {
Chris@42 521 E T1A, T1B, T1E, T1F;
Chris@42 522 T1A = T12 + T15;
Chris@42 523 T1B = T1p + T1s;
Chris@42 524 T1C = T1A - T1B;
Chris@42 525 T1I = T1A + T1B;
Chris@42 526 T1E = T1i + T1l;
Chris@42 527 T1F = T19 + T1c;
Chris@42 528 T1G = T1E - T1F;
Chris@42 529 T1K = T1E + T1F;
Chris@42 530 }
Chris@42 531 {
Chris@42 532 E T1z, T1D, T1H, T1J;
Chris@42 533 T1z = W[18];
Chris@42 534 T1D = W[19];
Chris@42 535 cr[WS(rs, 10)] = FNMS(T1D, T1G, T1z * T1C);
Chris@42 536 ci[WS(rs, 10)] = FMA(T1D, T1C, T1z * T1G);
Chris@42 537 T1H = W[6];
Chris@42 538 T1J = W[7];
Chris@42 539 cr[WS(rs, 4)] = FNMS(T1J, T1K, T1H * T1I);
Chris@42 540 ci[WS(rs, 4)] = FMA(T1J, T1I, T1H * T1K);
Chris@42 541 }
Chris@42 542 }
/* Stores for offset 1 (W[0], W[1]) and offset 7 (W[12], W[13]). */
Chris@42 543 {
Chris@42 544 E T1e, T1w, T1u, T1y;
Chris@42 545 {
Chris@42 546 E T16, T1d, T1m, T1t;
Chris@42 547 T16 = T12 - T15;
Chris@42 548 T1d = T19 - T1c;
Chris@42 549 T1e = T16 - T1d;
Chris@42 550 T1w = T16 + T1d;
Chris@42 551 T1m = T1i - T1l;
Chris@42 552 T1t = T1p - T1s;
Chris@42 553 T1u = T1m + T1t;
Chris@42 554 T1y = T1m - T1t;
Chris@42 555 }
Chris@42 556 {
Chris@42 557 E TZ, T1f, T1v, T1x;
Chris@42 558 TZ = W[0];
Chris@42 559 T1f = W[1];
Chris@42 560 cr[WS(rs, 1)] = FNMS(T1f, T1u, TZ * T1e);
Chris@42 561 ci[WS(rs, 1)] = FMA(TZ, T1u, T1f * T1e);
Chris@42 562 T1v = W[12];
Chris@42 563 T1x = W[13];
Chris@42 564 cr[WS(rs, 7)] = FNMS(T1x, T1y, T1v * T1w);
Chris@42 565 ci[WS(rs, 7)] = FMA(T1v, T1y, T1x * T1w);
Chris@42 566 }
Chris@42 567 }
Chris@42 568 }
Chris@42 569 }
Chris@42 570 }
Chris@42 571
/*
 * Twiddle-instruction table referenced by the codelet descriptor `desc`.
 * NOTE(review): TW_FULL / TW_NEXT and the tw_instr layout are defined
 * outside this file; the 12 matches this codelet's size and TW_NEXT
 * presumably terminates the list -- confirm against the tw_instr
 * definition.
 */
Chris@42 572 static const tw_instr twinstr[] = {
Chris@42 573 {TW_FULL, 1, 12},
Chris@42 574 {TW_NEXT, 1, 0}
Chris@42 575 };
Chris@42 576
/*
 * Descriptor passed to the planner when the non-FMA variant is
 * registered: size 12, the name "hb_12", the twinstr table and &GENUS.
 * The trailing {88, 30, 30, 0} matches the "88 additions, 30
 * multiplications, 30 fused multiply/add" figures in this variant's
 * generated header comment.
 * NOTE(review): hc2hc_desc field meanings are presumed from that match;
 * the struct is declared elsewhere.
 */
Chris@42 577 static const hc2hc_desc desc = { 12, "hb_12", twinstr, &GENUS, {88, 30, 30, 0} };
Chris@42 578
/*
 * Registration entry point: hands the hb_12 kernel and its descriptor to
 * X(khc2hc_register) for planner p.  X() is a name-wrapping macro defined
 * outside this file.
 */
Chris@42 579 void X(codelet_hb_12) (planner *p) {
Chris@42 580 X(khc2hc_register) (p, hb_12, &desc);
Chris@42 581 }
Chris@42 582 #endif /* HAVE_FMA */