annotate src/fftw-3.3.8/rdft/scalar/r2cf/hf_12.c @ 83:ae30d91d2ffe

Replace these with versions built using an older toolset (so as to avoid ABI incompatibilities when linking on Ubuntu 14.04 for packaging purposes)
author Chris Cannam
date Fri, 07 Feb 2020 11:51:13 +0000
parents d0c2a83c1364
children
rev   line source
Chris@82 1 /*
Chris@82 2 * Copyright (c) 2003, 2007-14 Matteo Frigo
Chris@82 3 * Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
Chris@82 4 *
Chris@82 5 * This program is free software; you can redistribute it and/or modify
Chris@82 6 * it under the terms of the GNU General Public License as published by
Chris@82 7 * the Free Software Foundation; either version 2 of the License, or
Chris@82 8 * (at your option) any later version.
Chris@82 9 *
Chris@82 10 * This program is distributed in the hope that it will be useful,
Chris@82 11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
Chris@82 12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
Chris@82 13 * GNU General Public License for more details.
Chris@82 14 *
Chris@82 15 * You should have received a copy of the GNU General Public License
Chris@82 16 * along with this program; if not, write to the Free Software
Chris@82 17 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
Chris@82 18 *
Chris@82 19 */
Chris@82 20
Chris@82 21 /* This file was automatically generated --- DO NOT EDIT */
Chris@82 22 /* Generated on Thu May 24 08:06:30 EDT 2018 */
Chris@82 23
Chris@82 24 #include "rdft/codelet-rdft.h"
Chris@82 25
Chris@82 26 #if defined(ARCH_PREFERS_FMA) || defined(ISA_EXTENSION_PREFERS_FMA)
Chris@82 27
Chris@82 28 /* Generated by: ../../../genfft/gen_hc2hc.native -fma -compact -variables 4 -pipeline-latency 4 -n 12 -dit -name hf_12 -include rdft/scalar/hf.h */
Chris@82 29
Chris@82 30 /*
Chris@82 31 * This function contains 118 FP additions, 68 FP multiplications,
Chris@82 32 * (or, 72 additions, 22 multiplications, 46 fused multiply/add),
Chris@82 33 * 47 stack variables, 2 constants, and 48 memory accesses
Chris@82 34 */
Chris@82 35 #include "rdft/scalar/hf.h"
Chris@82 36
/*
 * hf_12 (FMA variant): twiddle codelet generated by gen_hc2hc for n = 12 with
 * -dit (see the generator command line in the comment preceding this function).
 *
 * Parameters:
 *   cr, ci  data arrays; both are read and then overwritten in place.
 *   W       twiddle table; 22 values (11 pairs) are consumed per iteration.
 *   rs      stride handed to WS() to address the 12 elements of one iteration.
 *   mb, me  the loop runs for m = mb .. me - 1.
 *   ms      per-iteration step: cr moves forward by ms, ci moves backward by ms.
 *
 * Per iteration, element 0 (cr[0], ci[0]) is used as is. For k = 1..11 the
 * pair (cr[WS(rs, k)], ci[WS(rs, k)]) is combined with (W[2k-2], W[2k-1]) as
 *     a = W[2k-2] * cr + W[2k-1] * ci
 *     b = W[2k-2] * ci - W[2k-1] * cr
 * NOTE(review): this reading assumes FMA(x, y, z) = x*y + z and
 * FNMS(x, y, z) = z - x*y; both macros are defined outside this file -- confirm.
 * The resulting values are then combined through sums, differences and the
 * two constants below, and stored to all 12 cr slots and all 12 ci slots.
 */
Chris@82 37 static void hf_12(R *cr, R *ci, const R *W, stride rs, INT mb, INT me, INT ms)
Chris@82 38 {
Chris@82 39 DK(KP866025403, +0.866025403784438646763723170752936183471402627); /* numerically sqrt(3)/2 */
Chris@82 40 DK(KP500000000, +0.500000000000000000000000000000000000000000000); /* 1/2 */
Chris@82 41 {
Chris@82 42 INT m;
/* W is first advanced by (mb - 1) groups of 22 values, then by 22 per iteration. */
Chris@82 43 for (m = mb, W = W + ((mb - 1) * 22); m < me; m = m + 1, cr = cr + ms, ci = ci - ms, W = W + 22, MAKE_VOLATILE_STRIDE(24, rs)) {
Chris@82 44 E T1, T2i, Tl, T2e, T10, T1Y, TG, T1S, Ty, T2s, T1s, T2f, T1d, T21, T1H;
Chris@82 45 E T1Z, Te, T2p, T1l, T2h, TT, T1V, T1A, T1T;
Chris@82 46 T1 = cr[0];
Chris@82 47 T2i = ci[0];
Chris@82 48 { /* element 6 with W[10], W[11] -> Tl, T2e */
Chris@82 49 E Th, Tk, Ti, T2d, Tg, Tj;
Chris@82 50 Th = cr[WS(rs, 6)];
Chris@82 51 Tk = ci[WS(rs, 6)];
Chris@82 52 Tg = W[10];
Chris@82 53 Ti = Tg * Th;
Chris@82 54 T2d = Tg * Tk;
Chris@82 55 Tj = W[11];
Chris@82 56 Tl = FMA(Tj, Tk, Ti);
Chris@82 57 T2e = FNMS(Tj, Th, T2d);
Chris@82 58 }
Chris@82 59 { /* element 9 with W[16], W[17] -> T10, T1Y */
Chris@82 60 E TW, TZ, TX, T1X, TV, TY;
Chris@82 61 TW = cr[WS(rs, 9)];
Chris@82 62 TZ = ci[WS(rs, 9)];
Chris@82 63 TV = W[16];
Chris@82 64 TX = TV * TW;
Chris@82 65 T1X = TV * TZ;
Chris@82 66 TY = W[17];
Chris@82 67 T10 = FMA(TY, TZ, TX);
Chris@82 68 T1Y = FNMS(TY, TW, T1X);
Chris@82 69 }
Chris@82 70 { /* element 3 with W[4], W[5] -> TG, T1S */
Chris@82 71 E TC, TF, TD, T1R, TB, TE;
Chris@82 72 TC = cr[WS(rs, 3)];
Chris@82 73 TF = ci[WS(rs, 3)];
Chris@82 74 TB = W[4];
Chris@82 75 TD = TB * TC;
Chris@82 76 T1R = TB * TF;
Chris@82 77 TE = W[5];
Chris@82 78 TG = FMA(TE, TF, TD);
Chris@82 79 T1S = FNMS(TE, TC, T1R);
Chris@82 80 }
Chris@82 81 { /* elements 10 and 2: twiddle each, then keep sums (Ty, T2f) and differences (T2s, T1s) */
Chris@82 82 E Tn, Tq, To, T1o, Tt, Tw, Tu, T1q, Tm, Ts;
Chris@82 83 Tn = cr[WS(rs, 10)];
Chris@82 84 Tq = ci[WS(rs, 10)];
Chris@82 85 Tm = W[18];
Chris@82 86 To = Tm * Tn;
Chris@82 87 T1o = Tm * Tq;
Chris@82 88 Tt = cr[WS(rs, 2)];
Chris@82 89 Tw = ci[WS(rs, 2)];
Chris@82 90 Ts = W[2];
Chris@82 91 Tu = Ts * Tt;
Chris@82 92 T1q = Ts * Tw;
Chris@82 93 {
Chris@82 94 E Tr, T1p, Tx, T1r, Tp, Tv;
Chris@82 95 Tp = W[19];
Chris@82 96 Tr = FMA(Tp, Tq, To);
Chris@82 97 T1p = FNMS(Tp, Tn, T1o);
Chris@82 98 Tv = W[3];
Chris@82 99 Tx = FMA(Tv, Tw, Tu);
Chris@82 100 T1r = FNMS(Tv, Tt, T1q);
Chris@82 101 Ty = Tr + Tx;
Chris@82 102 T2s = Tx - Tr;
Chris@82 103 T1s = T1p - T1r;
Chris@82 104 T2f = T1p + T1r;
Chris@82 105 }
Chris@82 106 }
Chris@82 107 { /* elements 1 and 5: sums (T1d, T1Z) and differences (T21, T1H) */
Chris@82 108 E T12, T15, T13, T1D, T18, T1b, T19, T1F, T11, T17;
Chris@82 109 T12 = cr[WS(rs, 1)];
Chris@82 110 T15 = ci[WS(rs, 1)];
Chris@82 111 T11 = W[0];
Chris@82 112 T13 = T11 * T12;
Chris@82 113 T1D = T11 * T15;
Chris@82 114 T18 = cr[WS(rs, 5)];
Chris@82 115 T1b = ci[WS(rs, 5)];
Chris@82 116 T17 = W[8];
Chris@82 117 T19 = T17 * T18;
Chris@82 118 T1F = T17 * T1b;
Chris@82 119 {
Chris@82 120 E T16, T1E, T1c, T1G, T14, T1a;
Chris@82 121 T14 = W[1];
Chris@82 122 T16 = FMA(T14, T15, T13);
Chris@82 123 T1E = FNMS(T14, T12, T1D);
Chris@82 124 T1a = W[9];
Chris@82 125 T1c = FMA(T1a, T1b, T19);
Chris@82 126 T1G = FNMS(T1a, T18, T1F);
Chris@82 127 T1d = T16 + T1c;
Chris@82 128 T21 = T1c - T16;
Chris@82 129 T1H = T1E - T1G;
Chris@82 130 T1Z = T1E + T1G;
Chris@82 131 }
Chris@82 132 }
Chris@82 133 { /* elements 4 and 8: sums (Te, T2h) and differences (T2p, T1l) */
Chris@82 134 E T3, T6, T4, T1h, T9, Tc, Ta, T1j, T2, T8;
Chris@82 135 T3 = cr[WS(rs, 4)];
Chris@82 136 T6 = ci[WS(rs, 4)];
Chris@82 137 T2 = W[6];
Chris@82 138 T4 = T2 * T3;
Chris@82 139 T1h = T2 * T6;
Chris@82 140 T9 = cr[WS(rs, 8)];
Chris@82 141 Tc = ci[WS(rs, 8)];
Chris@82 142 T8 = W[14];
Chris@82 143 Ta = T8 * T9;
Chris@82 144 T1j = T8 * Tc;
Chris@82 145 {
Chris@82 146 E T7, T1i, Td, T1k, T5, Tb;
Chris@82 147 T5 = W[7];
Chris@82 148 T7 = FMA(T5, T6, T4);
Chris@82 149 T1i = FNMS(T5, T3, T1h);
Chris@82 150 Tb = W[15];
Chris@82 151 Td = FMA(Tb, Tc, Ta);
Chris@82 152 T1k = FNMS(Tb, T9, T1j);
Chris@82 153 Te = T7 + Td;
Chris@82 154 T2p = Td - T7;
Chris@82 155 T1l = T1i - T1k;
Chris@82 156 T2h = T1i + T1k;
Chris@82 157 }
Chris@82 158 }
Chris@82 159 { /* elements 7 and 11: sums (TT, T1T) and differences (T1V, T1A) */
Chris@82 160 E TI, TL, TJ, T1w, TO, TR, TP, T1y, TH, TN;
Chris@82 161 TI = cr[WS(rs, 7)];
Chris@82 162 TL = ci[WS(rs, 7)];
Chris@82 163 TH = W[12];
Chris@82 164 TJ = TH * TI;
Chris@82 165 T1w = TH * TL;
Chris@82 166 TO = cr[WS(rs, 11)];
Chris@82 167 TR = ci[WS(rs, 11)];
Chris@82 168 TN = W[20];
Chris@82 169 TP = TN * TO;
Chris@82 170 T1y = TN * TR;
Chris@82 171 {
Chris@82 172 E TM, T1x, TS, T1z, TK, TQ;
Chris@82 173 TK = W[13];
Chris@82 174 TM = FMA(TK, TL, TJ);
Chris@82 175 T1x = FNMS(TK, TI, T1w);
Chris@82 176 TQ = W[21];
Chris@82 177 TS = FMA(TQ, TR, TP);
Chris@82 178 T1z = FNMS(TQ, TO, T1y);
Chris@82 179 TT = TM + TS;
Chris@82 180 T1V = TS - TM;
Chris@82 181 T1A = T1x - T1z;
Chris@82 182 T1T = T1x + T1z;
Chris@82 183 }
Chris@82 184 }
Chris@82 185 { /* eight outputs formed from additions/subtractions only: cr[0], cr[3], cr[6], cr[9], ci[2], ci[5], ci[8], ci[11] */
Chris@82 186 E TA, T28, T2k, T2m, T1f, T2l, T2b, T2c;
Chris@82 187 {
Chris@82 188 E Tf, Tz, T2g, T2j;
Chris@82 189 Tf = T1 + Te;
Chris@82 190 Tz = Tl + Ty;
Chris@82 191 TA = Tf + Tz;
Chris@82 192 T28 = Tf - Tz;
Chris@82 193 T2g = T2e + T2f;
Chris@82 194 T2j = T2h + T2i;
Chris@82 195 T2k = T2g + T2j;
Chris@82 196 T2m = T2j - T2g;
Chris@82 197 }
Chris@82 198 {
Chris@82 199 E TU, T1e, T29, T2a;
Chris@82 200 TU = TG + TT;
Chris@82 201 T1e = T10 + T1d;
Chris@82 202 T1f = TU + T1e;
Chris@82 203 T2l = TU - T1e;
Chris@82 204 T29 = T1S + T1T;
Chris@82 205 T2a = T1Y + T1Z;
Chris@82 206 T2b = T29 - T2a;
Chris@82 207 T2c = T29 + T2a;
Chris@82 208 }
Chris@82 209 ci[WS(rs, 5)] = TA - T1f;
Chris@82 210 cr[WS(rs, 9)] = T2l - T2m;
Chris@82 211 ci[WS(rs, 8)] = T2l + T2m;
Chris@82 212 cr[0] = TA + T1f;
Chris@82 213 cr[WS(rs, 3)] = T28 - T2b;
Chris@82 214 cr[WS(rs, 6)] = T2c - T2k;
Chris@82 215 ci[WS(rs, 11)] = T2c + T2k;
Chris@82 216 ci[WS(rs, 2)] = T28 + T2b;
Chris@82 217 }
Chris@82 218 { /* remaining sixteen outputs, built from the KP500000000 / KP866025403 combinations below */
Chris@82 219 E T1m, T1K, T2q, T2y, T2t, T2z, T1t, T1L, T1B, T1N, T1W, T25, T22, T26, T1I;
Chris@82 220 E T1O;
Chris@82 221 {
Chris@82 222 E T1g, T2o, T2r, T1n;
Chris@82 223 T1g = FNMS(KP500000000, Te, T1);
Chris@82 224 T1m = FNMS(KP866025403, T1l, T1g);
Chris@82 225 T1K = FMA(KP866025403, T1l, T1g);
Chris@82 226 T2o = FNMS(KP500000000, T2h, T2i);
Chris@82 227 T2q = FNMS(KP866025403, T2p, T2o);
Chris@82 228 T2y = FMA(KP866025403, T2p, T2o);
Chris@82 229 T2r = FNMS(KP500000000, T2f, T2e);
Chris@82 230 T2t = FNMS(KP866025403, T2s, T2r);
Chris@82 231 T2z = FMA(KP866025403, T2s, T2r);
Chris@82 232 T1n = FNMS(KP500000000, Ty, Tl);
Chris@82 233 T1t = FNMS(KP866025403, T1s, T1n);
Chris@82 234 T1L = FMA(KP866025403, T1s, T1n);
Chris@82 235 }
Chris@82 236 {
Chris@82 237 E T1v, T1U, T20, T1C;
Chris@82 238 T1v = FNMS(KP500000000, TT, TG);
Chris@82 239 T1B = FNMS(KP866025403, T1A, T1v);
Chris@82 240 T1N = FMA(KP866025403, T1A, T1v);
Chris@82 241 T1U = FNMS(KP500000000, T1T, T1S);
Chris@82 242 T1W = FNMS(KP866025403, T1V, T1U);
Chris@82 243 T25 = FMA(KP866025403, T1V, T1U);
Chris@82 244 T20 = FNMS(KP500000000, T1Z, T1Y);
Chris@82 245 T22 = FNMS(KP866025403, T21, T20);
Chris@82 246 T26 = FMA(KP866025403, T21, T20);
Chris@82 247 T1C = FNMS(KP500000000, T1d, T10);
Chris@82 248 T1I = FNMS(KP866025403, T1H, T1C);
Chris@82 249 T1O = FMA(KP866025403, T1H, T1C);
Chris@82 250 }
Chris@82 251 {
Chris@82 252 E T1u, T1J, T2v, T2w;
Chris@82 253 T1u = T1m + T1t;
Chris@82 254 T1J = T1B + T1I;
Chris@82 255 cr[WS(rs, 2)] = T1u - T1J;
Chris@82 256 ci[WS(rs, 3)] = T1u + T1J;
Chris@82 257 T2v = T1W + T22;
Chris@82 258 T2w = T2t + T2q;
Chris@82 259 cr[WS(rs, 8)] = -(T2v + T2w);
Chris@82 260 ci[WS(rs, 9)] = T2w - T2v;
Chris@82 261 }
Chris@82 262 {
Chris@82 263 E T2B, T2C, T2x, T2A;
Chris@82 264 T2B = T25 + T26;
Chris@82 265 T2C = T2z + T2y;
Chris@82 266 cr[WS(rs, 10)] = T2B - T2C;
Chris@82 267 ci[WS(rs, 7)] = T2B + T2C;
Chris@82 268 T2x = T1O - T1N;
Chris@82 269 T2A = T2y - T2z;
Chris@82 270 cr[WS(rs, 7)] = T2x - T2A;
Chris@82 271 ci[WS(rs, 10)] = T2x + T2A;
Chris@82 272 }
Chris@82 273 {
Chris@82 274 E T1M, T1P, T24, T27;
Chris@82 275 T1M = T1K + T1L;
Chris@82 276 T1P = T1N + T1O;
Chris@82 277 ci[WS(rs, 1)] = T1M - T1P;
Chris@82 278 cr[WS(rs, 4)] = T1M + T1P;
Chris@82 279 T24 = T1K - T1L;
Chris@82 280 T27 = T25 - T26;
Chris@82 281 ci[WS(rs, 4)] = T24 - T27;
Chris@82 282 cr[WS(rs, 1)] = T24 + T27;
Chris@82 283 }
Chris@82 284 {
Chris@82 285 E T1Q, T23, T2n, T2u;
Chris@82 286 T1Q = T1m - T1t;
Chris@82 287 T23 = T1W - T22;
Chris@82 288 ci[0] = T1Q - T23;
Chris@82 289 cr[WS(rs, 5)] = T1Q + T23;
Chris@82 290 T2n = T1I - T1B;
Chris@82 291 T2u = T2q - T2t;
Chris@82 292 cr[WS(rs, 11)] = T2n - T2u;
Chris@82 293 ci[WS(rs, 6)] = T2n + T2u;
Chris@82 294 }
Chris@82 295 }
Chris@82 296 }
Chris@82 297 }
Chris@82 298 }
Chris@82 299
/*
 * Twiddle instruction table referenced by desc.
 * NOTE(review): {TW_FULL, 1, 12} presumably requests the full twiddle set for
 * a size-12 codelet (hf_12 consumes 22 W values per iteration) and the TW_NEXT
 * entry presumably ends the list -- confirm against the tw_instr definition,
 * which is outside this file.
 */
Chris@82 300 static const tw_instr twinstr[] = {
Chris@82 301 {TW_FULL, 1, 12},
Chris@82 302 {TW_NEXT, 1, 0}
Chris@82 303 };
Chris@82 304
/*
 * Codelet descriptor handed to the planner at registration time. 12 and
 * "hf_12" match the generator's -n and -name options; {72, 22, 46, 0} repeats
 * the addition / multiplication / fused multiply-add counts from the
 * operation-count comment of the FMA variant. The field layout of hc2hc_desc
 * is declared outside this file.
 */
Chris@82 305 static const hc2hc_desc desc = { 12, "hf_12", twinstr, &GENUS, {72, 22, 46, 0} };
Chris@82 306
/*
 * Registration entry point: passes hf_12 and its descriptor to
 * X(khc2hc_register) for planner p. X() is a macro defined outside this file
 * (presumably a name-prefixing macro -- confirm).
 */
Chris@82 307 void X(codelet_hf_12) (planner *p) {
Chris@82 308 X(khc2hc_register) (p, hf_12, &desc);
Chris@82 309 }
Chris@82 310 #else
Chris@82 311
Chris@82 312 /* Generated by: ../../../genfft/gen_hc2hc.native -compact -variables 4 -pipeline-latency 4 -n 12 -dit -name hf_12 -include rdft/scalar/hf.h */
Chris@82 313
Chris@82 314 /*
Chris@82 315 * This function contains 118 FP additions, 60 FP multiplications,
Chris@82 316 * (or, 88 additions, 30 multiplications, 30 fused multiply/add),
Chris@82 317 * 47 stack variables, 2 constants, and 48 memory accesses
Chris@82 318 */
Chris@82 319 #include "rdft/scalar/hf.h"
Chris@82 320
/*
 * hf_12 (non-FMA variant): twiddle codelet generated by gen_hc2hc for n = 12
 * with -dit (see the generator command line in the comment preceding this
 * function). It is compiled when neither ARCH_PREFERS_FMA nor
 * ISA_EXTENSION_PREFERS_FMA is defined.
 *
 * Parameters:
 *   cr, ci  data arrays; both are read and then overwritten in place.
 *   W       twiddle table; 22 values (11 pairs) are consumed per iteration.
 *   rs      stride handed to WS() to address the 12 elements of one iteration.
 *   mb, me  the loop runs for m = mb .. me - 1.
 *   ms      per-iteration step: cr moves forward by ms, ci moves backward by ms.
 *
 * Per iteration, element 0 (cr[0], ci[0]) is used as is. For k = 1..11 the
 * pair (cr[WS(rs, k)], ci[WS(rs, k)]) is combined with (W[2k-2], W[2k-1]) as
 *     a = W[2k-2] * cr + W[2k-1] * ci
 *     b = W[2k-2] * ci - W[2k-1] * cr
 * NOTE(review): this reading assumes FMA(x, y, z) = x*y + z and
 * FNMS(x, y, z) = z - x*y; both macros are defined outside this file -- confirm.
 * The elements are processed in four groups of three; each group yields sums,
 * differences scaled by KP866025403, and terms reduced by KP500000000 times a
 * sum. These are combined and stored to all 12 cr slots and all 12 ci slots.
 */
Chris@82 321 static void hf_12(R *cr, R *ci, const R *W, stride rs, INT mb, INT me, INT ms)
Chris@82 322 {
Chris@82 323 DK(KP500000000, +0.500000000000000000000000000000000000000000000); /* 1/2 */
Chris@82 324 DK(KP866025403, +0.866025403784438646763723170752936183471402627); /* numerically sqrt(3)/2 */
Chris@82 325 {
Chris@82 326 INT m;
/* W is first advanced by (mb - 1) groups of 22 values, then by 22 per iteration. */
Chris@82 327 for (m = mb, W = W + ((mb - 1) * 22); m < me; m = m + 1, cr = cr + ms, ci = ci - ms, W = W + 22, MAKE_VOLATILE_STRIDE(24, rs)) {
Chris@82 328 E T1, T1W, T18, T23, Tc, T15, T1V, T22, TR, T1E, T1o, T1D, T12, T1l, T1F;
Chris@82 329 E T1G, Ti, T1S, T1d, T26, Tt, T1a, T1T, T25, TA, T1y, T1j, T1B, TL, T1g;
Chris@82 330 E T1z, T1A;
Chris@82 331 { /* group of elements 0, 4, 8 */
Chris@82 332 E T6, T16, Tb, T17;
Chris@82 333 T1 = cr[0];
Chris@82 334 T1W = ci[0];
Chris@82 335 {
Chris@82 336 E T3, T5, T2, T4;
Chris@82 337 T3 = cr[WS(rs, 4)];
Chris@82 338 T5 = ci[WS(rs, 4)];
Chris@82 339 T2 = W[6];
Chris@82 340 T4 = W[7];
Chris@82 341 T6 = FMA(T2, T3, T4 * T5);
Chris@82 342 T16 = FNMS(T4, T3, T2 * T5);
Chris@82 343 }
Chris@82 344 {
Chris@82 345 E T8, Ta, T7, T9;
Chris@82 346 T8 = cr[WS(rs, 8)];
Chris@82 347 Ta = ci[WS(rs, 8)];
Chris@82 348 T7 = W[14];
Chris@82 349 T9 = W[15];
Chris@82 350 Tb = FMA(T7, T8, T9 * Ta);
Chris@82 351 T17 = FNMS(T9, T8, T7 * Ta);
Chris@82 352 }
Chris@82 353 T18 = KP866025403 * (T16 - T17);
Chris@82 354 T23 = KP866025403 * (Tb - T6);
Chris@82 355 Tc = T6 + Tb;
Chris@82 356 T15 = FNMS(KP500000000, Tc, T1);
Chris@82 357 T1V = T16 + T17;
Chris@82 358 T22 = FNMS(KP500000000, T1V, T1W);
Chris@82 359 }
Chris@82 360 { /* group of elements 9, 5, 1 */
Chris@82 361 E T11, T1n, TW, T1m;
Chris@82 362 {
Chris@82 363 E TO, TQ, TN, TP;
Chris@82 364 TO = cr[WS(rs, 9)];
Chris@82 365 TQ = ci[WS(rs, 9)];
Chris@82 366 TN = W[16];
Chris@82 367 TP = W[17];
Chris@82 368 TR = FMA(TN, TO, TP * TQ);
Chris@82 369 T1E = FNMS(TP, TO, TN * TQ);
Chris@82 370 }
Chris@82 371 {
Chris@82 372 E TY, T10, TX, TZ;
Chris@82 373 TY = cr[WS(rs, 5)];
Chris@82 374 T10 = ci[WS(rs, 5)];
Chris@82 375 TX = W[8];
Chris@82 376 TZ = W[9];
Chris@82 377 T11 = FMA(TX, TY, TZ * T10);
Chris@82 378 T1n = FNMS(TZ, TY, TX * T10);
Chris@82 379 }
Chris@82 380 {
Chris@82 381 E TT, TV, TS, TU;
Chris@82 382 TT = cr[WS(rs, 1)];
Chris@82 383 TV = ci[WS(rs, 1)];
Chris@82 384 TS = W[0];
Chris@82 385 TU = W[1];
Chris@82 386 TW = FMA(TS, TT, TU * TV);
Chris@82 387 T1m = FNMS(TU, TT, TS * TV);
Chris@82 388 }
Chris@82 389 T1o = KP866025403 * (T1m - T1n);
Chris@82 390 T1D = KP866025403 * (T11 - TW);
Chris@82 391 T12 = TW + T11;
Chris@82 392 T1l = FNMS(KP500000000, T12, TR);
Chris@82 393 T1F = T1m + T1n;
Chris@82 394 T1G = FNMS(KP500000000, T1F, T1E);
Chris@82 395 }
Chris@82 396 { /* group of elements 6, 2, 10 */
Chris@82 397 E Ts, T1c, Tn, T1b;
Chris@82 398 {
Chris@82 399 E Tf, Th, Te, Tg;
Chris@82 400 Tf = cr[WS(rs, 6)];
Chris@82 401 Th = ci[WS(rs, 6)];
Chris@82 402 Te = W[10];
Chris@82 403 Tg = W[11];
Chris@82 404 Ti = FMA(Te, Tf, Tg * Th);
Chris@82 405 T1S = FNMS(Tg, Tf, Te * Th);
Chris@82 406 }
Chris@82 407 {
Chris@82 408 E Tp, Tr, To, Tq;
Chris@82 409 Tp = cr[WS(rs, 2)];
Chris@82 410 Tr = ci[WS(rs, 2)];
Chris@82 411 To = W[2];
Chris@82 412 Tq = W[3];
Chris@82 413 Ts = FMA(To, Tp, Tq * Tr);
Chris@82 414 T1c = FNMS(Tq, Tp, To * Tr);
Chris@82 415 }
Chris@82 416 {
Chris@82 417 E Tk, Tm, Tj, Tl;
Chris@82 418 Tk = cr[WS(rs, 10)];
Chris@82 419 Tm = ci[WS(rs, 10)];
Chris@82 420 Tj = W[18];
Chris@82 421 Tl = W[19];
Chris@82 422 Tn = FMA(Tj, Tk, Tl * Tm);
Chris@82 423 T1b = FNMS(Tl, Tk, Tj * Tm);
Chris@82 424 }
Chris@82 425 T1d = KP866025403 * (T1b - T1c);
Chris@82 426 T26 = KP866025403 * (Ts - Tn);
Chris@82 427 Tt = Tn + Ts;
Chris@82 428 T1a = FNMS(KP500000000, Tt, Ti);
Chris@82 429 T1T = T1b + T1c;
Chris@82 430 T25 = FNMS(KP500000000, T1T, T1S);
Chris@82 431 }
Chris@82 432 { /* group of elements 3, 11, 7 */
Chris@82 433 E TK, T1i, TF, T1h;
Chris@82 434 {
Chris@82 435 E Tx, Tz, Tw, Ty;
Chris@82 436 Tx = cr[WS(rs, 3)];
Chris@82 437 Tz = ci[WS(rs, 3)];
Chris@82 438 Tw = W[4];
Chris@82 439 Ty = W[5];
Chris@82 440 TA = FMA(Tw, Tx, Ty * Tz);
Chris@82 441 T1y = FNMS(Ty, Tx, Tw * Tz);
Chris@82 442 }
Chris@82 443 {
Chris@82 444 E TH, TJ, TG, TI;
Chris@82 445 TH = cr[WS(rs, 11)];
Chris@82 446 TJ = ci[WS(rs, 11)];
Chris@82 447 TG = W[20];
Chris@82 448 TI = W[21];
Chris@82 449 TK = FMA(TG, TH, TI * TJ);
Chris@82 450 T1i = FNMS(TI, TH, TG * TJ);
Chris@82 451 }
Chris@82 452 {
Chris@82 453 E TC, TE, TB, TD;
Chris@82 454 TC = cr[WS(rs, 7)];
Chris@82 455 TE = ci[WS(rs, 7)];
Chris@82 456 TB = W[12];
Chris@82 457 TD = W[13];
Chris@82 458 TF = FMA(TB, TC, TD * TE);
Chris@82 459 T1h = FNMS(TD, TC, TB * TE);
Chris@82 460 }
Chris@82 461 T1j = KP866025403 * (T1h - T1i);
Chris@82 462 T1B = KP866025403 * (TK - TF);
Chris@82 463 TL = TF + TK;
Chris@82 464 T1g = FNMS(KP500000000, TL, TA);
Chris@82 465 T1z = T1h + T1i;
Chris@82 466 T1A = FNMS(KP500000000, T1z, T1y);
Chris@82 467 }
Chris@82 468 { /* outputs formed from the plain group sums: cr[0], cr[3], cr[6], cr[9], ci[2], ci[5], ci[8], ci[11] */
Chris@82 469 E Tv, T1N, T1Y, T20, T14, T1Z, T1Q, T1R;
Chris@82 470 {
Chris@82 471 E Td, Tu, T1U, T1X;
Chris@82 472 Td = T1 + Tc;
Chris@82 473 Tu = Ti + Tt;
Chris@82 474 Tv = Td + Tu;
Chris@82 475 T1N = Td - Tu;
Chris@82 476 T1U = T1S + T1T;
Chris@82 477 T1X = T1V + T1W;
Chris@82 478 T1Y = T1U + T1X;
Chris@82 479 T20 = T1X - T1U;
Chris@82 480 }
Chris@82 481 {
Chris@82 482 E TM, T13, T1O, T1P;
Chris@82 483 TM = TA + TL;
Chris@82 484 T13 = TR + T12;
Chris@82 485 T14 = TM + T13;
Chris@82 486 T1Z = TM - T13;
Chris@82 487 T1O = T1y + T1z;
Chris@82 488 T1P = T1E + T1F;
Chris@82 489 T1Q = T1O - T1P;
Chris@82 490 T1R = T1O + T1P;
Chris@82 491 }
Chris@82 492 ci[WS(rs, 5)] = Tv - T14;
Chris@82 493 cr[WS(rs, 9)] = T1Z - T20;
Chris@82 494 ci[WS(rs, 8)] = T1Z + T20;
Chris@82 495 cr[0] = Tv + T14;
Chris@82 496 cr[WS(rs, 3)] = T1N - T1Q;
Chris@82 497 cr[WS(rs, 6)] = T1R - T1Y;
Chris@82 498 ci[WS(rs, 11)] = T1R + T1Y;
Chris@82 499 ci[WS(rs, 2)] = T1N + T1Q;
Chris@82 500 }
Chris@82 501 { /* outputs cr[2], cr[5], cr[8], cr[11], ci[0], ci[3], ci[6], ci[9] */
Chris@82 502 E T1f, T1x, T28, T2a, T1q, T21, T1I, T29;
Chris@82 503 {
Chris@82 504 E T19, T1e, T24, T27;
Chris@82 505 T19 = T15 - T18;
Chris@82 506 T1e = T1a - T1d;
Chris@82 507 T1f = T19 + T1e;
Chris@82 508 T1x = T19 - T1e;
Chris@82 509 T24 = T22 - T23;
Chris@82 510 T27 = T25 - T26;
Chris@82 511 T28 = T24 - T27;
Chris@82 512 T2a = T27 + T24;
Chris@82 513 }
Chris@82 514 {
Chris@82 515 E T1k, T1p, T1C, T1H;
Chris@82 516 T1k = T1g - T1j;
Chris@82 517 T1p = T1l - T1o;
Chris@82 518 T1q = T1k + T1p;
Chris@82 519 T21 = T1p - T1k;
Chris@82 520 T1C = T1A - T1B;
Chris@82 521 T1H = T1D - T1G;
Chris@82 522 T1I = T1C + T1H;
Chris@82 523 T29 = T1H - T1C;
Chris@82 524 }
Chris@82 525 cr[WS(rs, 2)] = T1f - T1q;
Chris@82 526 cr[WS(rs, 8)] = T29 - T2a;
Chris@82 527 ci[WS(rs, 9)] = T29 + T2a;
Chris@82 528 ci[WS(rs, 3)] = T1f + T1q;
Chris@82 529 ci[0] = T1x - T1I;
Chris@82 530 cr[WS(rs, 11)] = T21 - T28;
Chris@82 531 ci[WS(rs, 6)] = T21 + T28;
Chris@82 532 cr[WS(rs, 5)] = T1x + T1I;
Chris@82 533 }
Chris@82 534 { /* outputs cr[1], cr[4], cr[7], cr[10], ci[1], ci[4], ci[7], ci[10] */
Chris@82 535 E T1t, T1J, T2e, T2g, T1w, T2b, T1M, T2f;
Chris@82 536 {
Chris@82 537 E T1r, T1s, T2c, T2d;
Chris@82 538 T1r = T15 + T18;
Chris@82 539 T1s = T1a + T1d;
Chris@82 540 T1t = T1r + T1s;
Chris@82 541 T1J = T1r - T1s;
Chris@82 542 T2c = T23 + T22;
Chris@82 543 T2d = T26 + T25;
Chris@82 544 T2e = T2c - T2d;
Chris@82 545 T2g = T2d + T2c;
Chris@82 546 }
Chris@82 547 {
Chris@82 548 E T1u, T1v, T1K, T1L;
Chris@82 549 T1u = T1g + T1j;
Chris@82 550 T1v = T1l + T1o;
Chris@82 551 T1w = T1u + T1v;
Chris@82 552 T2b = T1v - T1u;
Chris@82 553 T1K = T1B + T1A;
Chris@82 554 T1L = T1D + T1G;
Chris@82 555 T1M = T1K - T1L;
Chris@82 556 T2f = T1K + T1L;
Chris@82 557 }
Chris@82 558 ci[WS(rs, 1)] = T1t - T1w;
Chris@82 559 cr[WS(rs, 1)] = T1J + T1M;
Chris@82 560 cr[WS(rs, 4)] = T1t + T1w;
Chris@82 561 ci[WS(rs, 4)] = T1J - T1M;
Chris@82 562 cr[WS(rs, 7)] = T2b - T2e;
Chris@82 563 ci[WS(rs, 7)] = T2f + T2g;
Chris@82 564 ci[WS(rs, 10)] = T2b + T2e;
Chris@82 565 cr[WS(rs, 10)] = T2f - T2g;
Chris@82 566 }
Chris@82 567 }
Chris@82 568 }
Chris@82 569 }
Chris@82 570
/*
 * Twiddle instruction table referenced by desc (same contents as in the FMA
 * branch of this file).
 * NOTE(review): {TW_FULL, 1, 12} presumably requests the full twiddle set for
 * a size-12 codelet (hf_12 consumes 22 W values per iteration) and the TW_NEXT
 * entry presumably ends the list -- confirm against the tw_instr definition,
 * which is outside this file.
 */
Chris@82 571 static const tw_instr twinstr[] = {
Chris@82 572 {TW_FULL, 1, 12},
Chris@82 573 {TW_NEXT, 1, 0}
Chris@82 574 };
Chris@82 575
/*
 * Codelet descriptor handed to the planner at registration time. 12 and
 * "hf_12" match the generator's -n and -name options; {88, 30, 30, 0} repeats
 * the addition / multiplication / fused multiply-add counts from the
 * operation-count comment of the non-FMA variant. The field layout of
 * hc2hc_desc is declared outside this file.
 */
Chris@82 576 static const hc2hc_desc desc = { 12, "hf_12", twinstr, &GENUS, {88, 30, 30, 0} };
Chris@82 577
/*
 * Registration entry point: passes hf_12 and its descriptor to
 * X(khc2hc_register) for planner p. X() is a macro defined outside this file
 * (presumably a name-prefixing macro -- confirm).
 */
Chris@82 578 void X(codelet_hf_12) (planner *p) {
Chris@82 579 X(khc2hc_register) (p, hf_12, &desc);
Chris@82 580 }
Chris@82 581 #endif