annotate src/fftw-3.3.5/rdft/scalar/r2cb/hc2cb_12.c @ 83:ae30d91d2ffe

Replace these with versions built using an older toolset (so as to avoid ABI incompatibilities when linking on Ubuntu 14.04 for packaging purposes)
author Chris Cannam
date Fri, 07 Feb 2020 11:51:13 +0000
parents 2cd0e3b3e1fd
children
rev   line source
Chris@42 1 /*
Chris@42 2 * Copyright (c) 2003, 2007-14 Matteo Frigo
Chris@42 3 * Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
Chris@42 4 *
Chris@42 5 * This program is free software; you can redistribute it and/or modify
Chris@42 6 * it under the terms of the GNU General Public License as published by
Chris@42 7 * the Free Software Foundation; either version 2 of the License, or
Chris@42 8 * (at your option) any later version.
Chris@42 9 *
Chris@42 10 * This program is distributed in the hope that it will be useful,
Chris@42 11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
Chris@42 12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
Chris@42 13 * GNU General Public License for more details.
Chris@42 14 *
Chris@42 15 * You should have received a copy of the GNU General Public License
Chris@42 16 * along with this program; if not, write to the Free Software
Chris@42 17 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
Chris@42 18 *
Chris@42 19 */
Chris@42 20
Chris@42 21 /* This file was automatically generated --- DO NOT EDIT */
Chris@42 22 /* Generated on Sat Jul 30 16:51:30 EDT 2016 */
Chris@42 23
Chris@42 24 #include "codelet-rdft.h"
Chris@42 25
Chris@42 26 #ifdef HAVE_FMA
Chris@42 27
Chris@42 28 /* Generated by: ../../../genfft/gen_hc2c.native -fma -reorder-insns -schedule-for-pipeline -compact -variables 4 -pipeline-latency 4 -sign 1 -n 12 -dif -name hc2cb_12 -include hc2cb.h */
Chris@42 29
Chris@42 30 /*
Chris@42 31 * This function contains 118 FP additions, 68 FP multiplications,
Chris@42 32 * (or, 72 additions, 22 multiplications, 46 fused multiply/add),
Chris@42 33 * 64 stack variables, 2 constants, and 48 memory accesses
Chris@42 34 */
Chris@42 35 #include "hc2cb.h"
Chris@42 36
/*
 * hc2cb_12, HAVE_FMA variant: size-12 kernel emitted by genfft (see the
 * "Generated by" comment above for the exact generator flags).
 *
 * For every m in [mb, me) the loop body loads six values from each of Rp,
 * Ip, Rm and Im (offsets WS(rs, 0) .. WS(rs, 5)), combines them using the
 * constants 1/2 and 0.866025403..., and stores 24 results back into the same
 * four arrays. Every stored value except Rp[0] and Rm[0] is formed with a
 * pair of factors W[2k], W[2k+1]; all of W[0..21] are used.
 *
 * Rp, Ip  - advanced by +ms after each iteration.
 * Rm, Im  - advanced by -ms after each iteration.
 * W       - rebased to W + (mb - 1) * 22 before the first iteration, then
 *           advanced by 22 per iteration.
 * rs      - stride handed to WS() to form the element offsets.
 * mb, me  - half-open iteration range.
 * ms      - per-iteration pointer step (see above).
 *
 * R, E, INT, stride, DK, WS, FMA, FNMS and MAKE_VOLATILE_STRIDE come from the
 * included headers. NOTE(review): FMA(a, b, c) is presumably a*b + c and
 * FNMS(a, b, c) presumably c - a*b -- confirm against the FFTW headers.
 */
Chris@42 37 static void hc2cb_12(R *Rp, R *Ip, R *Rm, R *Im, const R *W, stride rs, INT mb, INT me, INT ms)
Chris@42 38 {
Chris@42 39 DK(KP866025403, +0.866025403784438646763723170752936183471402627);
Chris@42 40 DK(KP500000000, +0.500000000000000000000000000000000000000000000);
Chris@42 41 {
Chris@42 42 INT m;
Chris@42 43 for (m = mb, W = W + ((mb - 1) * 22); m < me; m = m + 1, Rp = Rp + ms, Ip = Ip + ms, Rm = Rm - ms, Im = Im - ms, W = W + 22, MAKE_VOLATILE_STRIDE(48, rs)) {
/* Declared at loop scope: the final Im/Ip[WS(rs, 3)] stores below, after all
   nested blocks have closed, still read these five values. */
Chris@42 44 E T1U, T1X, T1W, T1Y, T1V;
Chris@42 45 {
/* Intermediates produced by the load stage and consumed by the store stage. */
Chris@42 46 E T18, T20, T21, T1b, T2a, T1s, T29, T1p, TO, T11, To, Tb, Tg, T23, T1f;
Chris@42 47 E Tl, Ty, Tt, T1i, T24, T1z, T2d, T1w, T2c;
Chris@42 48 {
Chris@42 49 E T5, Ta, TN, TI;
/* First half of the inputs: Rp/Ip at offsets 0, 4, 2 and Rm/Im at 5, 3, 1. */
Chris@42 50 {
Chris@42 51 E T1, TE, T6, TM, T7, T1o, T4, T17, TH, T8, TJ, TK;
Chris@42 52 T1 = Rp[0];
Chris@42 53 TE = Ip[0];
Chris@42 54 T6 = Rm[WS(rs, 5)];
Chris@42 55 TM = Im[WS(rs, 5)];
Chris@42 56 {
Chris@42 57 E T2, T3, TF, TG;
Chris@42 58 T2 = Rp[WS(rs, 4)];
Chris@42 59 T3 = Rm[WS(rs, 3)];
Chris@42 60 TF = Ip[WS(rs, 4)];
Chris@42 61 TG = Im[WS(rs, 3)];
Chris@42 62 T7 = Rm[WS(rs, 1)];
Chris@42 63 T1o = T2 - T3;
Chris@42 64 T4 = T2 + T3;
Chris@42 65 T17 = TF + TG;
Chris@42 66 TH = TF - TG;
Chris@42 67 T8 = Rp[WS(rs, 2)];
Chris@42 68 TJ = Ip[WS(rs, 2)];
Chris@42 69 TK = Im[WS(rs, 1)];
Chris@42 70 }
Chris@42 71 {
/* Sums and differences of the loaded values, scaled by 1/2 and 0.866...;
   NOTE(review): this looks like the three-point butterfly stage -- confirm. */
Chris@42 72 E T1r, T1a, T19, T1q, T9, TL, T16, T1n;
Chris@42 73 T5 = T1 + T4;
Chris@42 74 T16 = FNMS(KP500000000, T4, T1);
Chris@42 75 T1r = T7 - T8;
Chris@42 76 T9 = T7 + T8;
Chris@42 77 T1a = TJ + TK;
Chris@42 78 TL = TJ - TK;
Chris@42 79 T18 = FNMS(KP866025403, T17, T16);
Chris@42 80 T20 = FMA(KP866025403, T17, T16);
Chris@42 81 T19 = FNMS(KP500000000, T9, T6);
Chris@42 82 Ta = T6 + T9;
Chris@42 83 TN = TL - TM;
Chris@42 84 T1q = FMA(KP500000000, TL, TM);
Chris@42 85 T1n = FNMS(KP500000000, TH, TE);
Chris@42 86 TI = TE + TH;
Chris@42 87 T21 = FNMS(KP866025403, T1a, T19);
Chris@42 88 T1b = FMA(KP866025403, T1a, T19);
Chris@42 89 T2a = FMA(KP866025403, T1r, T1q);
Chris@42 90 T1s = FNMS(KP866025403, T1r, T1q);
Chris@42 91 T29 = FNMS(KP866025403, T1o, T1n);
Chris@42 92 T1p = FMA(KP866025403, T1o, T1n);
Chris@42 93 }
Chris@42 94 }
/* Second half of the inputs: Rp/Ip at offsets 3, 1, 5 and Rm/Im at 2, 4, 0.
   The TO/T11/To/Tb assignments that finish the first half are interleaved
   with these loads. */
Chris@42 95 {
Chris@42 96 E Tc, Tp, Th, Tx, Ti, Tf, T1v, Ts, T1e, Tj, Tu, Tv;
Chris@42 97 Tc = Rp[WS(rs, 3)];
Chris@42 98 TO = TI - TN;
Chris@42 99 T11 = TI + TN;
Chris@42 100 Tp = Ip[WS(rs, 3)];
Chris@42 101 To = T5 - Ta;
Chris@42 102 Tb = T5 + Ta;
Chris@42 103 Th = Rm[WS(rs, 2)];
Chris@42 104 Tx = Im[WS(rs, 2)];
Chris@42 105 {
Chris@42 106 E Td, Te, Tq, Tr;
Chris@42 107 Td = Rm[WS(rs, 4)];
Chris@42 108 Te = Rm[0];
Chris@42 109 Tq = Im[WS(rs, 4)];
Chris@42 110 Tr = Im[0];
Chris@42 111 Ti = Rp[WS(rs, 1)];
Chris@42 112 Tf = Td + Te;
Chris@42 113 T1v = Td - Te;
Chris@42 114 Ts = Tq + Tr;
Chris@42 115 T1e = Tq - Tr;
Chris@42 116 Tj = Rp[WS(rs, 5)];
Chris@42 117 Tu = Ip[WS(rs, 1)];
Chris@42 118 Tv = Ip[WS(rs, 5)];
Chris@42 119 }
Chris@42 120 {
Chris@42 121 E T1y, T1h, T1g, T1x, Tk, Tw, T1d, T1u;
Chris@42 122 T1d = FNMS(KP500000000, Tf, Tc);
Chris@42 123 Tg = Tc + Tf;
Chris@42 124 Tk = Ti + Tj;
Chris@42 125 T1y = Ti - Tj;
Chris@42 126 Tw = Tu + Tv;
Chris@42 127 T1h = Tv - Tu;
Chris@42 128 T23 = FNMS(KP866025403, T1e, T1d);
Chris@42 129 T1f = FMA(KP866025403, T1e, T1d);
Chris@42 130 Tl = Th + Tk;
Chris@42 131 T1g = FNMS(KP500000000, Tk, Th);
Chris@42 132 T1x = FMA(KP500000000, Tw, Tx);
Chris@42 133 Ty = Tw - Tx;
Chris@42 134 Tt = Tp - Ts;
Chris@42 135 T1u = FMA(KP500000000, Ts, Tp);
Chris@42 136 T1i = FMA(KP866025403, T1h, T1g);
Chris@42 137 T24 = FNMS(KP866025403, T1h, T1g);
Chris@42 138 T1z = FNMS(KP866025403, T1y, T1x);
Chris@42 139 T2d = FMA(KP866025403, T1y, T1x);
Chris@42 140 T1w = FMA(KP866025403, T1v, T1u);
Chris@42 141 T2c = FNMS(KP866025403, T1v, T1u);
Chris@42 142 }
Chris@42 143 }
Chris@42 144 }
/* Store stage. All input loads are complete by this point, so the stores
   below may overwrite Rp/Ip/Rm/Im in place. */
Chris@42 145 {
Chris@42 146 E TY, T13, TX, T10;
Chris@42 147 {
Chris@42 148 E Tn, T12, TC, Tm, TD, TS, TA, Tz;
Chris@42 149 Tn = W[16];
Chris@42 150 T12 = Tt + Ty;
Chris@42 151 Tz = Tt - Ty;
Chris@42 152 TC = W[17];
Chris@42 153 Tm = Tg + Tl;
Chris@42 154 TD = Tg - Tl;
Chris@42 155 TS = To + Tz;
Chris@42 156 TA = To - Tz;
Chris@42 157 {
Chris@42 158 E TV, TU, TW, TT;
Chris@42 159 {
Chris@42 160 E TQ, TR, TP, TB;
Chris@42 161 TV = TO - TD;
Chris@42 162 TP = TD + TO;
/* Rp[0] (and Rm[0] further down) are stored without any W factor. */
Chris@42 163 Rp[0] = Tb + Tm;
Chris@42 164 TB = Tn * TA;
Chris@42 165 TQ = Tn * TP;
Chris@42 166 TR = W[4];
/* Ip/Im[WS(rs, 4)] use W[16], W[17]. */
Chris@42 167 Ip[WS(rs, 4)] = FNMS(TC, TP, TB);
Chris@42 168 TU = W[5];
Chris@42 169 Im[WS(rs, 4)] = FMA(TC, TA, TQ);
Chris@42 170 TW = TR * TV;
Chris@42 171 TT = TR * TS;
Chris@42 172 }
/* Ip/Im[WS(rs, 1)] use W[4], W[5]. */
Chris@42 173 Im[WS(rs, 1)] = FMA(TU, TS, TW);
Chris@42 174 Ip[WS(rs, 1)] = FNMS(TU, TV, TT);
Chris@42 175 TY = Tb - Tm;
Chris@42 176 T13 = T11 - T12;
Chris@42 177 TX = W[10];
Chris@42 178 T10 = W[11];
Chris@42 179 Rm[0] = T11 + T12;
Chris@42 180 }
Chris@42 181 }
Chris@42 182 {
Chris@42 183 E T1K, T1Q, T1P, T1L, T2o, T2u, T2t, T2p;
Chris@42 184 {
Chris@42 185 E T1E, T1D, T1H, T1F, T1G, T1t, T1k, T1A;
Chris@42 186 {
Chris@42 187 E T1c, TZ, T14, T1j;
Chris@42 188 T1K = T18 - T1b;
Chris@42 189 T1c = T18 + T1b;
Chris@42 190 TZ = TX * TY;
Chris@42 191 T14 = T10 * TY;
Chris@42 192 T1j = T1f + T1i;
Chris@42 193 T1Q = T1f - T1i;
Chris@42 194 T1P = T1p + T1s;
Chris@42 195 T1t = T1p - T1s;
/* Rp/Rm[WS(rs, 3)] use W[10], W[11]. */
Chris@42 196 Rp[WS(rs, 3)] = FNMS(T10, T13, TZ);
Chris@42 197 Rm[WS(rs, 3)] = FMA(TX, T13, T14);
Chris@42 198 T1E = T1c + T1j;
Chris@42 199 T1k = T1c - T1j;
Chris@42 200 T1A = T1w - T1z;
Chris@42 201 T1L = T1w + T1z;
Chris@42 202 }
Chris@42 203 {
Chris@42 204 E T15, T1m, T1B, T1l, T1C;
Chris@42 205 T15 = W[18];
Chris@42 206 T1m = W[19];
Chris@42 207 T1D = W[6];
Chris@42 208 T1H = T1t + T1A;
Chris@42 209 T1B = T1t - T1A;
Chris@42 210 T1l = T15 * T1k;
Chris@42 211 T1C = T1m * T1k;
Chris@42 212 T1F = T1D * T1E;
Chris@42 213 T1G = W[7];
/* Rp/Rm[WS(rs, 5)] use W[18], W[19]. */
Chris@42 214 Rp[WS(rs, 5)] = FNMS(T1m, T1B, T1l);
Chris@42 215 Rm[WS(rs, 5)] = FMA(T15, T1B, T1C);
Chris@42 216 }
Chris@42 217 {
Chris@42 218 E T26, T2i, T2l, T2f, T1Z, T28;
Chris@42 219 {
Chris@42 220 E T22, T1I, T25, T2b, T2e;
Chris@42 221 T22 = T20 + T21;
Chris@42 222 T2o = T20 - T21;
/* Rp/Rm[WS(rs, 2)] use W[6], W[7]. */
Chris@42 223 Rp[WS(rs, 2)] = FNMS(T1G, T1H, T1F);
Chris@42 224 T1I = T1G * T1E;
Chris@42 225 T2u = T23 - T24;
Chris@42 226 T25 = T23 + T24;
Chris@42 227 T2b = T29 - T2a;
Chris@42 228 T2t = T29 + T2a;
Chris@42 229 T2p = T2c + T2d;
Chris@42 230 T2e = T2c - T2d;
Chris@42 231 Rm[WS(rs, 2)] = FMA(T1D, T1H, T1I);
Chris@42 232 T26 = T22 - T25;
Chris@42 233 T2i = T22 + T25;
Chris@42 234 T2l = T2b + T2e;
Chris@42 235 T2f = T2b - T2e;
Chris@42 236 }
Chris@42 237 T1Z = W[2];
Chris@42 238 T28 = W[3];
Chris@42 239 {
Chris@42 240 E T2h, T2k, T27, T2g, T2j, T2m;
Chris@42 241 T2h = W[14];
Chris@42 242 T2k = W[15];
Chris@42 243 T27 = T1Z * T26;
Chris@42 244 T2g = T28 * T26;
Chris@42 245 T2j = T2h * T2i;
Chris@42 246 T2m = T2k * T2i;
/* Rp/Rm[WS(rs, 1)] use W[2], W[3]; Rp/Rm[WS(rs, 4)] use W[14], W[15]. */
Chris@42 247 Rp[WS(rs, 1)] = FNMS(T28, T2f, T27);
Chris@42 248 Rm[WS(rs, 1)] = FMA(T1Z, T2f, T2g);
Chris@42 249 Rp[WS(rs, 4)] = FNMS(T2k, T2l, T2j);
Chris@42 250 Rm[WS(rs, 4)] = FMA(T2h, T2l, T2m);
Chris@42 251 }
Chris@42 252 }
Chris@42 253 }
Chris@42 254 {
Chris@42 255 E T2y, T2B, T2A, T2C, T2z;
Chris@42 256 {
Chris@42 257 E T2n, T2q, T2v, T2s, T2r, T2x, T2w;
Chris@42 258 T2n = W[8];
Chris@42 259 T2y = T2o + T2p;
Chris@42 260 T2q = T2o - T2p;
Chris@42 261 T2B = T2t - T2u;
Chris@42 262 T2v = T2t + T2u;
Chris@42 263 T2s = W[9];
Chris@42 264 T2r = T2n * T2q;
Chris@42 265 T2x = W[20];
Chris@42 266 T2w = T2n * T2v;
Chris@42 267 T2A = W[21];
/* Ip/Im[WS(rs, 2)] use W[8], W[9]. */
Chris@42 268 Ip[WS(rs, 2)] = FNMS(T2s, T2v, T2r);
Chris@42 269 T2C = T2x * T2B;
Chris@42 270 T2z = T2x * T2y;
Chris@42 271 Im[WS(rs, 2)] = FMA(T2s, T2q, T2w);
Chris@42 272 }
/* Ip/Im[WS(rs, 5)] use W[20], W[21]. */
Chris@42 273 Im[WS(rs, 5)] = FMA(T2A, T2y, T2C);
Chris@42 274 Ip[WS(rs, 5)] = FNMS(T2A, T2B, T2z);
Chris@42 275 {
Chris@42 276 E T1J, T1M, T1R, T1O, T1N, T1T, T1S;
Chris@42 277 T1J = W[0];
Chris@42 278 T1U = T1K + T1L;
Chris@42 279 T1M = T1K - T1L;
Chris@42 280 T1X = T1P - T1Q;
Chris@42 281 T1R = T1P + T1Q;
Chris@42 282 T1O = W[1];
Chris@42 283 T1N = T1J * T1M;
Chris@42 284 T1T = W[12];
Chris@42 285 T1S = T1J * T1R;
Chris@42 286 T1W = W[13];
/* Ip/Im[0] use W[0], W[1]. */
Chris@42 287 Ip[0] = FNMS(T1O, T1R, T1N);
Chris@42 288 T1Y = T1T * T1X;
Chris@42 289 T1V = T1T * T1U;
Chris@42 290 Im[0] = FMA(T1O, T1M, T1S);
Chris@42 291 }
Chris@42 292 }
Chris@42 293 }
Chris@42 294 }
Chris@42 295 }
/* Last output pair, Ip/Im[WS(rs, 3)], built from W[12] (already folded into
   T1Y and T1V) and W[13] (T1W). */
Chris@42 296 Im[WS(rs, 3)] = FMA(T1W, T1U, T1Y);
Chris@42 297 Ip[WS(rs, 3)] = FNMS(T1W, T1X, T1V);
Chris@42 298 }
Chris@42 299 }
Chris@42 300 }
Chris@42 301
/*
 * Twiddle instruction table referenced by the codelet descriptor `desc`.
 * NOTE(review): the {TW_FULL, 1, 12} entry followed by a TW_NEXT entry
 * presumably requests the full twiddle set for a size-12 transform and then
 * terminates the list -- confirm against the tw_instr definition.
 */
Chris@42 302 static const tw_instr twinstr[] = {
Chris@42 303 {TW_FULL, 1, 12},
Chris@42 304 {TW_NEXT, 1, 0}
Chris@42 305 };
Chris@42 306
/*
 * Descriptor for the FMA build of this codelet: the value 12, the name
 * string "hc2cb_12", the twiddle table, &GENUS, and the tuple
 * {72, 22, 46, 0}, whose first three entries equal the addition /
 * multiplication / fused multiply-add counts quoted in the generator comment
 * for this variant. NOTE(review): field meanings are inferred from the
 * values; confirm against the hc2c_desc definition.
 */
Chris@42 307 static const hc2c_desc desc = { 12, "hc2cb_12", twinstr, &GENUS, {72, 22, 46, 0} };
Chris@42 308
/*
 * Registration entry point (FMA build): passes the hc2cb_12 kernel, its
 * descriptor and HC2C_VIA_RDFT to X(khc2c_register) for planner p.
 * X() is a macro from the included headers; presumably it applies the
 * library's name prefix -- confirm.
 */
Chris@42 309 void X(codelet_hc2cb_12) (planner *p) {
Chris@42 310 X(khc2c_register) (p, hc2cb_12, &desc, HC2C_VIA_RDFT);
Chris@42 311 }
Chris@42 312 #else /* HAVE_FMA */
Chris@42 313
Chris@42 314 /* Generated by: ../../../genfft/gen_hc2c.native -compact -variables 4 -pipeline-latency 4 -sign 1 -n 12 -dif -name hc2cb_12 -include hc2cb.h */
Chris@42 315
Chris@42 316 /*
Chris@42 317 * This function contains 118 FP additions, 60 FP multiplications,
Chris@42 318 * (or, 88 additions, 30 multiplications, 30 fused multiply/add),
Chris@42 319 * 39 stack variables, 2 constants, and 48 memory accesses
Chris@42 320 */
Chris@42 321 #include "hc2cb.h"
Chris@42 322
/*
 * hc2cb_12, non-FMA variant: size-12 kernel emitted by genfft (see the
 * "Generated by" comment above for the exact generator flags).
 *
 * For every m in [mb, me) the loop body runs two phases:
 *   1. four load groups, each reading six values from Rp/Ip/Rm/Im and
 *      reducing them to six intermediates using the constants 1/2 and
 *      0.866025403...;
 *   2. six store groups, each writing four outputs back into the same
 *      arrays. Every output except Rp[0] and Rm[0] is formed with a pair of
 *      factors W[2k], W[2k+1]; all of W[0..21] are used.
 *
 * Rp, Ip  - advanced by +ms after each iteration.
 * Rm, Im  - advanced by -ms after each iteration.
 * W       - rebased to W + (mb - 1) * 22 before the first iteration, then
 *           advanced by 22 per iteration.
 * rs      - stride handed to WS() to form the element offsets.
 * mb, me  - half-open iteration range.
 * ms      - per-iteration pointer step (see above).
 *
 * R, E, INT, stride, DK, WS, FMA, FNMS and MAKE_VOLATILE_STRIDE come from the
 * included headers. NOTE(review): FMA(a, b, c) is presumably a*b + c and
 * FNMS(a, b, c) presumably c - a*b -- confirm against the FFTW headers.
 */
Chris@42 323 static void hc2cb_12(R *Rp, R *Ip, R *Rm, R *Im, const R *W, stride rs, INT mb, INT me, INT ms)
Chris@42 324 {
Chris@42 325 DK(KP500000000, +0.500000000000000000000000000000000000000000000);
Chris@42 326 DK(KP866025403, +0.866025403784438646763723170752936183471402627);
Chris@42 327 {
Chris@42 328 INT m;
Chris@42 329 for (m = mb, W = W + ((mb - 1) * 22); m < me; m = m + 1, Rp = Rp + ms, Ip = Ip + ms, Rm = Rm - ms, Im = Im - ms, W = W + 22, MAKE_VOLATILE_STRIDE(48, rs)) {
/* The 24 intermediates carried from the load groups to the store groups. */
Chris@42 330 E T5, TH, T12, T1M, T1i, T1U, Tl, Ty, T1c, T1Y, T1s, T1Q, Ta, TM, T15;
Chris@42 331 E T1N, T1l, T1V, Tg, Tt, T19, T1X, T1p, T1P;
/* Load group 1: Rp[0], Ip[0], Rp/Ip[WS(rs, 4)], Rm/Im[WS(rs, 3)]
   -> T5, TH, T12, T1M, T1i, T1U. */
Chris@42 332 {
Chris@42 333 E T1, TD, T4, T1g, TG, T11, T10, T1h;
Chris@42 334 T1 = Rp[0];
Chris@42 335 TD = Ip[0];
Chris@42 336 {
Chris@42 337 E T2, T3, TE, TF;
Chris@42 338 T2 = Rp[WS(rs, 4)];
Chris@42 339 T3 = Rm[WS(rs, 3)];
Chris@42 340 T4 = T2 + T3;
Chris@42 341 T1g = KP866025403 * (T2 - T3);
Chris@42 342 TE = Ip[WS(rs, 4)];
Chris@42 343 TF = Im[WS(rs, 3)];
Chris@42 344 TG = TE - TF;
Chris@42 345 T11 = KP866025403 * (TE + TF);
Chris@42 346 }
Chris@42 347 T5 = T1 + T4;
Chris@42 348 TH = TD + TG;
Chris@42 349 T10 = FNMS(KP500000000, T4, T1);
Chris@42 350 T12 = T10 - T11;
Chris@42 351 T1M = T10 + T11;
Chris@42 352 T1h = FNMS(KP500000000, TG, TD);
Chris@42 353 T1i = T1g + T1h;
Chris@42 354 T1U = T1h - T1g;
Chris@42 355 }
/* Load group 2: Rm/Im[WS(rs, 2)], Rp/Ip[WS(rs, 1)], Rp/Ip[WS(rs, 5)]
   -> Tl, Ty, T1c, T1Y, T1s, T1Q. */
Chris@42 356 {
Chris@42 357 E Th, Tx, Tk, T1a, Tw, T1r, T1b, T1q;
Chris@42 358 Th = Rm[WS(rs, 2)];
Chris@42 359 Tx = Im[WS(rs, 2)];
Chris@42 360 {
Chris@42 361 E Ti, Tj, Tu, Tv;
Chris@42 362 Ti = Rp[WS(rs, 1)];
Chris@42 363 Tj = Rp[WS(rs, 5)];
Chris@42 364 Tk = Ti + Tj;
Chris@42 365 T1a = KP866025403 * (Ti - Tj);
Chris@42 366 Tu = Ip[WS(rs, 1)];
Chris@42 367 Tv = Ip[WS(rs, 5)];
Chris@42 368 Tw = Tu + Tv;
Chris@42 369 T1r = KP866025403 * (Tv - Tu);
Chris@42 370 }
Chris@42 371 Tl = Th + Tk;
Chris@42 372 Ty = Tw - Tx;
Chris@42 373 T1b = FMA(KP500000000, Tw, Tx);
Chris@42 374 T1c = T1a - T1b;
Chris@42 375 T1Y = T1a + T1b;
Chris@42 376 T1q = FNMS(KP500000000, Tk, Th);
Chris@42 377 T1s = T1q + T1r;
Chris@42 378 T1Q = T1q - T1r;
Chris@42 379 }
/* Load group 3: Rm/Im[WS(rs, 5)], Rm/Im[WS(rs, 1)], Rp/Ip[WS(rs, 2)]
   -> Ta, TM, T15, T1N, T1l, T1V. */
Chris@42 380 {
Chris@42 381 E T6, TL, T9, T1j, TK, T14, T13, T1k;
Chris@42 382 T6 = Rm[WS(rs, 5)];
Chris@42 383 TL = Im[WS(rs, 5)];
Chris@42 384 {
Chris@42 385 E T7, T8, TI, TJ;
Chris@42 386 T7 = Rm[WS(rs, 1)];
Chris@42 387 T8 = Rp[WS(rs, 2)];
Chris@42 388 T9 = T7 + T8;
Chris@42 389 T1j = KP866025403 * (T7 - T8);
Chris@42 390 TI = Ip[WS(rs, 2)];
Chris@42 391 TJ = Im[WS(rs, 1)];
Chris@42 392 TK = TI - TJ;
Chris@42 393 T14 = KP866025403 * (TI + TJ);
Chris@42 394 }
Chris@42 395 Ta = T6 + T9;
Chris@42 396 TM = TK - TL;
Chris@42 397 T13 = FNMS(KP500000000, T9, T6);
Chris@42 398 T15 = T13 + T14;
Chris@42 399 T1N = T13 - T14;
Chris@42 400 T1k = FMA(KP500000000, TK, TL);
Chris@42 401 T1l = T1j - T1k;
Chris@42 402 T1V = T1j + T1k;
Chris@42 403 }
/* Load group 4: Rp/Ip[WS(rs, 3)], Rm/Im[WS(rs, 4)], Rm/Im[0]
   -> Tg, Tt, T19, T1X, T1p, T1P. */
Chris@42 404 {
Chris@42 405 E Tc, Tp, Tf, T17, Ts, T1o, T18, T1n;
Chris@42 406 Tc = Rp[WS(rs, 3)];
Chris@42 407 Tp = Ip[WS(rs, 3)];
Chris@42 408 {
Chris@42 409 E Td, Te, Tq, Tr;
Chris@42 410 Td = Rm[WS(rs, 4)];
Chris@42 411 Te = Rm[0];
Chris@42 412 Tf = Td + Te;
Chris@42 413 T17 = KP866025403 * (Td - Te);
Chris@42 414 Tq = Im[WS(rs, 4)];
Chris@42 415 Tr = Im[0];
Chris@42 416 Ts = Tq + Tr;
Chris@42 417 T1o = KP866025403 * (Tq - Tr);
Chris@42 418 }
Chris@42 419 Tg = Tc + Tf;
Chris@42 420 Tt = Tp - Ts;
Chris@42 421 T18 = FMA(KP500000000, Ts, Tp);
Chris@42 422 T19 = T17 + T18;
Chris@42 423 T1X = T18 - T17;
Chris@42 424 T1n = FNMS(KP500000000, Tf, Tc);
Chris@42 425 T1p = T1n + T1o;
Chris@42 426 T1P = T1n - T1o;
Chris@42 427 }
/* All inputs have been read; the stores below overwrite the arrays in place. */
/* Store group 1: Rp[0] and Rm[0] with no W factor; Rp/Rm[WS(rs, 3)] with
   W[10], W[11]. */
Chris@42 428 {
Chris@42 429 E Tb, Tm, TU, TW, TX, TY, TT, TV;
Chris@42 430 Tb = T5 + Ta;
Chris@42 431 Tm = Tg + Tl;
Chris@42 432 TU = Tb - Tm;
Chris@42 433 TW = TH + TM;
Chris@42 434 TX = Tt + Ty;
Chris@42 435 TY = TW - TX;
Chris@42 436 Rp[0] = Tb + Tm;
Chris@42 437 Rm[0] = TW + TX;
Chris@42 438 TT = W[10];
Chris@42 439 TV = W[11];
Chris@42 440 Rp[WS(rs, 3)] = FNMS(TV, TY, TT * TU);
Chris@42 441 Rm[WS(rs, 3)] = FMA(TV, TU, TT * TY);
Chris@42 442 }
/* Store group 2: Ip/Im[WS(rs, 4)] with W[16], W[17]; Ip/Im[WS(rs, 1)] with
   W[4], W[5]. */
Chris@42 443 {
Chris@42 444 E TA, TQ, TO, TS;
Chris@42 445 {
Chris@42 446 E To, Tz, TC, TN;
Chris@42 447 To = T5 - Ta;
Chris@42 448 Tz = Tt - Ty;
Chris@42 449 TA = To - Tz;
Chris@42 450 TQ = To + Tz;
Chris@42 451 TC = Tg - Tl;
Chris@42 452 TN = TH - TM;
Chris@42 453 TO = TC + TN;
Chris@42 454 TS = TN - TC;
Chris@42 455 }
Chris@42 456 {
Chris@42 457 E Tn, TB, TP, TR;
Chris@42 458 Tn = W[16];
Chris@42 459 TB = W[17];
Chris@42 460 Ip[WS(rs, 4)] = FNMS(TB, TO, Tn * TA);
Chris@42 461 Im[WS(rs, 4)] = FMA(Tn, TO, TB * TA);
Chris@42 462 TP = W[4];
Chris@42 463 TR = W[5];
Chris@42 464 Ip[WS(rs, 1)] = FNMS(TR, TS, TP * TQ);
Chris@42 465 Im[WS(rs, 1)] = FMA(TP, TS, TR * TQ);
Chris@42 466 }
Chris@42 467 }
/* Store group 3: Ip/Im[WS(rs, 2)] with W[8], W[9]; Ip/Im[WS(rs, 5)] with
   W[20], W[21]. */
Chris@42 468 {
Chris@42 469 E T28, T2e, T2c, T2g;
Chris@42 470 {
Chris@42 471 E T26, T27, T2a, T2b;
Chris@42 472 T26 = T1M - T1N;
Chris@42 473 T27 = T1X + T1Y;
Chris@42 474 T28 = T26 - T27;
Chris@42 475 T2e = T26 + T27;
Chris@42 476 T2a = T1U + T1V;
Chris@42 477 T2b = T1P - T1Q;
Chris@42 478 T2c = T2a + T2b;
Chris@42 479 T2g = T2a - T2b;
Chris@42 480 }
Chris@42 481 {
Chris@42 482 E T25, T29, T2d, T2f;
Chris@42 483 T25 = W[8];
Chris@42 484 T29 = W[9];
Chris@42 485 Ip[WS(rs, 2)] = FNMS(T29, T2c, T25 * T28);
Chris@42 486 Im[WS(rs, 2)] = FMA(T25, T2c, T29 * T28);
Chris@42 487 T2d = W[20];
Chris@42 488 T2f = W[21];
Chris@42 489 Ip[WS(rs, 5)] = FNMS(T2f, T2g, T2d * T2e);
Chris@42 490 Im[WS(rs, 5)] = FMA(T2d, T2g, T2f * T2e);
Chris@42 491 }
Chris@42 492 }
/* Store group 4: Rp/Rm[WS(rs, 1)] with W[2], W[3]; Rp/Rm[WS(rs, 4)] with
   W[14], W[15]. */
Chris@42 493 {
Chris@42 494 E T1S, T22, T20, T24;
Chris@42 495 {
Chris@42 496 E T1O, T1R, T1W, T1Z;
Chris@42 497 T1O = T1M + T1N;
Chris@42 498 T1R = T1P + T1Q;
Chris@42 499 T1S = T1O - T1R;
Chris@42 500 T22 = T1O + T1R;
Chris@42 501 T1W = T1U - T1V;
Chris@42 502 T1Z = T1X - T1Y;
Chris@42 503 T20 = T1W - T1Z;
Chris@42 504 T24 = T1W + T1Z;
Chris@42 505 }
Chris@42 506 {
Chris@42 507 E T1L, T1T, T21, T23;
Chris@42 508 T1L = W[2];
Chris@42 509 T1T = W[3];
Chris@42 510 Rp[WS(rs, 1)] = FNMS(T1T, T20, T1L * T1S);
Chris@42 511 Rm[WS(rs, 1)] = FMA(T1T, T1S, T1L * T20);
Chris@42 512 T21 = W[14];
Chris@42 513 T23 = W[15];
Chris@42 514 Rp[WS(rs, 4)] = FNMS(T23, T24, T21 * T22);
Chris@42 515 Rm[WS(rs, 4)] = FMA(T23, T22, T21 * T24);
Chris@42 516 }
Chris@42 517 }
/* Store group 5: Rp/Rm[WS(rs, 5)] with W[18], W[19]; Rp/Rm[WS(rs, 2)] with
   W[6], W[7]. */
Chris@42 518 {
Chris@42 519 E T1C, T1I, T1G, T1K;
Chris@42 520 {
Chris@42 521 E T1A, T1B, T1E, T1F;
Chris@42 522 T1A = T12 + T15;
Chris@42 523 T1B = T1p + T1s;
Chris@42 524 T1C = T1A - T1B;
Chris@42 525 T1I = T1A + T1B;
Chris@42 526 T1E = T1i + T1l;
Chris@42 527 T1F = T19 + T1c;
Chris@42 528 T1G = T1E - T1F;
Chris@42 529 T1K = T1E + T1F;
Chris@42 530 }
Chris@42 531 {
Chris@42 532 E T1z, T1D, T1H, T1J;
Chris@42 533 T1z = W[18];
Chris@42 534 T1D = W[19];
Chris@42 535 Rp[WS(rs, 5)] = FNMS(T1D, T1G, T1z * T1C);
Chris@42 536 Rm[WS(rs, 5)] = FMA(T1D, T1C, T1z * T1G);
Chris@42 537 T1H = W[6];
Chris@42 538 T1J = W[7];
Chris@42 539 Rp[WS(rs, 2)] = FNMS(T1J, T1K, T1H * T1I);
Chris@42 540 Rm[WS(rs, 2)] = FMA(T1J, T1I, T1H * T1K);
Chris@42 541 }
Chris@42 542 }
/* Store group 6: Ip/Im[0] with W[0], W[1]; Ip/Im[WS(rs, 3)] with W[12],
   W[13]. */
Chris@42 543 {
Chris@42 544 E T1e, T1w, T1u, T1y;
Chris@42 545 {
Chris@42 546 E T16, T1d, T1m, T1t;
Chris@42 547 T16 = T12 - T15;
Chris@42 548 T1d = T19 - T1c;
Chris@42 549 T1e = T16 - T1d;
Chris@42 550 T1w = T16 + T1d;
Chris@42 551 T1m = T1i - T1l;
Chris@42 552 T1t = T1p - T1s;
Chris@42 553 T1u = T1m + T1t;
Chris@42 554 T1y = T1m - T1t;
Chris@42 555 }
Chris@42 556 {
Chris@42 557 E TZ, T1f, T1v, T1x;
Chris@42 558 TZ = W[0];
Chris@42 559 T1f = W[1];
Chris@42 560 Ip[0] = FNMS(T1f, T1u, TZ * T1e);
Chris@42 561 Im[0] = FMA(TZ, T1u, T1f * T1e);
Chris@42 562 T1v = W[12];
Chris@42 563 T1x = W[13];
Chris@42 564 Ip[WS(rs, 3)] = FNMS(T1x, T1y, T1v * T1w);
Chris@42 565 Im[WS(rs, 3)] = FMA(T1v, T1y, T1x * T1w);
Chris@42 566 }
Chris@42 567 }
Chris@42 568 }
Chris@42 569 }
Chris@42 570 }
Chris@42 571
/*
 * Twiddle instruction table referenced by the codelet descriptor `desc`
 * (non-FMA build; same contents as the FMA build's table).
 * NOTE(review): the {TW_FULL, 1, 12} entry followed by a TW_NEXT entry
 * presumably requests the full twiddle set for a size-12 transform and then
 * terminates the list -- confirm against the tw_instr definition.
 */
Chris@42 572 static const tw_instr twinstr[] = {
Chris@42 573 {TW_FULL, 1, 12},
Chris@42 574 {TW_NEXT, 1, 0}
Chris@42 575 };
Chris@42 576
/*
 * Descriptor for the non-FMA build of this codelet: the value 12, the name
 * string "hc2cb_12", the twiddle table, &GENUS, and the tuple
 * {88, 30, 30, 0}, whose first three entries equal the addition /
 * multiplication / fused multiply-add counts quoted in the generator comment
 * for this variant. NOTE(review): field meanings are inferred from the
 * values; confirm against the hc2c_desc definition.
 */
Chris@42 577 static const hc2c_desc desc = { 12, "hc2cb_12", twinstr, &GENUS, {88, 30, 30, 0} };
Chris@42 578
/*
 * Registration entry point (non-FMA build): passes the hc2cb_12 kernel, its
 * descriptor and HC2C_VIA_RDFT to X(khc2c_register) for planner p.
 * X() is a macro from the included headers; presumably it applies the
 * library's name prefix -- confirm.
 */
Chris@42 579 void X(codelet_hc2cb_12) (planner *p) {
Chris@42 580 X(khc2c_register) (p, hc2cb_12, &desc, HC2C_VIA_RDFT);
Chris@42 581 }
Chris@42 582 #endif /* HAVE_FMA */