annotate src/fftw-3.3.5/rdft/scalar/r2cf/hc2cfdft_10.c @ 83:ae30d91d2ffe

Replace these with versions built using an older toolset (so as to avoid ABI incompatibilities when linking on Ubuntu 14.04 for packaging purposes)
author Chris Cannam
date Fri, 07 Feb 2020 11:51:13 +0000
parents 2cd0e3b3e1fd
children
rev   line source
Chris@42 1 /*
Chris@42 2 * Copyright (c) 2003, 2007-14 Matteo Frigo
Chris@42 3 * Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
Chris@42 4 *
Chris@42 5 * This program is free software; you can redistribute it and/or modify
Chris@42 6 * it under the terms of the GNU General Public License as published by
Chris@42 7 * the Free Software Foundation; either version 2 of the License, or
Chris@42 8 * (at your option) any later version.
Chris@42 9 *
Chris@42 10 * This program is distributed in the hope that it will be useful,
Chris@42 11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
Chris@42 12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
Chris@42 13 * GNU General Public License for more details.
Chris@42 14 *
Chris@42 15 * You should have received a copy of the GNU General Public License
Chris@42 16 * along with this program; if not, write to the Free Software
Chris@42 17 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
Chris@42 18 *
Chris@42 19 */
Chris@42 20
Chris@42 21 /* This file was automatically generated --- DO NOT EDIT */
Chris@42 22 /* Generated on Sat Jul 30 16:48:41 EDT 2016 */
Chris@42 23
Chris@42 24 #include "codelet-rdft.h"
Chris@42 25
Chris@42 26 #ifdef HAVE_FMA
Chris@42 27
Chris@42 28 /* Generated by: ../../../genfft/gen_hc2cdft.native -fma -reorder-insns -schedule-for-pipeline -compact -variables 4 -pipeline-latency 4 -n 10 -dit -name hc2cfdft_10 -include hc2cf.h */
Chris@42 29
Chris@42 30 /*
Chris@42 31 * This function contains 122 FP additions, 92 FP multiplications,
Chris@42 32 * (or, 68 additions, 38 multiplications, 54 fused multiply/add),
Chris@42 33 * 94 stack variables, 5 constants, and 40 memory accesses
Chris@42 34 */
Chris@42 35 #include "hc2cf.h"
Chris@42 36
/*
 * hc2cfdft_10 -- variant compiled when HAVE_FMA is defined.
 *
 * For every m in [mb, me) the kernel loads five elements (indices WS(rs, 0)
 * through WS(rs, 4)) from each of Rp, Ip, Rm and Im, together with the
 * eighteen coefficients W[0] .. W[17], and stores twenty results back into
 * the same four arrays.  In source order, every load of an iteration comes
 * before the first store of that iteration.
 *
 *   Rp, Ip  read and written; stepped by +ms after each iteration.
 *   Rm, Im  read and written; stepped by -ms after each iteration.
 *   W       read only; offset by (mb - 1) * 18 on entry, then stepped by 18
 *           per iteration.
 *   rs      stride passed to WS() to address the five elements per array.
 *   mb, me  half-open iteration range.
 *   ms      pointer step applied per iteration.
 *
 * NOTE(review): R, E, INT, stride, DK, WS, FMA, FNMS and MAKE_VOLATILE_STRIDE
 * are defined in headers not visible here.  FMA(a, b, c) is presumably
 * a * b + c and FNMS(a, b, c) presumably c - a * b -- confirm against the
 * codelet headers.  Per the generator command line recorded above this
 * function (-n 10 -dit) this is presumably a size-10 twiddle codelet.
 */
Chris@42 37 static void hc2cfdft_10(R *Rp, R *Ip, R *Rm, R *Im, const R *W, stride rs, INT mb, INT me, INT ms)
Chris@42 38 {
/* Numerically: KP951056516 = sin(2*pi/5), KP559016994 = sqrt(5)/4,
 * KP618033988 = (sqrt(5) - 1)/2. */
Chris@42 39 DK(KP951056516, +0.951056516295153572116439333379382143405698634);
Chris@42 40 DK(KP559016994, +0.559016994374947424102293417182819058860154590);
Chris@42 41 DK(KP250000000, +0.250000000000000000000000000000000000000000000);
Chris@42 42 DK(KP500000000, +0.500000000000000000000000000000000000000000000);
Chris@42 43 DK(KP618033988, +0.618033988749894848204586834365638117720309180);
Chris@42 44 {
Chris@42 45 INT m;
/* One pass per m; the pointer updates and MAKE_VOLATILE_STRIDE all live in
 * the increment expression of the for statement. */
Chris@42 46 for (m = mb, W = W + ((mb - 1) * 18); m < me; m = m + 1, Rp = Rp + ms, Ip = Ip + ms, Rm = Rm - ms, Im = Im - ms, W = W + 18, MAKE_VOLATILE_STRIDE(40, rs)) {
/* These five temporaries are produced inside the next block and consumed by
 * the final block of the loop body. */
Chris@42 47 E T1x, T1I, T1T, T22, T20;
Chris@42 48 {
Chris@42 49 E T3, T1u, T1S, T2f, Td, T1w, T14, T1p, T1j, T1q, T1N, T2e, T1z, To, T2i;
Chris@42 50 E T1H, TQ, T1n, Ty, T1B;
/* Load phase: all reads of Rp/Ip/Rm/Im and of W[0..17] happen in this block,
 * interleaved with the products of the loaded sums/differences and the W
 * coefficients. */
Chris@42 51 {
Chris@42 52 E T1h, TW, Tc, T1b, T1g, T1f, T1Q, TV, T7, TS, T1J, TU, Ts, T19, T18;
Chris@42 53 E T15, Tx, T17, T1O, T1A, Tt, TD, Ti, TE, Tn, TA, T1F, TC, T1y, Tj;
Chris@42 54 E T11, T12, TJ, TZ, TO, TY, TG, T1L, T1e, T1, T2;
Chris@42 55 T1 = Ip[0];
Chris@42 56 T2 = Im[0];
Chris@42 57 {
Chris@42 58 E Ta, Tb, T1c, T1d;
Chris@42 59 Ta = Rp[WS(rs, 2)];
Chris@42 60 Tb = Rm[WS(rs, 2)];
Chris@42 61 T1c = Rm[0];
Chris@42 62 T1h = T1 + T2;
Chris@42 63 T3 = T1 - T2;
Chris@42 64 T1d = Rp[0];
Chris@42 65 TW = Ta + Tb;
Chris@42 66 Tc = Ta - Tb;
Chris@42 67 T1b = W[0];
Chris@42 68 T1u = T1d + T1c;
Chris@42 69 T1e = T1c - T1d;
Chris@42 70 T1g = W[1];
Chris@42 71 }
Chris@42 72 {
Chris@42 73 E T16, Tp, TT, T5, T6, TB, Tf;
Chris@42 74 T5 = Ip[WS(rs, 2)];
Chris@42 75 T6 = Im[WS(rs, 2)];
Chris@42 76 T1f = T1b * T1e;
Chris@42 77 T1Q = T1g * T1e;
Chris@42 78 TV = W[7];
Chris@42 79 T7 = T5 + T6;
Chris@42 80 TT = T5 - T6;
Chris@42 81 TS = W[6];
Chris@42 82 {
Chris@42 83 E Tv, Tw, Tq, Tr;
Chris@42 84 Tq = Rm[WS(rs, 3)];
Chris@42 85 Tr = Rp[WS(rs, 3)];
Chris@42 86 T1J = TV * TT;
Chris@42 87 TU = TS * TT;
Chris@42 88 Tv = Ip[WS(rs, 3)];
Chris@42 89 Ts = Tq - Tr;
Chris@42 90 T19 = Tr + Tq;
Chris@42 91 Tw = Im[WS(rs, 3)];
Chris@42 92 T18 = W[11];
Chris@42 93 T15 = W[10];
Chris@42 94 Tx = Tv + Tw;
Chris@42 95 T16 = Tv - Tw;
Chris@42 96 Tp = W[12];
Chris@42 97 }
Chris@42 98 {
Chris@42 99 E Tg, Th, Tl, Tm;
Chris@42 100 Tg = Ip[WS(rs, 1)];
Chris@42 101 T17 = T15 * T16;
Chris@42 102 T1O = T18 * T16;
Chris@42 103 T1A = Tp * Tx;
Chris@42 104 Tt = Tp * Ts;
Chris@42 105 Th = Im[WS(rs, 1)];
Chris@42 106 Tl = Rp[WS(rs, 1)];
Chris@42 107 Tm = Rm[WS(rs, 1)];
Chris@42 108 TD = W[5];
Chris@42 109 Ti = Tg - Th;
Chris@42 110 TE = Tg + Th;
Chris@42 111 Tn = Tl + Tm;
Chris@42 112 TB = Tm - Tl;
Chris@42 113 TA = W[4];
Chris@42 114 Tf = W[2];
Chris@42 115 T1F = TD * TB;
Chris@42 116 }
Chris@42 117 {
Chris@42 118 E TH, TI, TM, TN;
Chris@42 119 TH = Ip[WS(rs, 4)];
Chris@42 120 TC = TA * TB;
Chris@42 121 T1y = Tf * Tn;
Chris@42 122 Tj = Tf * Ti;
Chris@42 123 TI = Im[WS(rs, 4)];
Chris@42 124 TM = Rp[WS(rs, 4)];
Chris@42 125 TN = Rm[WS(rs, 4)];
Chris@42 126 T11 = W[17];
Chris@42 127 T12 = TH + TI;
Chris@42 128 TJ = TH - TI;
Chris@42 129 TZ = TN - TM;
Chris@42 130 TO = TM + TN;
Chris@42 131 TY = W[16];
Chris@42 132 TG = W[14];
Chris@42 133 T1L = T11 * TZ;
Chris@42 134 }
Chris@42 135 }
/* Finish the W-weighted products with FMA/FNMS and form the first sums and
 * differences consumed by the output phase; the remaining W entries
 * (W[8], W[9], W[15], W[3], W[13]) are read here. */
Chris@42 136 {
Chris@42 137 E T10, T1D, TK, T4, T9, T1P, T1R, T8, T1v;
Chris@42 138 T10 = TY * TZ;
Chris@42 139 T1D = TG * TO;
Chris@42 140 TK = TG * TJ;
Chris@42 141 T4 = W[9];
Chris@42 142 T9 = W[8];
Chris@42 143 T1P = FMA(T15, T19, T1O);
Chris@42 144 T1R = FMA(T1b, T1h, T1Q);
Chris@42 145 T8 = T4 * T7;
Chris@42 146 T1v = T9 * T7;
Chris@42 147 {
Chris@42 148 E TX, T13, T1a, T1i;
Chris@42 149 TX = FNMS(TV, TW, TU);
Chris@42 150 T1S = T1P - T1R;
Chris@42 151 T2f = T1P + T1R;
Chris@42 152 Td = FMA(T9, Tc, T8);
Chris@42 153 T1w = FNMS(T4, Tc, T1v);
Chris@42 154 T13 = FNMS(T11, T12, T10);
Chris@42 155 T1a = FNMS(T18, T19, T17);
Chris@42 156 T1i = FNMS(T1g, T1h, T1f);
Chris@42 157 {
Chris@42 158 E T1K, T1M, TF, T1G, TL;
Chris@42 159 T1K = FMA(TS, TW, T1J);
Chris@42 160 T14 = TX + T13;
Chris@42 161 T1p = T13 - TX;
Chris@42 162 T1j = T1a + T1i;
Chris@42 163 T1q = T1i - T1a;
Chris@42 164 T1M = FMA(TY, T12, T1L);
Chris@42 165 TF = FNMS(TD, TE, TC);
Chris@42 166 T1G = FMA(TA, TE, T1F);
Chris@42 167 TL = W[15];
Chris@42 168 T1N = T1K - T1M;
Chris@42 169 T2e = T1K + T1M;
Chris@42 170 {
Chris@42 171 E Tk, T1E, TP, Tu;
Chris@42 172 Tk = W[3];
Chris@42 173 T1E = FMA(TL, TJ, T1D);
Chris@42 174 TP = FNMS(TL, TO, TK);
Chris@42 175 Tu = W[13];
Chris@42 176 T1z = FMA(Tk, Ti, T1y);
Chris@42 177 To = FNMS(Tk, Tn, Tj);
Chris@42 178 T2i = T1G + T1E;
Chris@42 179 T1H = T1E - T1G;
Chris@42 180 TQ = TF + TP;
Chris@42 181 T1n = TF - TP;
Chris@42 182 Ty = FNMS(Tu, Tx, Tt);
Chris@42 183 T1B = FMA(Tu, Ts, T1A);
Chris@42 184 }
Chris@42 185 }
Chris@42 186 }
Chris@42 187 }
Chris@42 188 }
/* Output phase, part 1: combines the intermediates and stores fifteen of the
 * twenty results (all five Ip and Im entries, plus Rp[0], Rp[WS(rs, 2)],
 * Rp[WS(rs, 4)], Rm[WS(rs, 1)] and Rm[WS(rs, 3)]).  Every stored value is
 * scaled by KP500000000; several Im stores are additionally negated.  It also
 * sets T1x, T1I, T1T, T20 and T22 for the final block. */
Chris@42 189 {
Chris@42 190 E T2p, T1t, T1m, T1C, T2o, T2m, T2k, T2w, T2y, T2n, T2d, T2l;
Chris@42 191 {
Chris@42 192 E T2g, Te, T2h, T2u, T1k, TR, T2v, Tz;
Chris@42 193 T2p = T2e + T2f;
Chris@42 194 T2g = T2e - T2f;
Chris@42 195 Te = T3 - Td;
Chris@42 196 T1t = Td + T3;
Chris@42 197 Tz = To + Ty;
Chris@42 198 T1m = Ty - To;
Chris@42 199 T2h = T1z + T1B;
Chris@42 200 T1C = T1z - T1B;
Chris@42 201 T2u = T14 - T1j;
Chris@42 202 T1k = T14 + T1j;
Chris@42 203 TR = Tz + TQ;
Chris@42 204 T2v = Tz - TQ;
Chris@42 205 {
Chris@42 206 E T2c, T2b, T2j, T1l;
Chris@42 207 T2j = T2h - T2i;
Chris@42 208 T2o = T2h + T2i;
Chris@42 209 T2c = TR - T1k;
Chris@42 210 T1l = TR + T1k;
Chris@42 211 T2m = FMA(KP618033988, T2g, T2j);
Chris@42 212 T2k = FNMS(KP618033988, T2j, T2g);
Chris@42 213 T2w = FNMS(KP618033988, T2v, T2u);
Chris@42 214 T2y = FMA(KP618033988, T2u, T2v);
Chris@42 215 Ip[0] = KP500000000 * (Te + T1l);
Chris@42 216 T2b = FNMS(KP250000000, T1l, Te);
Chris@42 217 T2n = T1u + T1w;
Chris@42 218 T1x = T1u - T1w;
Chris@42 219 T2d = FNMS(KP559016994, T2c, T2b);
Chris@42 220 T2l = FMA(KP559016994, T2c, T2b);
Chris@42 221 }
Chris@42 222 }
Chris@42 223 {
Chris@42 224 E T1o, T1Y, T28, T2a, T1Z, T1r, T2t, T2x;
Chris@42 225 {
Chris@42 226 E T26, T2s, T2q, T27, T2r;
Chris@42 227 T1I = T1C + T1H;
Chris@42 228 T26 = T1H - T1C;
Chris@42 229 Im[WS(rs, 1)] = -(KP500000000 * (FNMS(KP951056516, T2k, T2d)));
Chris@42 230 Ip[WS(rs, 2)] = KP500000000 * (FMA(KP951056516, T2k, T2d));
Chris@42 231 Im[WS(rs, 3)] = -(KP500000000 * (FNMS(KP951056516, T2m, T2l)));
Chris@42 232 Ip[WS(rs, 4)] = KP500000000 * (FMA(KP951056516, T2m, T2l));
Chris@42 233 T2s = T2o - T2p;
Chris@42 234 T2q = T2o + T2p;
Chris@42 235 T27 = T1S - T1N;
Chris@42 236 T1T = T1N + T1S;
Chris@42 237 T1o = T1m + T1n;
Chris@42 238 T1Y = T1n - T1m;
Chris@42 239 Rp[0] = KP500000000 * (T2n + T2q);
Chris@42 240 T2r = FNMS(KP250000000, T2q, T2n);
Chris@42 241 T28 = FMA(KP618033988, T27, T26);
Chris@42 242 T2a = FNMS(KP618033988, T26, T27);
Chris@42 243 T1Z = T1q - T1p;
Chris@42 244 T1r = T1p + T1q;
Chris@42 245 T2t = FNMS(KP559016994, T2s, T2r);
Chris@42 246 T2x = FMA(KP559016994, T2s, T2r);
Chris@42 247 }
Chris@42 248 {
Chris@42 249 E T24, T23, T1s, T25, T29;
Chris@42 250 T1s = T1o + T1r;
Chris@42 251 T24 = T1r - T1o;
Chris@42 252 Rm[WS(rs, 1)] = KP500000000 * (FMA(KP951056516, T2w, T2t));
Chris@42 253 Rp[WS(rs, 2)] = KP500000000 * (FNMS(KP951056516, T2w, T2t));
Chris@42 254 Rm[WS(rs, 3)] = KP500000000 * (FMA(KP951056516, T2y, T2x));
Chris@42 255 Rp[WS(rs, 4)] = KP500000000 * (FNMS(KP951056516, T2y, T2x));
Chris@42 256 Im[WS(rs, 4)] = KP500000000 * (T1s - T1t);
Chris@42 257 T23 = FMA(KP250000000, T1s, T1t);
Chris@42 258 T25 = FMA(KP559016994, T24, T23);
Chris@42 259 T29 = FNMS(KP559016994, T24, T23);
Chris@42 260 T22 = FNMS(KP618033988, T1Y, T1Z);
Chris@42 261 T20 = FMA(KP618033988, T1Z, T1Y);
Chris@42 262 Im[0] = -(KP500000000 * (FNMS(KP951056516, T28, T25)));
Chris@42 263 Ip[WS(rs, 1)] = KP500000000 * (FMA(KP951056516, T28, T25));
Chris@42 264 Im[WS(rs, 2)] = -(KP500000000 * (FNMS(KP951056516, T2a, T29)));
Chris@42 265 Ip[WS(rs, 3)] = KP500000000 * (FMA(KP951056516, T2a, T29));
Chris@42 266 }
Chris@42 267 }
Chris@42 268 }
Chris@42 269 }
/* Output phase, part 2: the remaining five stores (Rm[0], Rm[WS(rs, 2)],
 * Rm[WS(rs, 4)], Rp[WS(rs, 1)], Rp[WS(rs, 3)]) from T1x, T1I, T1T, T20, T22. */
Chris@42 270 {
Chris@42 271 E T1U, T1W, T1V, T21, T1X;
Chris@42 272 T1U = T1I + T1T;
Chris@42 273 T1W = T1I - T1T;
Chris@42 274 Rm[WS(rs, 4)] = KP500000000 * (T1x + T1U);
Chris@42 275 T1V = FNMS(KP250000000, T1U, T1x);
Chris@42 276 T21 = FNMS(KP559016994, T1W, T1V);
Chris@42 277 T1X = FMA(KP559016994, T1W, T1V);
Chris@42 278 Rm[0] = KP500000000 * (FNMS(KP951056516, T20, T1X));
Chris@42 279 Rp[WS(rs, 1)] = KP500000000 * (FMA(KP951056516, T20, T1X));
Chris@42 280 Rm[WS(rs, 2)] = KP500000000 * (FNMS(KP951056516, T22, T21));
Chris@42 281 Rp[WS(rs, 3)] = KP500000000 * (FMA(KP951056516, T22, T21));
Chris@42 282 }
Chris@42 283 }
Chris@42 284 }
Chris@42 285 }
Chris@42 286
/*
 * Two-entry tw_instr table referenced by desc below (HAVE_FMA branch).
 * NOTE(review): tw_instr, TW_FULL and TW_NEXT are declared elsewhere; this
 * presumably requests the full set of twiddle factors for radix 10, matching
 * the 18 W values hc2cfdft_10 consumes per iteration -- confirm.
 */
Chris@42 287 static const tw_instr twinstr[] = {
Chris@42 288 {TW_FULL, 1, 10},
Chris@42 289 {TW_NEXT, 1, 0}
Chris@42 290 };
Chris@42 291
/*
 * Codelet descriptor handed to the planner by the registration function
 * below.  The counts 68, 38 and 54 equal the addition, multiplication and
 * fused multiply/add figures quoted in the generated comment above the FMA
 * kernel.
 * NOTE(review): the hc2c_desc field layout and GENUS come from headers not
 * shown here; the leading 10 is presumably the radix -- confirm.
 */
Chris@42 292 static const hc2c_desc desc = { 10, "hc2cfdft_10", twinstr, &GENUS, {68, 38, 54, 0} };
Chris@42 293
/*
 * Registration entry point (HAVE_FMA branch): passes the static kernel
 * hc2cfdft_10 and its descriptor desc to X(khc2c_register) for planner p,
 * with the HC2C_VIA_DFT flag.
 * NOTE(review): X() is a macro defined elsewhere, presumably a symbol-name
 * prefixing macro -- confirm.
 */
Chris@42 294 void X(codelet_hc2cfdft_10) (planner *p) {
Chris@42 295 X(khc2c_register) (p, hc2cfdft_10, &desc, HC2C_VIA_DFT);
Chris@42 296 }
Chris@42 297 #else /* HAVE_FMA */
Chris@42 298
Chris@42 299 /* Generated by: ../../../genfft/gen_hc2cdft.native -compact -variables 4 -pipeline-latency 4 -n 10 -dit -name hc2cfdft_10 -include hc2cf.h */
Chris@42 300
Chris@42 301 /*
Chris@42 302 * This function contains 122 FP additions, 68 FP multiplications,
Chris@42 303 * (or, 92 additions, 38 multiplications, 30 fused multiply/add),
Chris@42 304 * 62 stack variables, 5 constants, and 40 memory accesses
Chris@42 305 */
Chris@42 306 #include "hc2cf.h"
Chris@42 307
/*
 * hc2cfdft_10 -- variant compiled when HAVE_FMA is not defined.
 *
 * Same interface and memory-access pattern as the HAVE_FMA variant: for
 * every m in [mb, me) it loads five elements (indices WS(rs, 0) through
 * WS(rs, 4)) from each of Rp, Ip, Rm and Im plus the coefficients
 * W[0] .. W[17], and stores twenty results back into the same four arrays.
 * In source order, every load of an iteration comes before its first store.
 *
 *   Rp, Ip  read and written; stepped by +ms after each iteration.
 *   Rm, Im  read and written; stepped by -ms after each iteration.
 *   W       read only; offset by (mb - 1) * 18 on entry, then stepped by 18
 *           per iteration.
 *   rs      stride passed to WS() to address the five elements per array.
 *   mb, me  half-open iteration range.
 *   ms      pointer step applied per iteration.
 *
 * NOTE(review): R, E, INT, stride, DK, WS, FMA, FNMS and MAKE_VOLATILE_STRIDE
 * are defined in headers not visible here.  FMA(a, b, c) is presumably
 * a * b + c and FNMS(a, b, c) presumably c - a * b -- confirm.
 */
Chris@42 308 static void hc2cfdft_10(R *Rp, R *Ip, R *Rm, R *Im, const R *W, stride rs, INT mb, INT me, INT ms)
Chris@42 309 {
/* Numerically: KP293892626 = sin(pi/5)/2, KP475528258 = sin(2*pi/5)/2,
 * KP279508497 = sqrt(5)/8. */
Chris@42 310 DK(KP293892626, +0.293892626146236564584352977319536384298826219);
Chris@42 311 DK(KP475528258, +0.475528258147576786058219666689691071702849317);
Chris@42 312 DK(KP125000000, +0.125000000000000000000000000000000000000000000);
Chris@42 313 DK(KP500000000, +0.500000000000000000000000000000000000000000000);
Chris@42 314 DK(KP279508497, +0.279508497187473712051146708591409529430077295);
Chris@42 315 {
Chris@42 316 INT m;
/* One pass per m; the pointer updates and MAKE_VOLATILE_STRIDE all live in
 * the increment expression of the for statement. */
Chris@42 317 for (m = mb, W = W + ((mb - 1) * 18); m < me; m = m + 1, Rp = Rp + ms, Ip = Ip + ms, Rm = Rm - ms, Im = Im - ms, W = W + 18, MAKE_VOLATILE_STRIDE(40, rs)) {
/* Intermediates shared between the load block and the four store blocks. */
Chris@42 318 E Tw, TL, TM, T1W, T1X, T27, T1Z, T20, T26, TX, T1a, T1b, T1d, T1e, T1f;
Chris@42 319 E T1q, T1t, T1u, T1x, T1A, T1B, T1g, T1h, T1i, Td, T25, T1k, T1F;
/* Load phase: each sub-block reads the Ip/Im/Rp/Rm elements at one index,
 * forms their sums and differences, and combines them with a group of W
 * coefficients. */
Chris@42 320 {
Chris@42 321 E T3, T1D, T19, T1z, T7, Tb, TR, T1v, Tm, T1o, TK, T1s, Tv, T1p, T12;
Chris@42 322 E T1y, TF, T1r, TW, T1w;
/* Index 0, weighted by W[0], W[1]. */
Chris@42 323 {
Chris@42 324 E T1, T2, T18, T14, T15, T16, T13, T17;
Chris@42 325 T1 = Ip[0];
Chris@42 326 T2 = Im[0];
Chris@42 327 T18 = T1 + T2;
Chris@42 328 T14 = Rm[0];
Chris@42 329 T15 = Rp[0];
Chris@42 330 T16 = T14 - T15;
Chris@42 331 T3 = T1 - T2;
Chris@42 332 T1D = T15 + T14;
Chris@42 333 T13 = W[0];
Chris@42 334 T17 = W[1];
Chris@42 335 T19 = FNMS(T17, T18, T13 * T16);
Chris@42 336 T1z = FMA(T17, T16, T13 * T18);
Chris@42 337 }
/* Index 2, weighted by W[6], W[7]; T7 and Tb are kept for the W[8], W[9]
 * products at the end of the load phase. */
Chris@42 338 {
Chris@42 339 E T5, T6, TO, T9, Ta, TQ, TN, TP;
Chris@42 340 T5 = Ip[WS(rs, 2)];
Chris@42 341 T6 = Im[WS(rs, 2)];
Chris@42 342 TO = T5 - T6;
Chris@42 343 T9 = Rp[WS(rs, 2)];
Chris@42 344 Ta = Rm[WS(rs, 2)];
Chris@42 345 TQ = T9 + Ta;
Chris@42 346 T7 = T5 + T6;
Chris@42 347 Tb = T9 - Ta;
Chris@42 348 TN = W[6];
Chris@42 349 TP = W[7];
Chris@42 350 TR = FNMS(TP, TQ, TN * TO);
Chris@42 351 T1v = FMA(TP, TO, TN * TQ);
Chris@42 352 }
/* Index 1, weighted by W[2] .. W[5]. */
Chris@42 353 {
Chris@42 354 E Th, TJ, Tl, TH;
Chris@42 355 {
Chris@42 356 E Tf, Tg, Tj, Tk;
Chris@42 357 Tf = Ip[WS(rs, 1)];
Chris@42 358 Tg = Im[WS(rs, 1)];
Chris@42 359 Th = Tf - Tg;
Chris@42 360 TJ = Tf + Tg;
Chris@42 361 Tj = Rp[WS(rs, 1)];
Chris@42 362 Tk = Rm[WS(rs, 1)];
Chris@42 363 Tl = Tj + Tk;
Chris@42 364 TH = Tj - Tk;
Chris@42 365 }
Chris@42 366 {
Chris@42 367 E Te, Ti, TG, TI;
Chris@42 368 Te = W[2];
Chris@42 369 Ti = W[3];
Chris@42 370 Tm = FNMS(Ti, Tl, Te * Th);
Chris@42 371 T1o = FMA(Te, Tl, Ti * Th);
Chris@42 372 TG = W[4];
Chris@42 373 TI = W[5];
Chris@42 374 TK = FMA(TG, TH, TI * TJ);
Chris@42 375 T1s = FNMS(TI, TH, TG * TJ);
Chris@42 376 }
Chris@42 377 }
/* Index 3, weighted by W[10] .. W[13]. */
Chris@42 378 {
Chris@42 379 E Tq, TZ, Tu, T11;
Chris@42 380 {
Chris@42 381 E To, Tp, Ts, Tt;
Chris@42 382 To = Ip[WS(rs, 3)];
Chris@42 383 Tp = Im[WS(rs, 3)];
Chris@42 384 Tq = To + Tp;
Chris@42 385 TZ = To - Tp;
Chris@42 386 Ts = Rp[WS(rs, 3)];
Chris@42 387 Tt = Rm[WS(rs, 3)];
Chris@42 388 Tu = Ts - Tt;
Chris@42 389 T11 = Ts + Tt;
Chris@42 390 }
Chris@42 391 {
Chris@42 392 E Tn, Tr, TY, T10;
Chris@42 393 Tn = W[13];
Chris@42 394 Tr = W[12];
Chris@42 395 Tv = FMA(Tn, Tq, Tr * Tu);
Chris@42 396 T1p = FNMS(Tn, Tu, Tr * Tq);
Chris@42 397 TY = W[10];
Chris@42 398 T10 = W[11];
Chris@42 399 T12 = FNMS(T10, T11, TY * TZ);
Chris@42 400 T1y = FMA(T10, TZ, TY * T11);
Chris@42 401 }
Chris@42 402 }
/* Index 4, weighted by W[14] .. W[17]. */
Chris@42 403 {
Chris@42 404 E TA, TV, TE, TT;
Chris@42 405 {
Chris@42 406 E Ty, Tz, TC, TD;
Chris@42 407 Ty = Ip[WS(rs, 4)];
Chris@42 408 Tz = Im[WS(rs, 4)];
Chris@42 409 TA = Ty - Tz;
Chris@42 410 TV = Ty + Tz;
Chris@42 411 TC = Rp[WS(rs, 4)];
Chris@42 412 TD = Rm[WS(rs, 4)];
Chris@42 413 TE = TC + TD;
Chris@42 414 TT = TC - TD;
Chris@42 415 }
Chris@42 416 {
Chris@42 417 E Tx, TB, TS, TU;
Chris@42 418 Tx = W[14];
Chris@42 419 TB = W[15];
Chris@42 420 TF = FNMS(TB, TE, Tx * TA);
Chris@42 421 T1r = FMA(Tx, TE, TB * TA);
Chris@42 422 TS = W[16];
Chris@42 423 TU = W[17];
Chris@42 424 TW = FMA(TS, TT, TU * TV);
Chris@42 425 T1w = FNMS(TU, TT, TS * TV);
Chris@42 426 }
Chris@42 427 }
/* First layer of sums and differences over the weighted terms. */
Chris@42 428 Tw = Tm - Tv;
Chris@42 429 TL = TF - TK;
Chris@42 430 TM = Tw + TL;
Chris@42 431 T1W = T1v + T1w;
Chris@42 432 T1X = T1y + T1z;
Chris@42 433 T27 = T1W + T1X;
Chris@42 434 T1Z = T1o + T1p;
Chris@42 435 T20 = T1s + T1r;
Chris@42 436 T26 = T1Z + T20;
Chris@42 437 TX = TR - TW;
Chris@42 438 T1a = T12 + T19;
Chris@42 439 T1b = TX + T1a;
Chris@42 440 T1d = T19 - T12;
Chris@42 441 T1e = TR + TW;
Chris@42 442 T1f = T1d - T1e;
Chris@42 443 T1q = T1o - T1p;
Chris@42 444 T1t = T1r - T1s;
Chris@42 445 T1u = T1q + T1t;
Chris@42 446 T1x = T1v - T1w;
Chris@42 447 T1A = T1y - T1z;
Chris@42 448 T1B = T1x + T1A;
Chris@42 449 T1g = Tm + Tv;
Chris@42 450 T1h = TK + TF;
Chris@42 451 T1i = T1g + T1h;
/* W[8], W[9] applied to the index-2 terms T7/Tb, then combined with the
 * index-0 terms T3 and T1D. */
Chris@42 452 {
Chris@42 453 E Tc, T1E, T4, T8;
Chris@42 454 T4 = W[9];
Chris@42 455 T8 = W[8];
Chris@42 456 Tc = FMA(T4, T7, T8 * Tb);
Chris@42 457 T1E = FNMS(T4, Tb, T8 * T7);
Chris@42 458 Td = T3 - Tc;
Chris@42 459 T25 = T1D + T1E;
Chris@42 460 T1k = Tc + T3;
Chris@42 461 T1F = T1D - T1E;
Chris@42 462 }
Chris@42 463 }
/* Store group 1: Ip[0], Ip[WS(rs, 2)], Ip[WS(rs, 4)], Im[WS(rs, 1)],
 * Im[WS(rs, 3)]. */
Chris@42 464 {
Chris@42 465 E T1U, T1c, T1T, T22, T24, T1Y, T21, T23, T1V;
Chris@42 466 T1U = KP279508497 * (TM - T1b);
Chris@42 467 T1c = TM + T1b;
Chris@42 468 T1T = FNMS(KP125000000, T1c, KP500000000 * Td);
Chris@42 469 T1Y = T1W - T1X;
Chris@42 470 T21 = T1Z - T20;
Chris@42 471 T22 = FNMS(KP293892626, T21, KP475528258 * T1Y);
Chris@42 472 T24 = FMA(KP475528258, T21, KP293892626 * T1Y);
Chris@42 473 Ip[0] = KP500000000 * (Td + T1c);
Chris@42 474 T23 = T1U + T1T;
Chris@42 475 Ip[WS(rs, 4)] = T23 + T24;
Chris@42 476 Im[WS(rs, 3)] = T24 - T23;
Chris@42 477 T1V = T1T - T1U;
Chris@42 478 Ip[WS(rs, 2)] = T1V + T22;
Chris@42 479 Im[WS(rs, 1)] = T22 - T1V;
Chris@42 480 }
/* Store group 2: Rp[0], Rp[WS(rs, 2)], Rp[WS(rs, 4)], Rm[WS(rs, 1)],
 * Rm[WS(rs, 3)]. */
Chris@42 481 {
Chris@42 482 E T2a, T28, T29, T2e, T2g, T2c, T2d, T2f, T2b;
Chris@42 483 T2a = KP279508497 * (T26 - T27);
Chris@42 484 T28 = T26 + T27;
Chris@42 485 T29 = FNMS(KP125000000, T28, KP500000000 * T25);
Chris@42 486 T2c = TX - T1a;
Chris@42 487 T2d = Tw - TL;
Chris@42 488 T2e = FNMS(KP293892626, T2d, KP475528258 * T2c);
Chris@42 489 T2g = FMA(KP475528258, T2d, KP293892626 * T2c);
Chris@42 490 Rp[0] = KP500000000 * (T25 + T28);
Chris@42 491 T2f = T2a + T29;
Chris@42 492 Rp[WS(rs, 4)] = T2f - T2g;
Chris@42 493 Rm[WS(rs, 3)] = T2g + T2f;
Chris@42 494 T2b = T29 - T2a;
Chris@42 495 Rp[WS(rs, 2)] = T2b - T2e;
Chris@42 496 Rm[WS(rs, 1)] = T2e + T2b;
Chris@42 497 }
/* Store group 3: Im[0], Im[WS(rs, 2)], Im[WS(rs, 4)], Ip[WS(rs, 1)],
 * Ip[WS(rs, 3)]. */
Chris@42 498 {
Chris@42 499 E T1M, T1j, T1L, T1Q, T1S, T1O, T1P, T1R, T1N;
Chris@42 500 T1M = KP279508497 * (T1i + T1f);
Chris@42 501 T1j = T1f - T1i;
Chris@42 502 T1L = FMA(KP500000000, T1k, KP125000000 * T1j);
Chris@42 503 T1O = T1A - T1x;
Chris@42 504 T1P = T1q - T1t;
Chris@42 505 T1Q = FNMS(KP475528258, T1P, KP293892626 * T1O);
Chris@42 506 T1S = FMA(KP293892626, T1P, KP475528258 * T1O);
Chris@42 507 Im[WS(rs, 4)] = KP500000000 * (T1j - T1k);
Chris@42 508 T1R = T1L - T1M;
Chris@42 509 Ip[WS(rs, 3)] = T1R + T1S;
Chris@42 510 Im[WS(rs, 2)] = T1S - T1R;
Chris@42 511 T1N = T1L + T1M;
Chris@42 512 Ip[WS(rs, 1)] = T1N + T1Q;
Chris@42 513 Im[0] = T1Q - T1N;
Chris@42 514 }
/* Store group 4: Rm[0], Rm[WS(rs, 2)], Rm[WS(rs, 4)], Rp[WS(rs, 1)],
 * Rp[WS(rs, 3)]. */
Chris@42 515 {
Chris@42 516 E T1C, T1G, T1H, T1n, T1J, T1l, T1m, T1K, T1I;
Chris@42 517 T1C = KP279508497 * (T1u - T1B);
Chris@42 518 T1G = T1u + T1B;
Chris@42 519 T1H = FNMS(KP125000000, T1G, KP500000000 * T1F);
Chris@42 520 T1l = T1g - T1h;
Chris@42 521 T1m = T1e + T1d;
Chris@42 522 T1n = FMA(KP475528258, T1l, KP293892626 * T1m);
Chris@42 523 T1J = FNMS(KP293892626, T1l, KP475528258 * T1m);
Chris@42 524 Rm[WS(rs, 4)] = KP500000000 * (T1F + T1G);
Chris@42 525 T1K = T1H - T1C;
Chris@42 526 Rp[WS(rs, 3)] = T1J + T1K;
Chris@42 527 Rm[WS(rs, 2)] = T1K - T1J;
Chris@42 528 T1I = T1C + T1H;
Chris@42 529 Rp[WS(rs, 1)] = T1n + T1I;
Chris@42 530 Rm[0] = T1I - T1n;
Chris@42 531 }
Chris@42 532 }
Chris@42 533 }
Chris@42 534 }
Chris@42 535
/*
 * Two-entry tw_instr table referenced by desc below (non-FMA branch).
 * NOTE(review): tw_instr, TW_FULL and TW_NEXT are declared elsewhere; this
 * presumably requests the full set of twiddle factors for radix 10, matching
 * the 18 W values hc2cfdft_10 consumes per iteration -- confirm.
 */
Chris@42 536 static const tw_instr twinstr[] = {
Chris@42 537 {TW_FULL, 1, 10},
Chris@42 538 {TW_NEXT, 1, 0}
Chris@42 539 };
Chris@42 540
/*
 * Codelet descriptor handed to the planner by the registration function
 * below.  The counts 92, 38 and 30 equal the addition, multiplication and
 * fused multiply/add figures quoted in the generated comment above the
 * non-FMA kernel.
 * NOTE(review): the hc2c_desc field layout and GENUS come from headers not
 * shown here; the leading 10 is presumably the radix -- confirm.
 */
Chris@42 541 static const hc2c_desc desc = { 10, "hc2cfdft_10", twinstr, &GENUS, {92, 38, 30, 0} };
Chris@42 542
/*
 * Registration entry point (non-FMA branch): passes the static kernel
 * hc2cfdft_10 and its descriptor desc to X(khc2c_register) for planner p,
 * with the HC2C_VIA_DFT flag.
 * NOTE(review): X() is a macro defined elsewhere, presumably a symbol-name
 * prefixing macro -- confirm.
 */
Chris@42 543 void X(codelet_hc2cfdft_10) (planner *p) {
Chris@42 544 X(khc2c_register) (p, hc2cfdft_10, &desc, HC2C_VIA_DFT);
Chris@42 545 }
Chris@42 546 #endif /* HAVE_FMA */