annotate src/fftw-3.3.5/dft/scalar/codelets/t1_10.c @ 83:ae30d91d2ffe

Replace these with versions built using an older toolset (so as to avoid ABI incompatibilities when linking on Ubuntu 14.04 for packaging purposes)
author Chris Cannam
date Fri, 07 Feb 2020 11:51:13 +0000
parents 2cd0e3b3e1fd
children
rev   line source
Chris@42 1 /*
Chris@42 2 * Copyright (c) 2003, 2007-14 Matteo Frigo
Chris@42 3 * Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
Chris@42 4 *
Chris@42 5 * This program is free software; you can redistribute it and/or modify
Chris@42 6 * it under the terms of the GNU General Public License as published by
Chris@42 7 * the Free Software Foundation; either version 2 of the License, or
Chris@42 8 * (at your option) any later version.
Chris@42 9 *
Chris@42 10 * This program is distributed in the hope that it will be useful,
Chris@42 11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
Chris@42 12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
Chris@42 13 * GNU General Public License for more details.
Chris@42 14 *
Chris@42 15 * You should have received a copy of the GNU General Public License
Chris@42 16 * along with this program; if not, write to the Free Software
Chris@42 17 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
Chris@42 18 *
Chris@42 19 */
Chris@42 20
Chris@42 21 /* This file was automatically generated --- DO NOT EDIT */
Chris@42 22 /* Generated on Sat Jul 30 16:36:13 EDT 2016 */
Chris@42 23
Chris@42 24 #include "codelet-dft.h"
Chris@42 25
Chris@42 26 #ifdef HAVE_FMA
Chris@42 27
Chris@42 28 /* Generated by: ../../../genfft/gen_twiddle.native -fma -reorder-insns -schedule-for-pipeline -compact -variables 4 -pipeline-latency 4 -n 10 -name t1_10 -include t.h */
Chris@42 29
Chris@42 30 /*
Chris@42 31 * This function contains 102 FP additions, 72 FP multiplications,
Chris@42 32 * (or, 48 additions, 18 multiplications, 54 fused multiply/add),
Chris@42 33 * 70 stack variables, 4 constants, and 40 memory accesses
Chris@42 34 */
Chris@42 35 #include "t.h"
Chris@42 36
/*
 * t1_10, HAVE_FMA variant.  According to the "Generated by" comment in this
 * file it was emitted by genfft's gen_twiddle with -n 10 -name t1_10.
 *
 * For each m in [mb, me) one pass of the loop body:
 *   - loads ri[0], ii[0] and ri/ii[WS(rs, k)] for k = 1..9;
 *   - combines input k (k = 1..9) with the coefficient pair W[2k-2], W[2k-1];
 *   - stores ten results back into the same ri/ii slots (in place).  All
 *     loads of an iteration happen before its first store.
 * Before the first pass W is advanced by mb * 18; after every pass ri and ii
 * advance by ms and W advances by 18.
 *
 * Parameters: ri/ii are read and written; W is only read; rs is handed to
 * WS() to form the element offsets; mb/me bound the loop; ms is the per-pass
 * pointer increment.  Nothing is returned.
 *
 * NOTE(review): R, E, INT, stride, DK, WS, FMA, FNMS and MAKE_VOLATILE_STRIDE
 * are declared in FFTW headers that are not part of this listing.  The
 * comments here assume FMA(a, b, c) = a * b + c and FNMS(a, b, c) = c - a * b;
 * confirm against those headers.
 */
Chris@42 37 static void t1_10(R *ri, R *ii, const R *W, stride rs, INT mb, INT me, INT ms)
Chris@42 38 {
/* Numerically: KP951056516 ~= sin(2*pi/5), KP559016994 ~= sqrt(5)/4,
 * KP618033988 ~= (sqrt(5) - 1)/2, KP250000000 = 1/4. */
Chris@42 39 DK(KP951056516, +0.951056516295153572116439333379382143405698634);
Chris@42 40 DK(KP559016994, +0.559016994374947424102293417182819058860154590);
Chris@42 41 DK(KP250000000, +0.250000000000000000000000000000000000000000000);
Chris@42 42 DK(KP618033988, +0.618033988749894848204586834365638117720309180);
Chris@42 43 {
Chris@42 44 INT m;
/* One 10-point pass per iteration; 18 = 9 coefficient pairs consumed per pass. */
Chris@42 45 for (m = mb, W = W + (mb * 18); m < me; m = m + 1, ri = ri + ms, ii = ii + ms, W = W + 18, MAKE_VOLATILE_STRIDE(20, rs)) {
/* These four are set inside the nested block below and used by the final
 * ii stores (slots 6, 4, 8, 2) at the end of the loop body. */
Chris@42 46 E T1X, T21, T20, T22;
Chris@42 47 {
Chris@42 48 E T23, T1U, T8, T12, T1y, T25, T1P, T1H, T1Y, T18, T10, T2b, T1K, T1O, T15;
Chris@42 49 E T1Z, T2a, Tz, T24, T1n;
/* Load stage: input 0 is used as-is; input 5 is combined with W[8], W[9]. */
Chris@42 50 {
Chris@42 51 E T1, T1T, T3, T6, T2, T5;
Chris@42 52 T1 = ri[0];
Chris@42 53 T1T = ii[0];
Chris@42 54 T3 = ri[WS(rs, 5)];
Chris@42 55 T6 = ii[WS(rs, 5)];
Chris@42 56 T2 = W[8];
Chris@42 57 T5 = W[9];
Chris@42 58 {
Chris@42 59 E T1w, TY, T1s, T1F, TM, T16, T1u, TS;
Chris@42 60 {
Chris@42 61 E TF, T1p, TO, TR, T1r, TL, TN, TQ, T1t, TP;
Chris@42 62 {
Chris@42 63 E TU, TX, TT, TW;
/* Input 4 with W[6], W[7]; also finishes the input-5 product and forms its
 * sum/difference with input 0 (T12/T8 real, T1U/T23 imaginary). */
Chris@42 64 {
Chris@42 65 E TB, TE, T1R, T4, TA, TD;
Chris@42 66 TB = ri[WS(rs, 4)];
Chris@42 67 TE = ii[WS(rs, 4)];
Chris@42 68 T1R = T2 * T6;
Chris@42 69 T4 = T2 * T3;
Chris@42 70 TA = W[6];
Chris@42 71 TD = W[7];
Chris@42 72 {
Chris@42 73 E T1S, T7, T1o, TC;
Chris@42 74 T1S = FNMS(T5, T3, T1R);
Chris@42 75 T7 = FMA(T5, T6, T4);
Chris@42 76 T1o = TA * TE;
Chris@42 77 TC = TA * TB;
Chris@42 78 T23 = T1T - T1S;
Chris@42 79 T1U = T1S + T1T;
Chris@42 80 T8 = T1 - T7;
Chris@42 81 T12 = T1 + T7;
Chris@42 82 TF = FMA(TD, TE, TC);
Chris@42 83 T1p = FNMS(TD, TB, T1o);
Chris@42 84 }
Chris@42 85 }
/* Input 1 with W[0], W[1]; input 9 with W[16], W[17]; loads for input 6. */
Chris@42 86 TU = ri[WS(rs, 1)];
Chris@42 87 TX = ii[WS(rs, 1)];
Chris@42 88 TT = W[0];
Chris@42 89 TW = W[1];
Chris@42 90 {
Chris@42 91 E TH, TK, TJ, T1q, TI, T1v, TV, TG;
Chris@42 92 TH = ri[WS(rs, 9)];
Chris@42 93 TK = ii[WS(rs, 9)];
Chris@42 94 T1v = TT * TX;
Chris@42 95 TV = TT * TU;
Chris@42 96 TG = W[16];
Chris@42 97 TJ = W[17];
Chris@42 98 T1w = FNMS(TW, TU, T1v);
Chris@42 99 TY = FMA(TW, TX, TV);
Chris@42 100 T1q = TG * TK;
Chris@42 101 TI = TG * TH;
Chris@42 102 TO = ri[WS(rs, 6)];
Chris@42 103 TR = ii[WS(rs, 6)];
Chris@42 104 T1r = FNMS(TJ, TH, T1q);
Chris@42 105 TL = FMA(TJ, TK, TI);
Chris@42 106 TN = W[10];
Chris@42 107 TQ = W[11];
Chris@42 108 }
Chris@42 109 }
/* Sums/differences of inputs 4 and 9; input 6 combined with W[10], W[11]. */
Chris@42 110 T1s = T1p - T1r;
Chris@42 111 T1F = T1p + T1r;
Chris@42 112 TM = TF - TL;
Chris@42 113 T16 = TF + TL;
Chris@42 114 T1t = TN * TR;
Chris@42 115 TP = TN * TO;
Chris@42 116 T1u = FNMS(TQ, TO, T1t);
Chris@42 117 TS = FMA(TQ, TR, TP);
Chris@42 118 }
Chris@42 119 {
Chris@42 120 E T1e, Te, T1l, Tx, Tn, Tq, Tp, T1g, Tk, T1i, To;
Chris@42 121 {
Chris@42 122 E Tt, Tw, Tv, T1k, Tu;
Chris@42 123 {
Chris@42 124 E Ta, Td, T9, Tc, T1d, Tb, Ts;
Chris@42 125 Ta = ri[WS(rs, 2)];
Chris@42 126 Td = ii[WS(rs, 2)];
/* Sums/differences of inputs 6 and 1, then merged with the 4/9 terms. */
Chris@42 127 {
Chris@42 128 E T1G, T1x, TZ, T17;
Chris@42 129 T1G = T1u + T1w;
Chris@42 130 T1x = T1u - T1w;
Chris@42 131 TZ = TS - TY;
Chris@42 132 T17 = TS + TY;
Chris@42 133 T1y = T1s - T1x;
Chris@42 134 T25 = T1s + T1x;
Chris@42 135 T1P = T1F + T1G;
Chris@42 136 T1H = T1F - T1G;
Chris@42 137 T1Y = T16 - T17;
Chris@42 138 T18 = T16 + T17;
Chris@42 139 T10 = TM + TZ;
Chris@42 140 T2b = TM - TZ;
Chris@42 141 T9 = W[2];
Chris@42 142 }
/* Input 2 with W[2], W[3]; input 3 with W[4], W[5]. */
Chris@42 143 Tc = W[3];
Chris@42 144 Tt = ri[WS(rs, 3)];
Chris@42 145 Tw = ii[WS(rs, 3)];
Chris@42 146 T1d = T9 * Td;
Chris@42 147 Tb = T9 * Ta;
Chris@42 148 Ts = W[4];
Chris@42 149 Tv = W[5];
Chris@42 150 T1e = FNMS(Tc, Ta, T1d);
Chris@42 151 Te = FMA(Tc, Td, Tb);
Chris@42 152 T1k = Ts * Tw;
Chris@42 153 Tu = Ts * Tt;
Chris@42 154 }
/* Input 7 with W[12], W[13]; input 8 with W[14], W[15]. */
Chris@42 155 {
Chris@42 156 E Tg, Tj, Tf, Ti, T1f, Th, Tm;
Chris@42 157 Tg = ri[WS(rs, 7)];
Chris@42 158 Tj = ii[WS(rs, 7)];
Chris@42 159 T1l = FNMS(Tv, Tt, T1k);
Chris@42 160 Tx = FMA(Tv, Tw, Tu);
Chris@42 161 Tf = W[12];
Chris@42 162 Ti = W[13];
Chris@42 163 Tn = ri[WS(rs, 8)];
Chris@42 164 Tq = ii[WS(rs, 8)];
Chris@42 165 T1f = Tf * Tj;
Chris@42 166 Th = Tf * Tg;
Chris@42 167 Tm = W[14];
Chris@42 168 Tp = W[15];
Chris@42 169 T1g = FNMS(Ti, Tg, T1f);
Chris@42 170 Tk = FMA(Ti, Tj, Th);
Chris@42 171 T1i = Tm * Tq;
Chris@42 172 To = Tm * Tn;
Chris@42 173 }
Chris@42 174 }
/* Sums/differences of inputs 2 and 7, and of 8 and 3, then merged. */
Chris@42 175 {
Chris@42 176 E T1h, T1I, Tl, T13, T1j, Tr;
Chris@42 177 T1h = T1e - T1g;
Chris@42 178 T1I = T1e + T1g;
Chris@42 179 Tl = Te - Tk;
Chris@42 180 T13 = Te + Tk;
Chris@42 181 T1j = FNMS(Tp, Tn, T1i);
Chris@42 182 Tr = FMA(Tp, Tq, To);
Chris@42 183 {
Chris@42 184 E T1m, T1J, T14, Ty;
Chris@42 185 T1m = T1j - T1l;
Chris@42 186 T1J = T1j + T1l;
Chris@42 187 T14 = Tr + Tx;
Chris@42 188 Ty = Tr - Tx;
Chris@42 189 T1K = T1I - T1J;
Chris@42 190 T1O = T1I + T1J;
Chris@42 191 T15 = T13 + T14;
Chris@42 192 T1Z = T13 - T14;
Chris@42 193 T2a = Tl - Ty;
Chris@42 194 Tz = Tl + Ty;
Chris@42 195 T24 = T1h + T1m;
Chris@42 196 T1n = T1h - T1m;
Chris@42 197 }
Chris@42 198 }
Chris@42 199 }
Chris@42 200 }
Chris@42 201 }
/* Store stage: every load for this pass is done; from here on the
 * intermediates are combined and written to the ten output slots. */
Chris@42 202 {
Chris@42 203 E T2c, T2e, T29, T2d;
Chris@42 204 {
Chris@42 205 E T1b, T11, T26, T28, T27;
Chris@42 206 T1b = Tz - T10;
Chris@42 207 T11 = Tz + T10;
Chris@42 208 T26 = T24 + T25;
Chris@42 209 T28 = T24 - T25;
/* Real parts of outputs 5, 1, 9, 3, 7. */
Chris@42 210 {
Chris@42 211 E T1B, T1z, T1a, T1A, T1c;
Chris@42 212 T1B = FNMS(KP618033988, T1n, T1y);
Chris@42 213 T1z = FMA(KP618033988, T1y, T1n);
Chris@42 214 ri[WS(rs, 5)] = T8 + T11;
Chris@42 215 T1a = FNMS(KP250000000, T11, T8);
Chris@42 216 T1A = FNMS(KP559016994, T1b, T1a);
Chris@42 217 T1c = FMA(KP559016994, T1b, T1a);
Chris@42 218 T27 = FNMS(KP250000000, T26, T23);
Chris@42 219 T2c = FMA(KP618033988, T2b, T2a);
Chris@42 220 T2e = FNMS(KP618033988, T2a, T2b);
Chris@42 221 ri[WS(rs, 1)] = FMA(KP951056516, T1z, T1c);
Chris@42 222 ri[WS(rs, 9)] = FNMS(KP951056516, T1z, T1c);
Chris@42 223 ri[WS(rs, 3)] = FMA(KP951056516, T1B, T1A);
Chris@42 224 ri[WS(rs, 7)] = FNMS(KP951056516, T1B, T1A);
Chris@42 225 }
Chris@42 226 ii[WS(rs, 5)] = T26 + T23;
Chris@42 227 T29 = FMA(KP559016994, T28, T27);
Chris@42 228 T2d = FNMS(KP559016994, T28, T27);
Chris@42 229 }
/* Imaginary parts of outputs 7, 3, 9, 1; then real parts of outputs
 * 0, 6, 4, 8, 2 and the imaginary part of output 0. */
Chris@42 230 {
Chris@42 231 E T1E, T1M, T1L, T1N, T19, T1D, T1C, T1Q, T1W, T1V;
Chris@42 232 T19 = T15 + T18;
Chris@42 233 T1D = T15 - T18;
Chris@42 234 ii[WS(rs, 7)] = FMA(KP951056516, T2e, T2d);
Chris@42 235 ii[WS(rs, 3)] = FNMS(KP951056516, T2e, T2d);
Chris@42 236 ii[WS(rs, 9)] = FMA(KP951056516, T2c, T29);
Chris@42 237 ii[WS(rs, 1)] = FNMS(KP951056516, T2c, T29);
Chris@42 238 T1C = FNMS(KP250000000, T19, T12);
Chris@42 239 ri[0] = T12 + T19;
Chris@42 240 T1E = FNMS(KP559016994, T1D, T1C);
Chris@42 241 T1M = FMA(KP559016994, T1D, T1C);
Chris@42 242 T1L = FNMS(KP618033988, T1K, T1H);
Chris@42 243 T1N = FMA(KP618033988, T1H, T1K);
Chris@42 244 T1Q = T1O + T1P;
Chris@42 245 T1W = T1O - T1P;
Chris@42 246 ri[WS(rs, 6)] = FMA(KP951056516, T1N, T1M);
Chris@42 247 ri[WS(rs, 4)] = FNMS(KP951056516, T1N, T1M);
Chris@42 248 ri[WS(rs, 8)] = FMA(KP951056516, T1L, T1E);
Chris@42 249 ri[WS(rs, 2)] = FNMS(KP951056516, T1L, T1E);
Chris@42 250 T1V = FNMS(KP250000000, T1Q, T1U);
Chris@42 251 ii[0] = T1Q + T1U;
Chris@42 252 T1X = FNMS(KP559016994, T1W, T1V);
Chris@42 253 T21 = FMA(KP559016994, T1W, T1V);
Chris@42 254 T20 = FNMS(KP618033988, T1Z, T1Y);
Chris@42 255 T22 = FMA(KP618033988, T1Y, T1Z);
Chris@42 256 }
Chris@42 257 }
Chris@42 258 }
/* Imaginary parts of outputs 6, 4, 8, 2. */
Chris@42 259 ii[WS(rs, 6)] = FNMS(KP951056516, T22, T21);
Chris@42 260 ii[WS(rs, 4)] = FMA(KP951056516, T22, T21);
Chris@42 261 ii[WS(rs, 8)] = FNMS(KP951056516, T20, T1X);
Chris@42 262 ii[WS(rs, 2)] = FMA(KP951056516, T20, T1X);
Chris@42 263 }
Chris@42 264 }
Chris@42 265 }
Chris@42 266
/*
 * Two-entry tw_instr table referenced by `desc` in this branch.
 * NOTE(review): tw_instr, TW_FULL and TW_NEXT are declared outside this
 * listing.  Presumably the first entry requests the full coefficient set for
 * size 10 (consistent with the 18 W values t1_10 reads per pass) and the
 * second entry terminates the table -- confirm against FFTW's twiddle code.
 */
Chris@42 267 static const tw_instr twinstr[] = {
Chris@42 268 {TW_FULL, 0, 10},
Chris@42 269 {TW_NEXT, 1, 0}
Chris@42 270 };
Chris@42 271
/*
 * Descriptor handed to the registration call for the HAVE_FMA variant.  It
 * carries the size 10, the name "t1_10", the twinstr table and &GENUS.  The
 * tuple {48, 18, 54, 0} matches the operation counts quoted in this variant's
 * header comment (48 additions, 18 multiplications, 54 fused multiply/add).
 * NOTE(review): ct_desc and GENUS are declared elsewhere; the meaning of the
 * three trailing zeros cannot be determined from this listing.
 */
Chris@42 272 static const ct_desc desc = { 10, "t1_10", twinstr, &GENUS, {48, 18, 54, 0}, 0, 0, 0 };
Chris@42 273
/*
 * Registration entry point (HAVE_FMA build): passes the planner `p`, the
 * t1_10 kernel and its descriptor to X(kdft_dit_register).
 * NOTE(review): X() is a macro defined outside this listing (presumably
 * FFTW's symbol-name mangling); the exported symbol name depends on it.
 */
Chris@42 274 void X(codelet_t1_10) (planner *p) {
Chris@42 275 X(kdft_dit_register) (p, t1_10, &desc);
Chris@42 276 }
Chris@42 277 #else /* HAVE_FMA */
Chris@42 278
Chris@42 279 /* Generated by: ../../../genfft/gen_twiddle.native -compact -variables 4 -pipeline-latency 4 -n 10 -name t1_10 -include t.h */
Chris@42 280
Chris@42 281 /*
Chris@42 282 * This function contains 102 FP additions, 60 FP multiplications,
Chris@42 283 * (or, 72 additions, 30 multiplications, 30 fused multiply/add),
Chris@42 284 * 45 stack variables, 4 constants, and 40 memory accesses
Chris@42 285 */
Chris@42 286 #include "t.h"
Chris@42 287
/*
 * t1_10, variant compiled when HAVE_FMA is not defined.  According to the
 * "Generated by" comment in this branch it was emitted by genfft's
 * gen_twiddle with -n 10 -name t1_10 (without the -fma option).
 *
 * For each m in [mb, me) one pass of the loop body:
 *   - loads ri[0], ii[0] and ri/ii[WS(rs, k)] for k = 1..9;
 *   - combines input k (k = 1..9) with the coefficient pair W[2k-2], W[2k-1];
 *   - stores ten results back into the same ri/ii slots (in place).  All
 *     loads of an iteration happen before its first store.
 * Before the first pass W is advanced by mb * 18; after every pass ri and ii
 * advance by ms and W advances by 18.
 *
 * Parameters: ri/ii are read and written; W is only read; rs is handed to
 * WS() to form the element offsets; mb/me bound the loop; ms is the per-pass
 * pointer increment.  Nothing is returned.
 *
 * NOTE(review): R, E, INT, stride, DK, WS, FMA, FNMS and MAKE_VOLATILE_STRIDE
 * are declared in FFTW headers that are not part of this listing.  The
 * comments here assume FMA(a, b, c) = a * b + c and FNMS(a, b, c) = c - a * b;
 * confirm against those headers.
 */
Chris@42 288 static void t1_10(R *ri, R *ii, const R *W, stride rs, INT mb, INT me, INT ms)
Chris@42 289 {
/* Numerically: KP587785252 ~= sin(pi/5), KP951056516 ~= sin(2*pi/5),
 * KP250000000 = 1/4, KP559016994 ~= sqrt(5)/4. */
Chris@42 290 DK(KP587785252, +0.587785252292473129168705954639072768597652438);
Chris@42 291 DK(KP951056516, +0.951056516295153572116439333379382143405698634);
Chris@42 292 DK(KP250000000, +0.250000000000000000000000000000000000000000000);
Chris@42 293 DK(KP559016994, +0.559016994374947424102293417182819058860154590);
Chris@42 294 {
Chris@42 295 INT m;
/* One 10-point pass per iteration; 18 = 9 coefficient pairs consumed per pass. */
Chris@42 296 for (m = mb, W = W + (mb * 18); m < me; m = m + 1, ri = ri + ms, ii = ii + ms, W = W + 18, MAKE_VOLATILE_STRIDE(20, rs)) {
Chris@42 297 E T7, T1O, TT, T1C, TF, TQ, TR, T1o, T1p, T1y, TX, TY, TZ, T1d, T1g;
Chris@42 298 E T1M, Ti, Tt, Tu, T1r, T1s, T1x, TU, TV, TW, T16, T19, T1L;
/* Input 0 as-is and input 5 combined with W[8], W[9]: T7/T1O hold the
 * differences (real/imaginary), TT/T1C the sums. */
Chris@42 299 {
Chris@42 300 E T1, T1B, T6, T1A;
Chris@42 301 T1 = ri[0];
Chris@42 302 T1B = ii[0];
Chris@42 303 {
Chris@42 304 E T3, T5, T2, T4;
Chris@42 305 T3 = ri[WS(rs, 5)];
Chris@42 306 T5 = ii[WS(rs, 5)];
Chris@42 307 T2 = W[8];
Chris@42 308 T4 = W[9];
Chris@42 309 T6 = FMA(T2, T3, T4 * T5);
Chris@42 310 T1A = FNMS(T4, T3, T2 * T5);
Chris@42 311 }
Chris@42 312 T7 = T1 - T6;
Chris@42 313 T1O = T1B - T1A;
Chris@42 314 TT = T1 + T6;
Chris@42 315 T1C = T1A + T1B;
Chris@42 316 }
/* Inputs 4, 1, 9 and 6, each combined with its W pair, followed by the
 * pairwise sums/differences (4 with 9, 6 with 1) and their combinations. */
Chris@42 317 {
Chris@42 318 E Tz, T1b, TP, T1f, TE, T1c, TK, T1e;
Chris@42 319 {
Chris@42 320 E Tw, Ty, Tv, Tx;
Chris@42 321 Tw = ri[WS(rs, 4)];
Chris@42 322 Ty = ii[WS(rs, 4)];
Chris@42 323 Tv = W[6];
Chris@42 324 Tx = W[7];
Chris@42 325 Tz = FMA(Tv, Tw, Tx * Ty);
Chris@42 326 T1b = FNMS(Tx, Tw, Tv * Ty);
Chris@42 327 }
Chris@42 328 {
Chris@42 329 E TM, TO, TL, TN;
Chris@42 330 TM = ri[WS(rs, 1)];
Chris@42 331 TO = ii[WS(rs, 1)];
Chris@42 332 TL = W[0];
Chris@42 333 TN = W[1];
Chris@42 334 TP = FMA(TL, TM, TN * TO);
Chris@42 335 T1f = FNMS(TN, TM, TL * TO);
Chris@42 336 }
Chris@42 337 {
Chris@42 338 E TB, TD, TA, TC;
Chris@42 339 TB = ri[WS(rs, 9)];
Chris@42 340 TD = ii[WS(rs, 9)];
Chris@42 341 TA = W[16];
Chris@42 342 TC = W[17];
Chris@42 343 TE = FMA(TA, TB, TC * TD);
Chris@42 344 T1c = FNMS(TC, TB, TA * TD);
Chris@42 345 }
Chris@42 346 {
Chris@42 347 E TH, TJ, TG, TI;
Chris@42 348 TH = ri[WS(rs, 6)];
Chris@42 349 TJ = ii[WS(rs, 6)];
Chris@42 350 TG = W[10];
Chris@42 351 TI = W[11];
Chris@42 352 TK = FMA(TG, TH, TI * TJ);
Chris@42 353 T1e = FNMS(TI, TH, TG * TJ);
Chris@42 354 }
Chris@42 355 TF = Tz - TE;
Chris@42 356 TQ = TK - TP;
Chris@42 357 TR = TF + TQ;
Chris@42 358 T1o = T1b + T1c;
Chris@42 359 T1p = T1e + T1f;
Chris@42 360 T1y = T1o + T1p;
Chris@42 361 TX = Tz + TE;
Chris@42 362 TY = TK + TP;
Chris@42 363 TZ = TX + TY;
Chris@42 364 T1d = T1b - T1c;
Chris@42 365 T1g = T1e - T1f;
Chris@42 366 T1M = T1d + T1g;
Chris@42 367 }
/* Inputs 2, 3, 7 and 8, each combined with its W pair, followed by the
 * pairwise sums/differences (2 with 7, 8 with 3) and their combinations. */
Chris@42 368 {
Chris@42 369 E Tc, T14, Ts, T18, Th, T15, Tn, T17;
Chris@42 370 {
Chris@42 371 E T9, Tb, T8, Ta;
Chris@42 372 T9 = ri[WS(rs, 2)];
Chris@42 373 Tb = ii[WS(rs, 2)];
Chris@42 374 T8 = W[2];
Chris@42 375 Ta = W[3];
Chris@42 376 Tc = FMA(T8, T9, Ta * Tb);
Chris@42 377 T14 = FNMS(Ta, T9, T8 * Tb);
Chris@42 378 }
Chris@42 379 {
Chris@42 380 E Tp, Tr, To, Tq;
Chris@42 381 Tp = ri[WS(rs, 3)];
Chris@42 382 Tr = ii[WS(rs, 3)];
Chris@42 383 To = W[4];
Chris@42 384 Tq = W[5];
Chris@42 385 Ts = FMA(To, Tp, Tq * Tr);
Chris@42 386 T18 = FNMS(Tq, Tp, To * Tr);
Chris@42 387 }
Chris@42 388 {
Chris@42 389 E Te, Tg, Td, Tf;
Chris@42 390 Te = ri[WS(rs, 7)];
Chris@42 391 Tg = ii[WS(rs, 7)];
Chris@42 392 Td = W[12];
Chris@42 393 Tf = W[13];
Chris@42 394 Th = FMA(Td, Te, Tf * Tg);
Chris@42 395 T15 = FNMS(Tf, Te, Td * Tg);
Chris@42 396 }
Chris@42 397 {
Chris@42 398 E Tk, Tm, Tj, Tl;
Chris@42 399 Tk = ri[WS(rs, 8)];
Chris@42 400 Tm = ii[WS(rs, 8)];
Chris@42 401 Tj = W[14];
Chris@42 402 Tl = W[15];
Chris@42 403 Tn = FMA(Tj, Tk, Tl * Tm);
Chris@42 404 T17 = FNMS(Tl, Tk, Tj * Tm);
Chris@42 405 }
Chris@42 406 Ti = Tc - Th;
Chris@42 407 Tt = Tn - Ts;
Chris@42 408 Tu = Ti + Tt;
Chris@42 409 T1r = T14 + T15;
Chris@42 410 T1s = T17 + T18;
Chris@42 411 T1x = T1r + T1s;
Chris@42 412 TU = Tc + Th;
Chris@42 413 TV = Tn + Ts;
Chris@42 414 TW = TU + TV;
Chris@42 415 T16 = T14 - T15;
Chris@42 416 T19 = T17 - T18;
Chris@42 417 T1L = T16 + T19;
Chris@42 418 }
/* All loads are done.  Real parts of outputs 5, 7, 3, 9, 1. */
Chris@42 419 {
Chris@42 420 E T11, TS, T12, T1i, T1k, T1a, T1h, T1j, T13;
Chris@42 421 T11 = KP559016994 * (Tu - TR);
Chris@42 422 TS = Tu + TR;
Chris@42 423 T12 = FNMS(KP250000000, TS, T7);
Chris@42 424 T1a = T16 - T19;
Chris@42 425 T1h = T1d - T1g;
Chris@42 426 T1i = FMA(KP951056516, T1a, KP587785252 * T1h);
Chris@42 427 T1k = FNMS(KP587785252, T1a, KP951056516 * T1h);
Chris@42 428 ri[WS(rs, 5)] = T7 + TS;
Chris@42 429 T1j = T12 - T11;
Chris@42 430 ri[WS(rs, 7)] = T1j - T1k;
Chris@42 431 ri[WS(rs, 3)] = T1j + T1k;
Chris@42 432 T13 = T11 + T12;
Chris@42 433 ri[WS(rs, 9)] = T13 - T1i;
Chris@42 434 ri[WS(rs, 1)] = T13 + T1i;
Chris@42 435 }
/* Imaginary parts of outputs 5, 3, 7, 1, 9. */
Chris@42 436 {
Chris@42 437 E T1N, T1P, T1Q, T1U, T1W, T1S, T1T, T1V, T1R;
Chris@42 438 T1N = KP559016994 * (T1L - T1M);
Chris@42 439 T1P = T1L + T1M;
Chris@42 440 T1Q = FNMS(KP250000000, T1P, T1O);
Chris@42 441 T1S = Ti - Tt;
Chris@42 442 T1T = TF - TQ;
Chris@42 443 T1U = FMA(KP951056516, T1S, KP587785252 * T1T);
Chris@42 444 T1W = FNMS(KP587785252, T1S, KP951056516 * T1T);
Chris@42 445 ii[WS(rs, 5)] = T1P + T1O;
Chris@42 446 T1V = T1Q - T1N;
Chris@42 447 ii[WS(rs, 3)] = T1V - T1W;
Chris@42 448 ii[WS(rs, 7)] = T1W + T1V;
Chris@42 449 T1R = T1N + T1Q;
Chris@42 450 ii[WS(rs, 1)] = T1R - T1U;
Chris@42 451 ii[WS(rs, 9)] = T1U + T1R;
Chris@42 452 }
/* Real parts of outputs 0, 4, 6, 2, 8. */
Chris@42 453 {
Chris@42 454 E T1m, T10, T1l, T1u, T1w, T1q, T1t, T1v, T1n;
Chris@42 455 T1m = KP559016994 * (TW - TZ);
Chris@42 456 T10 = TW + TZ;
Chris@42 457 T1l = FNMS(KP250000000, T10, TT);
Chris@42 458 T1q = T1o - T1p;
Chris@42 459 T1t = T1r - T1s;
Chris@42 460 T1u = FNMS(KP587785252, T1t, KP951056516 * T1q);
Chris@42 461 T1w = FMA(KP951056516, T1t, KP587785252 * T1q);
Chris@42 462 ri[0] = TT + T10;
Chris@42 463 T1v = T1m + T1l;
Chris@42 464 ri[WS(rs, 4)] = T1v - T1w;
Chris@42 465 ri[WS(rs, 6)] = T1v + T1w;
Chris@42 466 T1n = T1l - T1m;
Chris@42 467 ri[WS(rs, 2)] = T1n - T1u;
Chris@42 468 ri[WS(rs, 8)] = T1n + T1u;
Chris@42 469 }
/* Imaginary parts of outputs 0, 4, 6, 2, 8. */
Chris@42 470 {
Chris@42 471 E T1H, T1z, T1G, T1F, T1J, T1D, T1E, T1K, T1I;
Chris@42 472 T1H = KP559016994 * (T1x - T1y);
Chris@42 473 T1z = T1x + T1y;
Chris@42 474 T1G = FNMS(KP250000000, T1z, T1C);
Chris@42 475 T1D = TX - TY;
Chris@42 476 T1E = TU - TV;
Chris@42 477 T1F = FNMS(KP587785252, T1E, KP951056516 * T1D);
Chris@42 478 T1J = FMA(KP951056516, T1E, KP587785252 * T1D);
Chris@42 479 ii[0] = T1z + T1C;
Chris@42 480 T1K = T1H + T1G;
Chris@42 481 ii[WS(rs, 4)] = T1J + T1K;
Chris@42 482 ii[WS(rs, 6)] = T1K - T1J;
Chris@42 483 T1I = T1G - T1H;
Chris@42 484 ii[WS(rs, 2)] = T1F + T1I;
Chris@42 485 ii[WS(rs, 8)] = T1I - T1F;
Chris@42 486 }
Chris@42 487 }
Chris@42 488 }
Chris@42 489 }
Chris@42 490
/*
 * Two-entry tw_instr table referenced by `desc` in this branch; identical in
 * content to the table in the HAVE_FMA branch.
 * NOTE(review): tw_instr, TW_FULL and TW_NEXT are declared outside this
 * listing.  Presumably the first entry requests the full coefficient set for
 * size 10 (consistent with the 18 W values t1_10 reads per pass) and the
 * second entry terminates the table -- confirm against FFTW's twiddle code.
 */
Chris@42 491 static const tw_instr twinstr[] = {
Chris@42 492 {TW_FULL, 0, 10},
Chris@42 493 {TW_NEXT, 1, 0}
Chris@42 494 };
Chris@42 495
/*
 * Descriptor handed to the registration call for the non-FMA variant.  It
 * carries the size 10, the name "t1_10", the twinstr table and &GENUS.  The
 * tuple {72, 30, 30, 0} matches the operation counts quoted in this variant's
 * header comment (72 additions, 30 multiplications, 30 fused multiply/add).
 * NOTE(review): ct_desc and GENUS are declared elsewhere; the meaning of the
 * three trailing zeros cannot be determined from this listing.
 */
Chris@42 496 static const ct_desc desc = { 10, "t1_10", twinstr, &GENUS, {72, 30, 30, 0}, 0, 0, 0 };
Chris@42 497
/*
 * Registration entry point (build without HAVE_FMA): passes the planner `p`,
 * the t1_10 kernel and its descriptor to X(kdft_dit_register).
 * NOTE(review): X() is a macro defined outside this listing (presumably
 * FFTW's symbol-name mangling); the exported symbol name depends on it.
 */
Chris@42 498 void X(codelet_t1_10) (planner *p) {
Chris@42 499 X(kdft_dit_register) (p, t1_10, &desc);
Chris@42 500 }
Chris@42 501 #endif /* HAVE_FMA */